In [71]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [88]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')
# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()

In [89]:
# Splitsen in features en targets
target = 'Sph-Far-R'

y = dataset[target].values
X = dataset.drop(target, axis=1)

### Eerst proberen per feat een model te maken

*bij verandering van target gewoon onderstaande var veranderen dan zou de rest moeten updaten*

In [90]:
# Splitsen in train en test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42069)

In [36]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train,y_train)
print('r2 score = ', lregModel.score(X_test,y_test))

r2 score =  0.8529754825718101


In [37]:
# test hogere orde features
graad = 2

polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print('dimensie van X_train_poly: ', X_train_poly.shape)


# L2 regularisatie via Ridge regression
lregPolyModel = Ridge(alpha=0.5, tol=0.0001, fit_intercept=True)
lregPolyModel.fit(X_train_poly,y_train)

print('R2 score op training set via L2: ', lregPolyModel.score(X_train_poly,y_train))
print('R2 score op test set via L2: ', lregPolyModel.score(X_test_poly,y_test))
print('Intercept score op test set via L2: ', lregPolyModel.intercept_)

dimensie van X_train_poly:  (41911, 136)
R2 score op training set via L2:  0.9139076623215756
R2 score op test set via L2:  0.8632385569982404
Intercept score op test set via L2:  0.08676219765320664


In [38]:
# Aanmaken van de hogere orde features
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train,y_train)

print(RFR_model.score(X_test,y_test))

0.9326146370763045


### Sliding window

In [39]:
# # Sliding window
# window_size = 2
# X_window = []
# y_window = []

# for w in range(window_size, X.shape[0]):
#     X_window.append(X[w-window_size:w,:].reshape(1,-1))
#     y_window.append(y[w-1])

# X_window = np.asarray(X_window)
# X_window = np.squeeze(X_window, axis=1)
# y_window = np.asarray(y_window)

# X_train, X_test, y_train, y_test = train_test_split(X_window, y_window, test_size=0.2, random_state=0 )

# # Aanmaken van de hogere orde features
# RFR_model.fit(X_train,y_train)
# RFR_model.score(X_test,y_test)

### Rijen aan elkaar plakken

#### met var prefAmountRecords kan je kiezen welke minimum records je wil, naargelang deze grafiek is de keuze te maken:
![Graph](https://imgur.com/82t9CSH)



In [91]:
# Drop waardes die minder dan n* records hebben
prefAmountRecords = 3

preDrop = dataset.shape[0]
dataset = dataset.groupby('ID').filter(lambda x: len(x) >= prefAmountRecords)
dataset = dataset.groupby('ID').head(prefAmountRecords)
postDrop = dataset.shape[0]

print(f'Dropped {preDrop-postDrop} records of {preDrop}, {math.floor((postDrop/preDrop)*100)}% remaining')

Dropped 32226 records of 52389, 38% remaining


In [93]:
print(f"merging {prefAmountRecords} records per ID for a total of {len(dataset.groupby('ID'))} unique ID's. Current shape: {dataset.shape[0]}:{dataset.shape[1]}")
mergeSet = dataset.copy()
df = pd.DataFrame(data=dataset['ID'])
for i in range(prefAmountRecords):
    print(i)
    df = df.merge(right=mergeSet, on=['ID'], suffixes=['', f'_{i}'])
    print(df.head(5))

print(f'Shape after merge: {dataset.shape[0]}:{dataset.shape[1]}')

print(dataset.columns)
dataset = dataset.sort_values(by=['Measurement_Age'])
#dataset = dataset.loc[:,~df.columns.duplicated()] # https://stackoverflow.com/questions/14984119 

dataset.head(10)

merging 3 records per ID for a total of 6721 unique ID's. Current shape: 20163:16
0
         ID  Sex  Measurement_Age   Add  Sph-Far-R  Cyl-Far-R  Axis-Far-R  \
0  292138.0  0.0             47.0  0.00       0.00        0.5       125.0   
1  292138.0  0.0             47.0  0.00       0.00        0.5       125.0   
2  292138.0  0.0           1828.0  1.75       0.25        0.5       125.0   
3  292138.0  0.0             47.0  0.00       0.00        0.5       125.0   
4  292138.0  0.0             47.0  0.00       0.00        0.5       125.0   

   Sph-Close-R  Cyl-Close-R  Axis-Close-R  Sph-Far-L  Cyl-Far-L  Axis-Far-L  \
0         0.00         0.00           0.0      -3.75       0.75        75.0   
1         0.00         0.00           0.0      -3.75       0.75        75.0   
2         1.75         0.75         125.0      -3.75       0.75        75.0   
3         0.00         0.00           0.0      -3.75       0.75        75.0   
4         0.00         0.00           0.0      -3.75      

Unnamed: 0,ID,Sex,Measurement_Age,Add,Sph-Far-R,Cyl-Far-R,Axis-Far-R,Sph-Close-R,Cyl-Close-R,Axis-Close-R,Sph-Far-L,Cyl-Far-L,Axis-Far-L,Sph-Close-L,Cyl-Close-L,Axis-Close-L
21985,292138.0,0.0,47.0,0.0,0.0,0.5,125.0,0.0,0.0,0.0,-3.75,0.75,75.0,0.0,0.0,0.0
21986,292138.0,0.0,47.0,0.0,0.0,0.5,125.0,0.0,0.0,0.0,-3.75,0.75,75.0,0.0,0.0,0.0
28303,423449.0,0.0,187.0,0.0,0.0,0.0,0.0,2.75,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
50073,720457.0,0.0,448.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,-0.5,2.0,20.0,0.0,0.0,0.0
37780,617287.0,1.0,714.0,2.5,1.75,0.5,0.0,4.25,0.5,0.0,1.25,0.5,180.0,3.75,0.5,180.0
37782,617287.0,1.0,714.0,2.5,1.75,0.0,0.0,4.25,0.5,0.0,1.25,0.5,180.0,3.75,0.0,180.0
37781,617287.0,1.0,714.0,2.5,1.75,0.0,0.0,4.25,0.5,0.0,1.25,0.5,180.0,3.75,0.0,180.0
26557,798001.0,1.0,866.0,2.25,0.5,0.5,150.0,2.5,0.75,150.0,0.5,0.75,10.0,3.0,0.5,10.0
16352,453314.0,1.0,916.0,0.0,8.75,2.0,165.0,0.0,0.0,0.0,8.5,1.5,15.0,0.0,0.0,0.0
28302,423449.0,0.0,922.0,1.75,1.5,0.0,0.0,3.25,0.0,0.0,1.75,0.0,0.0,3.5,0.0,0.0


In [42]:
# Splitsen in features en targets
y = dataset[f'{target}_{prefAmountRecords}'].values
x = dataset[col for col in dataset.columns if not f'_{prefAmountRecords}' in col]

SyntaxError: invalid syntax (<ipython-input-42-3125946fdd1d>, line 3)

In [43]:
# Splitsen in training set en test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [44]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train,y_train)
print('r2 score = ', lregModel.score(X_test,y_test))

r2 score =  0.9091202666473615


In [45]:
# hogere orde features
graad = 2

polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print('dimensie van X_train_poly: ', X_train_poly.shape)


# L2 regularisatie via Ridge regression
lregPolyModel = Ridge(alpha=0.5, tol=0.0001, fit_intercept=True)
lregPolyModel.fit(X_train_poly,y_train)

print('R2 score op training set via L2: ', lregPolyModel.score(X_train_poly,y_train))
print('R2 score op test set via L2: ', lregPolyModel.score(X_test_poly,y_test))
print('Intercept score op test set via L2: ', lregPolyModel.intercept_)

dimensie van X_train_poly:  (41911, 136)
R2 score op training set via L2:  0.8983340985500985
R2 score op test set via L2:  0.9251155567012139
Intercept score op test set via L2:  0.05465077170281313
