In [74]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [75]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')
# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()
dataset.head(5)

Unnamed: 0,ID,Sex,Measurement_Age,Add,Sph-Far-R,Cyl-Far-R,Axis-Far-R,Sph-Close-R,Cyl-Close-R,Axis-Close-R,Sph-Far-L,Cyl-Far-L,Axis-Far-L,Sph-Close-L,Cyl-Close-L,Axis-Close-L
0,203795.0,0.0,21118.0,0.0,-1.75,0.5,55.0,-2.25,1.0,55.0,-1.75,1.0,110.0,-1.25,0.5,110.0
1,203795.0,0.0,20245.0,0.0,-1.75,0.5,65.0,0.0,0.0,0.0,-1.25,0.5,110.0,0.0,0.0,0.0
2,203795.0,0.0,18099.0,0.0,-1.5,0.5,65.0,0.0,0.0,0.0,-1.0,0.5,110.0,0.0,0.0,0.0
3,546632.0,1.0,13825.0,0.0,-3.5,1.5,180.0,-3.5,1.5,180.0,-3.0,1.5,180.0,-3.0,1.5,180.0
4,546632.0,1.0,9653.0,0.0,-2.0,0.75,175.0,0.0,0.0,0.0,-2.0,0.75,180.0,0.0,0.0,0.0


In [76]:
#drop rijen waar < n meting van zijn en houd van de overige de top n meest recente waardes
dataset = dataset.groupby('ID').filter(lambda x: len(x) > 1)
dataset = dataset.groupby('ID').head(2)

In [77]:
dataset.head()

Unnamed: 0,ID,Sex,Measurement_Age,Add,Sph-Far-R,Cyl-Far-R,Axis-Far-R,Sph-Close-R,Cyl-Close-R,Axis-Close-R,Sph-Far-L,Cyl-Far-L,Axis-Far-L,Sph-Close-L,Cyl-Close-L,Axis-Close-L
0,203795.0,0.0,21118.0,0.0,-1.75,0.5,55.0,-2.25,1.0,55.0,-1.75,1.0,110.0,-1.25,0.5,110.0
1,203795.0,0.0,20245.0,0.0,-1.75,0.5,65.0,0.0,0.0,0.0,-1.25,0.5,110.0,0.0,0.0,0.0
3,546632.0,1.0,13825.0,0.0,-3.5,1.5,180.0,-3.5,1.5,180.0,-3.0,1.5,180.0,-3.0,1.5,180.0
4,546632.0,1.0,9653.0,0.0,-2.0,0.75,175.0,0.0,0.0,0.0,-2.0,0.75,180.0,0.0,0.0,0.0
7,474866.0,0.0,25627.0,3.0,2.25,0.75,90.0,5.25,1.0,90.0,2.75,1.0,95.0,5.75,0.75,95.0


In [78]:
#2 rijen naast elkaar zetten

dataset = dataset.merge(dataset ,on=['ID', 'Sex'], suffixes=['_x', ''])
dataset = dataset.sort_values(by=['ID', 'Measurement_Age_x'])
dataset = dataset.drop_duplicates(subset=['ID', 'Sex'], keep='first')
dataset.head(20)
# dataset.loc[dataset['ID'] == 471318.0]

Unnamed: 0,ID,Sex,Measurement_Age_x,Add_x,Sph-Far-R_x,Cyl-Far-R_x,Axis-Far-R_x,Sph-Close-R_x,Cyl-Close-R_x,Axis-Close-R_x,Sph-Far-L_x,Cyl-Far-L_x,Axis-Far-L_x,Sph-Close-L_x,Cyl-Close-L_x,Axis-Close-L_x,Measurement_Age,Add,Sph-Far-R,Cyl-Far-R,Axis-Far-R,Sph-Close-R,Cyl-Close-R,Axis-Close-R,Sph-Far-L,Cyl-Far-L,Axis-Far-L,Sph-Close-L,Cyl-Close-L,Axis-Close-L
19520,100104.0,1.0,21234.0,0.0,-5.0,2.5,80.0,0.0,0.0,0.0,-2.25,1.75,105.0,0.0,0.0,0.0,21234.0,0.0,-5.0,2.5,80.0,0.0,0.0,0.0,-2.25,1.75,105.0,0.0,0.0,0.0
35074,100142.0,0.0,28783.0,3.0,-1.0,0.0,0.0,2.0,0.5,0.0,1.5,0.5,0.0,4.5,0.0,0.0,29440.0,3.0,-1.0,0.5,0.0,2.0,0.5,0.0,1.5,0.5,180.0,4.5,0.5,180.0
774,100227.0,0.0,16206.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,16799.0,0.0,1.5,0.0,0.0,1.5,0.0,0.0,1.5,0.0,0.0,1.5,0.0,0.0
40822,100337.0,1.0,9696.0,0.0,-6.5,0.0,0.0,0.0,0.0,0.0,-4.5,0.0,0.0,0.0,0.0,0.0,12046.0,0.0,-6.5,0.0,0.0,-6.5,0.0,0.0,-4.5,0.0,0.0,-4.5,0.0,0.0
28952,100480.0,1.0,14939.0,0.0,-6.0,0.0,0.0,-6.0,0.0,0.0,-5.75,0.0,0.0,-5.75,0.0,0.0,14939.0,0.0,-6.0,0.0,0.0,-6.0,0.0,0.0,-5.75,0.0,0.0,-5.75,0.0,0.0
2478,100592.0,0.0,3935.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,180.0,0.0,0.0,0.0,13217.0,0.0,-2.5,1.75,10.0,-2.5,1.75,10.0,-2.0,1.25,180.0,-2.0,1.25,180.0
32234,100820.0,1.0,16642.0,0.0,0.0,0.0,0.0,1.0,0.25,90.0,0.0,0.0,0.0,1.0,0.25,90.0,16694.0,0.0,-0.25,0.25,90.0,-0.25,0.25,90.0,-0.25,0.25,90.0,-0.25,0.25,90.0
32136,101042.0,0.0,29548.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,29548.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0
5298,101121.0,1.0,17566.0,1.25,-2.75,0.75,10.0,-1.25,0.5,10.0,-2.25,0.5,160.0,-1.25,0.75,160.0,18537.0,2.25,-2.75,0.75,175.0,-1.0,1.25,175.0,-3.25,1.25,165.0,-0.5,0.75,165.0
20532,101146.0,1.0,20355.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,2.75,0.0,0.0,2.75,0.0,0.0,20355.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,2.75,0.0,0.0,2.75,0.0,0.0


In [79]:
#Drop kolom ID
dataset.drop(['ID'],axis=1, inplace=True)


### Linear regression

In [81]:
# Splitsen in features en targets

y = dataset['Sph-Far-R'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# MinMax scaling

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

# Modeloptimalisatie en Hyperparameter tuning

# Aanmaken van de hogere orde features
graad = 2

poly = PolynomialFeatures(graad)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
print('dimensie van X_train_poly: ',X_train_poly.shape)
print('dimensie van X_test_poly: ',X_test_poly.shape)


# met L2 regularisatie via Ridge regression
lregmodel_poly = Ridge(alpha=0.5,tol=0.0001,fit_intercept=True)
lregmodel_poly.fit(X_train_poly,y_train)

print('R2 score op test set via L2: ',lregmodel_poly.score(X_test_poly,y_test))
# R2 -score via L2 op de trainingset
print('R2 score op training set via L2: ',lregmodel_poly.score(X_train_poly,y_train))

[ 5.19811337e-02 -7.40749157e-01 -7.29571082e-01  7.85265660e+01
  3.15669665e-01 -2.69983026e-01  9.02725261e+00 -1.75343162e-01
  2.16679900e-01  2.22213944e+00  1.35531518e-01 -2.81565300e-01
 -2.58148224e-01 -2.83519817e-01  2.59511881e-01  2.42169553e+00]
r2 score =  0.7854444801839378
dimensie van X_train_poly:  (8758, 153)
dimensie van X_test_poly:  (2190, 153)
R2 score op test set via L2:  0.797137724391862
R2 score op training set via L2:  0.7930137987283479


In [82]:
# Splitsen in features en targets

y = dataset['Cyl-Far-R'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# MinMax scaling

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

# Modeloptimalisatie en Hyperparameter tuning

# Aanmaken van de hogere orde features
graad = 2

poly = PolynomialFeatures(graad)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
print('dimensie van X_train_poly: ',X_train_poly.shape)
print('dimensie van X_test_poly: ',X_test_poly.shape)


# met L2 regularisatie via Ridge regression
lregmodel_poly = Ridge(alpha=0.5,tol=0.0001,fit_intercept=True)
lregmodel_poly.fit(X_train_poly,y_train)

print('R2 score op test set via L2: ',lregmodel_poly.score(X_test_poly,y_test))
# R2 -score via L2 op de trainingset
print('R2 score op training set via L2: ',lregmodel_poly.score(X_train_poly,y_train))

[ 0.02323121 -1.209371   -0.15954038 -0.47974211  1.80151768 -0.149616
  0.23072825  0.38985111  0.13175713 -0.02177312  0.24693772 -0.00332356
  0.19220656 -0.14470803  0.02137858  1.31410167]
r2 score =  0.6373060284101044
dimensie van X_train_poly:  (8758, 153)
dimensie van X_test_poly:  (2190, 153)
R2 score op test set via L2:  0.6860210069029044
R2 score op training set via L2:  0.633489173172789


In [83]:
# Splitsen in features en targets

y = dataset['Sph-Far-L'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# MinMax scaling

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

# Modeloptimalisatie en Hyperparameter tuning

# Aanmaken van de hogere orde features
graad = 2

poly = PolynomialFeatures(graad)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
print('dimensie van X_train_poly: ',X_train_poly.shape)
print('dimensie van X_test_poly: ',X_test_poly.shape)


# met L2 regularisatie via Ridge regression
lregmodel_poly = Ridge(alpha=0.5,tol=0.0001,fit_intercept=True)
lregmodel_poly.fit(X_train_poly,y_train)

print('R2 score op test set via L2: ',lregmodel_poly.score(X_test_poly,y_test))
# R2 -score via L2 op de trainingset
print('R2 score op training set via L2: ',lregmodel_poly.score(X_train_poly,y_train))

[ 6.57231376e-02 -8.39782909e-01 -7.20706807e-01  8.47515493e+00
  2.30738071e-02 -3.54777117e-01 -4.85721989e+00 -3.59477816e-02
  3.26379211e-01  2.78182040e+01  3.44170756e-01 -1.84391762e-01
  3.27116572e+00 -3.61570314e-01  1.27198934e-01  2.54246626e+00]
r2 score =  0.7886849665877386
dimensie van X_train_poly:  (8758, 153)
dimensie van X_test_poly:  (2190, 153)
R2 score op test set via L2:  0.8000005427449886
R2 score op training set via L2:  0.7866983230516125


In [84]:
# Splitsen in features en targets

y = dataset['Cyl-Far-L'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# MinMax scaling

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

# Modeloptimalisatie en Hyperparameter tuning

# Aanmaken van de hogere orde features
graad = 2

poly = PolynomialFeatures(graad)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
print('dimensie van X_train_poly: ',X_train_poly.shape)
print('dimensie van X_test_poly: ',X_test_poly.shape)


# met L2 regularisatie via Ridge regression
lregmodel_poly = Ridge(alpha=0.5,tol=0.0001,fit_intercept=True)
lregmodel_poly.fit(X_train_poly,y_train)

print('R2 score op test set via L2: ',lregmodel_poly.score(X_test_poly,y_test))
# R2 -score via L2 op de trainingset
print('R2 score op training set via L2: ',lregmodel_poly.score(X_train_poly,y_train))

[ 1.29234728e-02 -1.29310520e+00 -2.09023339e-01 -1.88850290e-01
  1.98230762e-01  4.70314107e-03  2.47080749e-01 -1.02050228e-01
  2.88339184e-04 -2.26340738e-01  1.84891456e+00 -1.32720447e-01
  3.41442406e-01  4.83574866e-01  1.13948164e-01  1.42432557e+00]
r2 score =  0.6348829475403024
dimensie van X_train_poly:  (8758, 153)
dimensie van X_test_poly:  (2190, 153)
R2 score op test set via L2:  0.6771382303233542
R2 score op training set via L2:  0.6604556826510977


### Random forrest regressor

In [85]:
# Splitsen in features en targets

y = dataset['Sph-Far-R'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#random forest regressor
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train,y_train)

RFR_model.score(X_test,y_test)

0.8053537451207142

In [86]:
# Splitsen in features en targets

y = dataset['Cyl-Far-R'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


#random forest regressor
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train,y_train)

RFR_model.score(X_test,y_test)

0.6519286130828525

In [87]:
# Splitsen in features en targets

y = dataset['Sph-Far-L'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


#random forest regressor
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train,y_train)

RFR_model.score(X_test,y_test)

0.8056391482523008

In [89]:
# Splitsen in features en targets

y = dataset['Cyl-Far-L'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


#random forest regressor
RFR_model = RandomForestRegressor(n_estimators=1000)
RFR_model.fit(X_train,y_train)

RFR_model.score(X_test,y_test)

0.6511872609810122