In [13]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [14]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')
# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()
dataset.head(5)

Unnamed: 0,ID,Sex,Measurement_Age,Add,Sph-Far-R,Cyl-Far-R,Axis-Far-R,Sph-Close-R,Cyl-Close-R,Axis-Close-R,Sph-Far-L,Cyl-Far-L,Axis-Far-L,Sph-Close-L,Cyl-Close-L,Axis-Close-L
0,203795.0,0.0,21118.0,0.0,-1.75,0.5,55.0,-2.25,1.0,55.0,-1.75,1.0,110.0,-1.25,0.5,110.0
1,203795.0,0.0,20245.0,0.0,-1.75,0.5,65.0,0.0,0.0,0.0,-1.25,0.5,110.0,0.0,0.0,0.0
2,203795.0,0.0,18099.0,0.0,-1.5,0.5,65.0,0.0,0.0,0.0,-1.0,0.5,110.0,0.0,0.0,0.0
3,546632.0,1.0,13825.0,0.0,-3.5,1.5,180.0,-3.5,1.5,180.0,-3.0,1.5,180.0,-3.0,1.5,180.0
4,546632.0,1.0,9653.0,0.0,-2.0,0.75,175.0,0.0,0.0,0.0,-2.0,0.75,180.0,0.0,0.0,0.0


In [15]:
#drop rijen waar < n meting van zijn en houd van de overige de top n meest recente waardes
dataset = dataset.groupby('ID').filter(lambda x: len(x) > 1)
dataset = dataset.groupby('ID').head(2)

In [16]:
dataset.head()

Unnamed: 0,ID,Sex,Measurement_Age,Add,Sph-Far-R,Cyl-Far-R,Axis-Far-R,Sph-Close-R,Cyl-Close-R,Axis-Close-R,Sph-Far-L,Cyl-Far-L,Axis-Far-L,Sph-Close-L,Cyl-Close-L,Axis-Close-L
0,203795.0,0.0,21118.0,0.0,-1.75,0.5,55.0,-2.25,1.0,55.0,-1.75,1.0,110.0,-1.25,0.5,110.0
1,203795.0,0.0,20245.0,0.0,-1.75,0.5,65.0,0.0,0.0,0.0,-1.25,0.5,110.0,0.0,0.0,0.0
3,546632.0,1.0,13825.0,0.0,-3.5,1.5,180.0,-3.5,1.5,180.0,-3.0,1.5,180.0,-3.0,1.5,180.0
4,546632.0,1.0,9653.0,0.0,-2.0,0.75,175.0,0.0,0.0,0.0,-2.0,0.75,180.0,0.0,0.0,0.0
7,474866.0,0.0,25627.0,3.0,2.25,0.75,90.0,5.25,1.0,90.0,2.75,1.0,95.0,5.75,0.75,95.0


In [17]:
#2 rijen naast elkaar zetten

dataset = dataset.merge(dataset ,on=['ID', 'Sex'], suffixes=['_x', ''])
dataset = dataset.sort_values(by=['ID', 'Measurement_Age_x'])
dataset = dataset.drop_duplicates(subset=['ID', 'Sex'], keep='first')
dataset.head(20)
# dataset.loc[dataset['ID'] == 471318.0]

Unnamed: 0,ID,Sex,Measurement_Age_x,Add_x,Sph-Far-R_x,Cyl-Far-R_x,Axis-Far-R_x,Sph-Close-R_x,Cyl-Close-R_x,Axis-Close-R_x,Sph-Far-L_x,Cyl-Far-L_x,Axis-Far-L_x,Sph-Close-L_x,Cyl-Close-L_x,Axis-Close-L_x,Measurement_Age,Add,Sph-Far-R,Cyl-Far-R,Axis-Far-R,Sph-Close-R,Cyl-Close-R,Axis-Close-R,Sph-Far-L,Cyl-Far-L,Axis-Far-L,Sph-Close-L,Cyl-Close-L,Axis-Close-L
19520,100104.0,1.0,21234.0,0.0,-5.0,2.5,80.0,0.0,0.0,0.0,-2.25,1.75,105.0,0.0,0.0,0.0,21234.0,0.0,-5.0,2.5,80.0,0.0,0.0,0.0,-2.25,1.75,105.0,0.0,0.0,0.0
35074,100142.0,0.0,28783.0,3.0,-1.0,0.0,0.0,2.0,0.5,0.0,1.5,0.5,0.0,4.5,0.0,0.0,29440.0,3.0,-1.0,0.5,0.0,2.0,0.5,0.0,1.5,0.5,180.0,4.5,0.5,180.0
774,100227.0,0.0,16206.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,16799.0,0.0,1.5,0.0,0.0,1.5,0.0,0.0,1.5,0.0,0.0,1.5,0.0,0.0
40822,100337.0,1.0,9696.0,0.0,-6.5,0.0,0.0,0.0,0.0,0.0,-4.5,0.0,0.0,0.0,0.0,0.0,12046.0,0.0,-6.5,0.0,0.0,-6.5,0.0,0.0,-4.5,0.0,0.0,-4.5,0.0,0.0
28952,100480.0,1.0,14939.0,0.0,-6.0,0.0,0.0,-6.0,0.0,0.0,-5.75,0.0,0.0,-5.75,0.0,0.0,14939.0,0.0,-6.0,0.0,0.0,-6.0,0.0,0.0,-5.75,0.0,0.0,-5.75,0.0,0.0
2478,100592.0,0.0,3935.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,180.0,0.0,0.0,0.0,13217.0,0.0,-2.5,1.75,10.0,-2.5,1.75,10.0,-2.0,1.25,180.0,-2.0,1.25,180.0
32234,100820.0,1.0,16642.0,0.0,0.0,0.0,0.0,1.0,0.25,90.0,0.0,0.0,0.0,1.0,0.25,90.0,16694.0,0.0,-0.25,0.25,90.0,-0.25,0.25,90.0,-0.25,0.25,90.0,-0.25,0.25,90.0
32136,101042.0,0.0,29548.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,29548.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0,4.5,0.0,0.0
5298,101121.0,1.0,17566.0,1.25,-2.75,0.75,10.0,-1.25,0.5,10.0,-2.25,0.5,160.0,-1.25,0.75,160.0,18537.0,2.25,-2.75,0.75,175.0,-1.0,1.25,175.0,-3.25,1.25,165.0,-0.5,0.75,165.0
20532,101146.0,1.0,20355.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,2.75,0.0,0.0,2.75,0.0,0.0,20355.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,2.75,0.0,0.0,2.75,0.0,0.0


In [18]:
# Splitsen in features en targets

y = dataset['Sph-Far-R'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

[-4.77178249e-08  5.20950853e-02 -2.15404304e-05 -1.62292938e-01
  8.46387909e-01  1.26678938e-01 -1.49802351e-03  1.05042936e-01
 -8.71517733e-02  1.19959912e-03  6.66248110e-02  5.36629405e-02
 -1.56357916e-03 -1.03324726e-02 -1.26054061e-01  1.44376610e-03
  7.08923087e-05]
r2 score =  0.7854733236320078


In [19]:
# Splitsen in features en targets

y = dataset['Cyl-Far-R'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

[ 1.62252018e-08  2.31924656e-02 -3.54046533e-05 -3.53969631e-02
 -5.08411672e-03  7.20467297e-01 -8.31839999e-04  2.65742466e-03
  1.94748807e-01  7.33404729e-04 -7.49329626e-04  9.89619864e-02
 -1.86928866e-05  7.69048744e-03 -6.42992911e-02  1.18078478e-04
  3.85810998e-05]
r2 score =  0.6372139138572105


In [20]:
# Splitsen in features en targets

y = dataset['Sph-Far-L'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

[-4.58530998e-08  6.58326362e-02 -2.44410194e-05 -1.60316610e-01
  9.11267799e-02  9.62453117e-03 -1.96917534e-03 -5.64073446e-02
 -1.74743964e-02  1.80920301e-03  8.30675019e-01  1.37140115e-01
 -1.02375245e-03  1.30840341e-01 -1.60741402e-01  7.08614707e-04
  7.44390485e-05]
r2 score =  0.7887626755735183


In [22]:
# Splitsen in features en targets

y = dataset['Cyl-Far-L'].values
X = dataset.drop(['Add', 'Sph-Far-R', 'Cyl-Far-R', 'Axis-Far-R', 'Sph-Close-R', 'Cyl-Close-R', 'Axis-Close-R', 'Sph-Far-L', 
                  'Cyl-Far-L', 'Axis-Far-L', 'Sph-Close-L', 'Cyl-Close-L', 'Axis-Close-L'],axis=1)

# Splitsen in training set en test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear regression

regmodel = linear_model.LinearRegression()
regmodel.fit(X_train,y_train)
print(regmodel.coef_)
r2 = regmodel.score(X_test,y_test)
print('r2 score = ', r2)

[ 6.40531189e-09  1.29081767e-02 -3.78284354e-05 -4.64273442e-02
 -2.00126108e-03  7.92371252e-02  2.58759087e-05  2.86298021e-03
 -5.10948895e-02  2.16272342e-06 -6.79567499e-03  7.39639607e-01
 -7.37426091e-04  1.36585746e-02  2.14928237e-01  6.32772404e-04
  4.17873828e-05]
r2 score =  0.6348584369819801
