In [59]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import math
import time
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [60]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')

# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()
dataset = dataset.sort_values(by=['ID', 'Measurement_Age'], ascending = False)
dataset.shape

(39923, 16)

In [61]:
# Splitsen in features en targets
target = 'Sph-Far-R'
prefAmountRecords = 3


### Sliding window

#### met var prefAmountRecords kan je kiezen welke minimum records je wil, naargelang deze grafiek is de keuze te maken:
![Graph](https://i.imgur.com/82t9CSH.png)



In [62]:
# Drop waardes die minder dan n* records hebben
preDrop = dataset.shape[0]
dataset = dataset.groupby('ID').filter(lambda x: len(x) >= prefAmountRecords)
dataset = dataset.groupby('ID').head(prefAmountRecords)
postDrop = dataset.shape[0]

print(f'Dropped {preDrop-postDrop} records of {preDrop}, {math.floor((postDrop/preDrop)*100)}% remaining')

Dropped 24929 records of 39923, 37% remaining


In [63]:
# Sliding window
start = time.time()
# dataset = dataset[:500]
df_f = pd.DataFrame()
df = pd.DataFrame()

for uniqueId in dataset['ID'].unique():
    for i in range(prefAmountRecords):
        if len(dataset.loc[dataset['ID'] == uniqueId]) >= prefAmountRecords:
            if i == 0:
                df = pd.DataFrame(dataset.loc[dataset['ID'] == uniqueId].iloc[i]).T
                df.columns = ['ID', 'Sex'] + list((n + f'_{i}') for n in dataset.columns[2:])
            else:
                df_t = pd.DataFrame(dataset.loc[dataset['ID'] == uniqueId].iloc[i][2:]).T
                df_t.columns = list((n + f'_{i}') for n in dataset.columns[2:])
                df = pd.concat([df.reset_index(drop=True), df_t.reset_index(drop=True)], axis = 1)
    df_f = df_f.append(df)

dataset = df_f.copy()
end = time.time()
print(end - start)
dataset.head(20)

32.2130560874939


Unnamed: 0,ID,Sex,Measurement_Age_0,Add_0,Sph-Far-R_0,Cyl-Far-R_0,Axis-Far-R_0,Sph-Close-R_0,Cyl-Close-R_0,Axis-Close-R_0,Sph-Far-L_0,Cyl-Far-L_0,Axis-Far-L_0,Sph-Close-L_0,Cyl-Close-L_0,Axis-Close-L_0,Measurement_Age_1,Add_1,Sph-Far-R_1,Cyl-Far-R_1,Axis-Far-R_1,Sph-Close-R_1,Cyl-Close-R_1,Axis-Close-R_1,Sph-Far-L_1,Cyl-Far-L_1,Axis-Far-L_1,Sph-Close-L_1,Cyl-Close-L_1,Axis-Close-L_1,Measurement_Age_2,Add_2,Sph-Far-R_2,Cyl-Far-R_2,Axis-Far-R_2,Sph-Close-R_2,Cyl-Close-R_2,Axis-Close-R_2,Sph-Far-L_2,Cyl-Far-L_2,Axis-Far-L_2,Sph-Close-L_2,Cyl-Close-L_2,Axis-Close-L_2
0,999442.0,1.0,22635.0,2.0,4.25,0.75,10.0,6.25,0.75,10.0,4.0,0.75,5.0,6.0,0.75,5.0,22040.0,2.0,3.75,0.75,20.0,5.75,0.75,20.0,3.25,0.75,5.0,5.25,0.75,5.0,20199.0,2.0,3.75,0.75,20.0,5.75,0.75,20.0,3.25,0.75,5.0,5.25,0.75,5.0
0,999316.0,0.0,22719.0,0.0,1.25,0.0,0.0,1.25,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,22719.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.75,0.0,0.0,20595.0,2.25,0.75,0.0,0.0,3.0,0.0,0.0,0.75,0.0,0.0,3.0,0.0,0.0
0,998973.0,0.0,20920.0,0.0,1.5,1.0,80.0,1.5,1.0,80.0,2.0,0.75,80.0,2.0,0.75,80.0,18615.0,0.0,0.0,0.0,0.0,1.25,0.75,85.0,0.0,0.0,0.0,1.5,0.75,80.0,17407.0,0.0,0.0,0.0,0.0,0.75,0.75,90.0,0.0,0.0,0.0,1.5,0.25,90.0
0,998945.0,0.0,27111.0,2.5,0.25,0.5,15.0,2.75,0.5,15.0,1.75,0.5,10.0,4.25,0.5,10.0,21654.0,2.5,1.75,0.0,0.0,4.25,0.0,0.0,1.75,0.0,0.0,4.25,0.0,0.0,21204.0,0.0,1.75,0.0,0.0,4.0,0.0,0.0,1.75,0.0,0.0,4.0,0.0,0.0
0,998767.0,1.0,23160.0,0.0,3.0,0.5,90.0,3.0,0.5,90.0,3.0,0.5,90.0,3.0,0.5,90.0,23160.0,0.0,0.0,0.0,0.0,2.5,0.5,90.0,0.0,0.0,0.0,2.0,0.5,90.0,21749.0,0.0,0.0,0.0,0.0,1.75,0.5,90.0,0.0,0.0,0.0,1.75,0.5,90.0
0,998724.0,0.0,24712.0,0.0,2.75,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,24596.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24596.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,5.25,0.0,0.0
0,998167.0,1.0,17777.0,0.0,-3.25,0.5,120.0,-3.25,0.5,120.0,-3.5,0.5,60.0,-3.5,0.5,60.0,17350.0,0.0,-2.75,0.5,120.0,0.0,0.0,0.0,-3.0,0.5,60.0,0.0,0.0,0.0,17350.0,0.0,-1.75,0.5,120.0,0.0,0.0,0.0,-2.0,0.5,60.0,0.0,0.0,0.0
0,998031.0,0.0,8158.0,0.0,-4.75,0.0,0.0,-4.75,0.0,0.0,-3.75,0.0,0.0,-3.75,0.0,0.0,7324.0,0.0,-4.5,0.0,0.0,0.0,0.0,0.0,-3.25,0.0,0.0,0.0,0.0,0.0,6987.0,0.0,-3.25,0.0,0.0,0.0,0.0,0.0,-2.75,0.0,0.0,0.0,0.0,0.0
0,997916.0,0.0,24943.0,0.0,-3.75,0.0,0.0,-3.75,0.0,0.0,-3.0,0.0,0.0,-3.0,0.0,0.0,23486.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,20574.0,0.0,-4.0,0.0,0.0,0.0,0.0,0.0,-4.0,0.0,0.0,0.0,0.0,0.0
0,997693.0,1.0,7441.0,0.0,-2.75,0.0,0.0,-2.75,0.0,0.0,-2.75,0.0,0.0,-2.75,0.0,0.0,5998.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,5998.0,0.0,-1.75,0.0,0.0,0.0,0.0,0.0,-1.75,0.0,0.0,0.0,0.0,0.0


In [79]:
# # check for NaN
# a = 0
# for c in dataset.columns:
#     a += len(df) - df[c].count()
#     print(f'{c}: {len(df) - df[c].count()}')
# print(f'total: {a}')

ID: 0
Sex: 0
Measurement_Age_0: 0
Add_0: 0
Sph-Far-R_0: 0
Cyl-Far-R_0: 0
Axis-Far-R_0: 0
Sph-Close-R_0: 0
Cyl-Close-R_0: 0
Axis-Close-R_0: 0
Sph-Far-L_0: 0
Cyl-Far-L_0: 0
Axis-Far-L_0: 0
Sph-Close-L_0: 0
Cyl-Close-L_0: 0
Axis-Close-L_0: 0
Measurement_Age_1: 0
Add_1: 0
Sph-Far-R_1: 0
Cyl-Far-R_1: 0
Axis-Far-R_1: 0
Sph-Close-R_1: 0
Cyl-Close-R_1: 0
Axis-Close-R_1: 0
Sph-Far-L_1: 0
Cyl-Far-L_1: 0
Axis-Far-L_1: 0
Sph-Close-L_1: 0
Cyl-Close-L_1: 0
Axis-Close-L_1: 0
Measurement_Age_2: 0
Add_2: 0
Sph-Far-R_2: 0
Cyl-Far-R_2: 0
Axis-Far-R_2: 0
Sph-Close-R_2: 0
Cyl-Close-R_2: 0
Axis-Close-R_2: 0
Sph-Far-L_2: 0
Cyl-Far-L_2: 0
Axis-Far-L_2: 0
Sph-Close-L_2: 0
Cyl-Close-L_2: 0
Axis-Close-L_2: 0
total: 0


In [71]:
# Splitsen in training set en test set
y = dataset[f'{target}_{prefAmountRecords-1}'].values
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [76]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train, y_train)
print('r2 score = ', lregModel.score(X_test,y_test))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [73]:
# hogere orde features
graad = 1
n_iter_search = 500
parameters = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'alpha': np.linspace(0.0001,n_iter_search,50), 
    'tol': np.linspace(0.0001,n_iter_search,50), 
    'fit_intercept': [True, False]
    }

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

# Cross-validation via random search
lregPolyModel = Ridge()
lregPolyRandomModel = RandomizedSearchCV(lregPolyModel, param_distributions=parameters, cv=5, n_iter=n_iter_search, n_jobs = -1, verbose=0)
lregPolyRandomModel = lregPolyRandomModel.fit(X_train_poly, y_train)

print('Best accuracy : ', lregPolyRandomModel.best_score_)
print(f'Best parameters : {lregPolyRandomModel.best_params_} with {n_iter_search} searches')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [74]:
# Random Forest Regression met polynominal features
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train_poly, y_train)

print(RFR_model.score(X_test_poly, y_test))

NameError: name 'X_train_poly' is not defined