In [4]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [5]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')

# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()
dataset.shape

(39923, 16)

In [6]:
# Splitsen in features en targets
target = 'Sph-Far-R'
prefAmountRecords = 2


### Sliding window

#### met var prefAmountRecords kan je kiezen welke minimum records je wil, naargelang deze grafiek is de keuze te maken:
![Graph](https://i.imgur.com/82t9CSH.png)



In [13]:
# Drop waardes die minder dan n* records hebben
preDrop = dataset.shape[0]
dataset = dataset.groupby('ID').filter(lambda x: len(x) >= prefAmountRecords)
dataset = dataset.groupby('ID').head(prefAmountRecords)
postDrop = dataset.shape[0]

print(f'Dropped {preDrop-postDrop} records of {preDrop}, {math.floor((postDrop/preDrop)*100)}% remaining')

Dropped 22839 records of 39923, 42% remaining


In [28]:
measurements = []
temp_df = pd.DataFrame()

for i in range(prefAmountRecords):
    for skrr in dataset['ID']:
        temp_df.append(dataset.loc[dataset['ID'] == skrr].iloc[i], ignore_index = True)
    measurements.append(temp_df)

measurements = [m.set_index('ID') for m in measurements]
measurements[0].join(measurements[1:])

test = measurements[0]
test.head(20)

ID  Sex  Measurement_Age  Add  Sph-Far-R  Cyl-Far-R  Axis-Far-R  \
0  850679.0  0.0          21118.0  0.0      -1.75        0.5        55.0   
1  850679.0  0.0          20245.0  0.0      -1.75        0.5        65.0   

   Sph-Close-R  Cyl-Close-R  Axis-Close-R  Sph-Far-L  Cyl-Far-L  Axis-Far-L  \
0        -2.25          1.0          55.0      -1.75        1.0       110.0   
1         0.00          0.0           0.0      -1.25        0.5       110.0   

   Sph-Close-L  Cyl-Close-L  Axis-Close-L  
0        -1.25          0.5         110.0  
1         0.00          0.0           0.0  
ID                 850679.00
Sex                     0.00
Measurement_Age     21118.00
Add                     0.00
Sph-Far-R              -1.75
Cyl-Far-R               0.50
Axis-Far-R             55.00
Sph-Close-R            -2.25
Cyl-Close-R             1.00
Axis-Close-R           55.00
Sph-Far-L              -1.75
Cyl-Far-L               1.00
Axis-Far-L            110.00
Sph-Close-L            -1.25
Cyl-

TypeError: 'type' object is not subscriptable

In [70]:
# Splitsen in training set en test set
y = dataset[target].values
X = dataset.drop(target, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [71]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train,y_train)
print('r2 score = ', lregModel.score(X_test,y_test))

r2 score =  0.7923816718901516


In [72]:
# hogere orde features
graad = 1
n_iter_search = 500
parameters = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'alpha': np.linspace(0.0001,n_iter_search,50), 
    'tol': np.linspace(0.0001,n_iter_search,50), 
    'fit_intercept': [True, False]
    }

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

# Cross-validation via random search
lregPolyModel = Ridge()
lregPolyRandomModel = RandomizedSearchCV(lregPolyModel, param_distributions=parameters, cv=5, n_iter=n_iter_search, n_jobs = -1, verbose=0)
lregPolyRandomModel = lregPolyRandomModel.fit(X_train_poly, y_train)

print('Best accuracy : ', lregPolyRandomModel.best_score_)
print(f'Best parameters : {lregPolyRandomModel.best_params_} with {n_iter_search} searches')

Dimensie van polynomial data op graad 1: (8758, 17)
Best accuracy :  0.7711157767153513
Best parameters : {'tol': 132.65313469387755, 'solver': 'svd', 'fit_intercept': True, 'alpha': 122.44905510204082} with 500 searches


In [73]:
# Random Forest Regression met polynominal features
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train_poly, y_train)

print(RFR_model.score(X_test_poly, y_test))

0.6608096114137644
