In [92]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import math
import time
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [93]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')

# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()
dataset = dataset.sort_values(by=['ID', 'Measurement_Age'], ascending = False)
dataset.shape

(39923, 16)

In [94]:
# Keuze target en aantal sliding window records
target = 'Sph-Far-R'
prefAmountRecords = 3

### Sliding window

#### met var prefAmountRecords kan je kiezen welke minimum records je wil, naargelang deze grafiek is de keuze te maken:
![Graph](https://i.imgur.com/82t9CSH.png)



In [95]:
# Drop waardes die minder dan n* records hebben
preDrop = dataset.shape[0]
dataset = dataset.groupby('ID').filter(lambda x: len(x) >= prefAmountRecords)
dataset = dataset.groupby('ID').head(prefAmountRecords)
postDrop = dataset.shape[0]

print(f'Dropped {preDrop-postDrop} records of {preDrop}, {math.floor((postDrop/preDrop)*100)}% remaining')

Dropped 24929 records of 39923, 37% remaining


In [96]:
# Sliding window
start = time.time()
# dataset = dataset[:500]
df_f = pd.DataFrame()
df = pd.DataFrame()

for uniqueId in dataset['ID'].unique():
    for i in range(prefAmountRecords):
        if len(dataset.loc[dataset['ID'] == uniqueId]) >= prefAmountRecords:
            if i == 0:
                df = pd.DataFrame(dataset.loc[dataset['ID'] == uniqueId].iloc[i]).T
                df.columns = ['ID', 'Sex'] + list((n + f'_{i}') for n in dataset.columns[2:])
            else:
                df_t = pd.DataFrame(dataset.loc[dataset['ID'] == uniqueId].iloc[i][2:]).T
                df_t.columns = list((n + f'_{i}') for n in dataset.columns[2:])
                df = pd.concat([df.reset_index(drop=True), df_t.reset_index(drop=True)], axis = 1)
    df_f = df_f.append(df)

dataset = df_f.copy()
end = time.time()
print(end - start)
dataset.head(5)
dataset.to_csv(f'Data\\Measurements-Sliding-Window-{prefAmountRecords}', index=False, index_label=False)

30.168344020843506


In [97]:
for coll in dataset.columns:
    coll = np.nan_to_num(coll)
pd.DataFrame(dataset).fillna(0, inplace=True)
np.where(np.isnan(dataset))

(array([], dtype=int64), array([], dtype=int64))

### Lreg

In [98]:
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [99]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train, y_train)
print('r2 score = ', lregModel.score(X_test, y_test))

r2 score =  0.7612018613679994


### Lreg + Poly = beter

In [100]:
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [101]:
# hogere orde features
graad = 1
n_iter_search = 500
parameters = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'alpha': np.linspace(0.0001,n_iter_search,50), 
    'tol': np.linspace(0.0001,n_iter_search,50), 
    'fit_intercept': [True, False]
    }

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train, X_test)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

# Cross-validation via random search
lregPolyModel = Ridge()
lregPolyRandomModel = RandomizedSearchCV(lregPolyModel, param_distributions=parameters, cv=5, n_iter=n_iter_search, n_jobs = -1, verbose=0)
lregPolyRandomModel = lregPolyRandomModel.fit(X_train_poly, y_train)

print('Best r2 : ', lregPolyRandomModel.best_score_)
print(f'Best parameters : {lregPolyRandomModel.best_params_} after {n_iter_search} searches')

Dimensie van polynomial data op graad 1: (3998, 31)
Best r2 :  0.820093996697312
Best parameters : {'tol': 214.28577142857142, 'solver': 'auto', 'fit_intercept': False, 'alpha': 183.67353265306122} after 500 searches


### Gescalede Lreg = slechter

In [102]:
# Splitsen in training set en test set
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [103]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train, y_train)
print('r2 score = ', lregModel.score(X_test, y_test))

r2 score =  0.7612018613679452


### Gescalede Lreg + Poly = slechter

In [104]:
# Splitsen in training set en test set
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [105]:
# hogere orde features
graad = 1
n_iter_search = 500
parameters = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'alpha': np.linspace(0.0001,n_iter_search,50), 
    'tol': np.linspace(0.0001,n_iter_search,50), 
    'fit_intercept': [True, False]
    }

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train, X_test)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

# Cross-validation via random search
lregPolyModel = Ridge()
lregPolyRandomModel = RandomizedSearchCV(lregPolyModel, param_distributions=parameters, cv=5, n_iter=n_iter_search, n_jobs = -1, verbose=0)
lregPolyRandomModel = lregPolyRandomModel.fit(X_train_poly, y_train)

print('Best r2 : ', lregPolyRandomModel.best_score_)
print(f'Best parameters : {lregPolyRandomModel.best_params_} after {n_iter_search} searches')

Dimensie van polynomial data op graad 1: (3998, 31)
Best r2 :  0.8197057396121817
Best parameters : {'tol': 214.28577142857142, 'solver': 'auto', 'fit_intercept': True, 'alpha': 40.81641836734694} after 500 searches


### Random Forest Regression

In [106]:
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [107]:
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train, y_train)

print(RFR_model.score(X_test, y_test))

0.7795784517360235


### Random Forest Regression + poly

In [108]:
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [109]:
graad = 1

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train, X_test)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

Dimensie van polynomial data op graad 1: (3998, 31)


In [110]:
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train, y_train)

print(RFR_model.score(X_test, y_test))

0.7821074426241585


### Random Forest Regression + tuning

In [111]:
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [112]:
n_iter_search = 50

# Cross-validation via grid search - hoogste balanced accuracy
model = RandomForestRegressor(n_estimators=n_estimators)
paramaters = {'n_estimators':[100, 500, 1000], 'criterion':['mse', 'mae'], 'max_depth': np.linspace(1, 100), 'min_weight_fraction_leaf':np.linspace(0.0, 1.0), 'max_features': ['auto', 'sqrt', 'log2', None]}
random_search = RandomizedSearchCV(estimator = model, 
                           param_distributions = paramaters,
                           cv = 3,
                           n_jobs = -1,
                           verbose = 5,
                           n_iter = n_iter_search)
random_search = random_search.fit(X_train, y_train)

print(random_search.best_params_)
print(random_search.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.1min finished
{'n_estimators': 100, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt', 'max_depth': 100.0, 'criterion': 'mse'}
0.8264957481372229


### Random Forest Regression + tuning + poly

In [113]:
# Splitsen in training set en test set
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [114]:
graad = 1

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train, X_test)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

Dimensie van polynomial data op graad 1: (3998, 31)


In [115]:
n_iter_search = 50

# Cross-validation via grid search - hoogste balanced accuracy
model = RandomForestRegressor(n_estimators=n_estimators)
paramaters = {'n_estimators':[100, 500, 1000], 'criterion':['mse', 'mae'], 'max_depth': np.linspace(1, 100), 'min_weight_fraction_leaf':np.linspace(0.0, 1.0), 'max_features': ['auto', 'sqrt', 'log2', None]}
random_search = RandomizedSearchCV(estimator = model, 
                           param_distributions = paramaters,
                           cv = 3,
                           n_jobs = -1,
                           verbose = 5,
                           n_iter = n_iter_search)
random_search = random_search.fit(X_train, y_train)

print(random_search.best_params_)
print(random_search.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.4min finished
{'n_estimators': 1000, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt', 'max_depth': 21.204081632653057, 'criterion': 'mse'}
0.8287650670522225
