In [48]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import math
import time
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [49]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')

# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()
dataset = dataset.sort_values(by=['ID', 'Measurement_Age'], ascending = False)
dataset.shape

(39923, 16)

In [50]:
# Splitsen in features en targets
target = 'Sph-Far-R'
prefAmountRecords = 3


### Sliding window

#### met var prefAmountRecords kan je kiezen welke minimum records je wil, naargelang deze grafiek is de keuze te maken:
![Graph](https://i.imgur.com/82t9CSH.png)



In [51]:
# Drop waardes die minder dan n* records hebben
preDrop = dataset.shape[0]
dataset = dataset.groupby('ID').filter(lambda x: len(x) >= prefAmountRecords)
dataset = dataset.groupby('ID').head(prefAmountRecords)
postDrop = dataset.shape[0]

print(f'Dropped {preDrop-postDrop} records of {preDrop}, {math.floor((postDrop/preDrop)*100)}% remaining')

Dropped 24929 records of 39923, 37% remaining


In [52]:
# Sliding window
start = time.time()
# dataset = dataset[:500]
df_f = pd.DataFrame()
df = pd.DataFrame()

for uniqueId in dataset['ID'].unique():
    for i in range(prefAmountRecords):
        if len(dataset.loc[dataset['ID'] == uniqueId]) >= prefAmountRecords:
            if i == 0:
                df = pd.DataFrame(dataset.loc[dataset['ID'] == uniqueId].iloc[i]).T
                df.columns = ['ID', 'Sex'] + list((n + f'_{i}') for n in dataset.columns[2:])
            else:
                df_t = pd.DataFrame(dataset.loc[dataset['ID'] == uniqueId].iloc[i][2:]).T
                df_t.columns = list((n + f'_{i}') for n in dataset.columns[2:])
                df = pd.concat([df.reset_index(drop=True), df_t.reset_index(drop=True)], axis = 1)
    df_f = df_f.append(df)

dataset = df_f.copy()
end = time.time()
print(end - start)
dataset.head(5)
dataset.to_csv(f'Data\\Measurements-Sliding-Window-{prefAmountRecords}', index=False, index_label=False)

30.53615641593933


In [None]:
for coll in dataset.columns:
    coll = np.nan_to_num(coll)
pd.DataFrame(dataset).fillna(0, inplace=True)
np.where(np.isnan(dataset))

### Gewone Lreg + Poly met sliding window

In [60]:
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [61]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train, y_train)
print('r2 score = ', lregModel.score(X_test, y_test))

r2 score =  0.7612018613679994


In [62]:
# hogere orde features
graad = 1
n_iter_search = 500
parameters = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'alpha': np.linspace(0.0001,n_iter_search,50), 
    'tol': np.linspace(0.0001,n_iter_search,50), 
    'fit_intercept': [True, False]
    }

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train, X_test)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

# Cross-validation via random search
lregPolyModel = Ridge()
lregPolyRandomModel = RandomizedSearchCV(lregPolyModel, param_distributions=parameters, cv=5, n_iter=n_iter_search, n_jobs = -1, verbose=0)
lregPolyRandomModel = lregPolyRandomModel.fit(X_train_poly, y_train)

print('Best r2 : ', lregPolyRandomModel.best_score_)
print(f'Best parameters : {lregPolyRandomModel.best_params_} after {n_iter_search} searches')

Dimensie van polynomial data op graad 1: (3998, 31)
Best accuracy :  0.820093996697312
Best parameters : {'tol': 377.5510448979592, 'solver': 'auto', 'fit_intercept': False, 'alpha': 183.67353265306122} with 500 searches


### Genormaliseerde Lreg + Poly met sliding window

In [None]:
# Splitsen in training set en test set
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop([c for c in dataset.columns if f'_{prefAmountRecords-1}' in c], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train, y_train)
print('r2 score = ', lregModel.score(X_test, y_test))

In [None]:
# hogere orde features
graad = 1
n_iter_search = 500
parameters = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'alpha': np.linspace(0.0001,n_iter_search,50), 
    'tol': np.linspace(0.0001,n_iter_search,50), 
    'fit_intercept': [True, False]
    }

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train, X_test)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

# Cross-validation via random search
lregPolyModel = Ridge()
lregPolyRandomModel = RandomizedSearchCV(lregPolyModel, param_distributions=parameters, cv=5, n_iter=n_iter_search, n_jobs = -1, verbose=0)
lregPolyRandomModel = lregPolyRandomModel.fit(X_train_poly, y_train)

print('Best r2 : ', lregPolyRandomModel.best_score_)
print(f'Best parameters : {lregPolyRandomModel.best_params_} after {n_iter_search} searches')

### Random Forest Regression met tuning

In [47]:
# Random Forest Regression met polynominal features
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train_poly, y_train)

print(RFR_model.score(X_test_poly, y_test))

NameError: name 'X_train_poly' is not defined