In [198]:
%matplotlib inline

import random

import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import randint, uniform
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, mean_absolute_error, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [199]:
# import data
dataset_og = pd.read_csv('Data\Measurements-Transformed')
# kopie maken indien we iets van de originele data nodig hebben
dataset = dataset_og.copy()

In [200]:
# Splitsen in features en targets
target = 'Sph-Far-R'

y = dataset[target].values
X = dataset.drop(target, axis=1)

### Eerst proberen per feat een model te maken

*bij verandering van target gewoon onderstaande var veranderen dan zou de rest moeten updaten*

In [201]:
# Splitsen in train en test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42069)

In [202]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train,y_train)
print('r2 score = ', lregModel.score(X_test,y_test))

r2 score =  0.8529754825718101


In [203]:
# test hogere orde features
graad = 2

polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print('dimensie van X_train_poly: ', X_train_poly.shape)


# L2 regularisatie via Ridge regression
lregPolyModel = Ridge(alpha=0.5, tol=0.0001, fit_intercept=True)
lregPolyModel.fit(X_train_poly,y_train)

print('R2 score op training set via L2: ', lregPolyModel.score(X_train_poly,y_train))
print('R2 score op test set via L2: ', lregPolyModel.score(X_test_poly,y_test))
print('Intercept score op test set via L2: ', lregPolyModel.intercept_)

dimensie van X_train_poly:  (41911, 136)
R2 score op training set via L2:  0.9139076623215756
R2 score op test set via L2:  0.8632385569982404
Intercept score op test set via L2:  0.08676219765320664


In [204]:
# Aanmaken van de hogere orde features
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train,y_train)

print(RFR_model.score(X_test,y_test))

0.9307249240474735


### Sliding window

In [205]:
# # Sliding window
# window_size = 2
# X_window = []
# y_window = []

# for w in range(window_size, X.shape[0]):
#     X_window.append(X[w-window_size:w,:].reshape(1,-1))
#     y_window.append(y[w-1])

# X_window = np.asarray(X_window)
# X_window = np.squeeze(X_window, axis=1)
# y_window = np.asarray(y_window)

# X_train, X_test, y_train, y_test = train_test_split(X_window, y_window, test_size=0.2, random_state=0 )

# # Aanmaken van de hogere orde features
# RFR_model.fit(X_train,y_train)
# RFR_model.score(X_test,y_test)

### Rijen aan elkaar plakken

#### met var prefAmountRecords kan je kiezen welke minimum records je wil, naargelang deze grafiek is de keuze te maken:
![Graph](https://i.imgur.com/82t9CSH.png)



In [206]:
# Drop waardes die minder dan n* records hebben
prefAmountRecords = 2

preDrop = dataset.shape[0]
dataset = dataset.groupby('ID').filter(lambda x: len(x) >= prefAmountRecords)
dataset = dataset.groupby('ID').head(prefAmountRecords)
postDrop = dataset.shape[0]

print(f'Dropped {preDrop-postDrop} records of {preDrop}, {math.floor((postDrop/preDrop)*100)}% remaining')

Dropped 30493 records of 52389, 41% remaining


In [212]:
# print(f"merging {prefAmountRecords} records per ID for a total of {len(dataset.groupby('ID'))} unique ID's. Current shape: {dataset.shape[0]}:{dataset.shape[1]}")
# mergeSet = dataset.copy()
# df = pd.DataFrame(data=dataset['ID'])
# for i in range(prefAmountRecords):
#     print(i)
#     df = df.merge(right=mergeSet, on=['ID'], suffixes=['', f'_{i}'])
#     print(df.head(5))

# print(f'Shape after merge: {dataset.shape[0]}:{dataset.shape[1]}')

# print(dataset.columns)
# dataset = dataset.sort_values(by=['Measurement_Age'])
# #dataset = dataset.loc[:,~df.columns.duplicated()] # https://stackoverflow.com/questions/14984119 
# dataset.head(10)

# 3 rijen --------------------------------------------------------------------------------------------------------------------

# from functools import reduce
# dfs = [dataset, dataset, dataset]
# dataset = reduce(lambda left, right: pd.merge(left, right, on=['ID', 'Sex'], suffixes=['_1', '_2']), dfs)

# dataset = dataset.sort_values(by=['ID', 'Measurement_Age_1'])
# dataset = dataset.groupby('ID').head(4)
# dataset = dataset.drop_duplicates(subset=['ID', 'Sex'], keep='last')

# y = dataset[f'{target}']
# X = dataset.drop(columns=[col for col in dataset.columns[30:] ])
# X.head(20)


# 2 rijen naast elkaar zetten -------------------------------------------------------------------------------------------------

dataset = dataset.merge(dataset ,on=['ID', 'Sex'], suffixes=['_0', '_1'])
dataset = dataset.sort_values(by=['ID', 'Measurement_Age_0'])
dataset = dataset.drop_duplicates(subset=['ID', 'Sex'], keep='first')

# Splitsen in features en targets
y = dataset[f'{target}_{prefAmountRecords-1}']
X = dataset.drop(columns=[col for col in dataset.columns if f'_{prefAmountRecords-1}' in col])



### Model trainen op gekozen feature en aantal records to merge

In [214]:
# Splitsen in training set en test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42069)

In [215]:
# Linear regression
lregModel = linear_model.LinearRegression()
lregModel.fit(X_train,y_train)
print('r2 score = ', lregModel.score(X_test,y_test))

r2 score =  0.7591257461930679


In [235]:
# hogere orde features
graad = 1
n_iter_search = 500
parameters = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'alpha': np.linspace(0.0001,n_iter_search,50), 
    'tol': np.linspace(0.0001,n_iter_search,50), 
    'fit_intercept': [True, False]
    }

# Polynominal features aanmaken
polyModel = PolynomialFeatures(graad)
polyModel.fit(X_train)
X_train_poly = polyModel.transform(X_train)
X_test_poly = polyModel.transform(X_test)
print(f'Dimensie van polynomial data op graad {graad}: {X_train_poly.shape}')

# Cross-validation via random search
lregPolyModel = Ridge()
lregPolyRandomModel = RandomizedSearchCV(lregPolyModel, param_distributions=parameters, cv=5, n_iter=n_iter_search, n_jobs = -1, verbose=0)
lregPolyRandomModel = lregPolyRandomModel.fit(X_train_poly, y_train)

print('Best accuracy : ', lregPolyRandomModel.best_score_)
print(f'Best parameters : {lregPolyRandomModel.best_params_} with {n_iter_search} searches')

Dimensie van polynomial data op graad 1: (8758, 17)
Best accuracy :  0.7786080758977277
Best parameters : {'tol': 234.69393061224488, 'solver': 'auto', 'fit_intercept': False, 'alpha': 0.0001} with 500 searches
[ 1.96150261  2.20225527 -0.1704296  ... -4.22469579  0.82246156
  0.65683431]


In [236]:
# Random Forest Regression met polynominal features
RFR_model = RandomForestRegressor(n_estimators=100)
RFR_model.fit(X_train_poly, y_train)

print(RFR_model.score(X_test_poly, y_test))

0.7772694530432923
