In [50]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from scipy import stats
from scipy.stats import randint 
from scipy.stats import uniform
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Let pandas display up to 1000 rows and 1000 columns before it truncates a frame.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [51]:
# Read the raw eye-measurement export from the Excel workbook.
dataset_og = pd.read_excel('Eye_measurements_may_2020.xls')
# Work on a deep copy so the frame as loaded stays available
# in case something from the original data is needed later.
dataset = dataset_og.copy(deep=True)
# Preview the first five rows.
dataset.head()

Unnamed: 0,Externe ID,Geslacht,Geboortedatum,Oogmetingen/Datum,Oogmetingen/Sfr ver,Oogmetingen/Cyl ver,Oogmetingen/As ver,Oogmetingen/Add,Oogmetingen/Sfr dicht,Oogmetingen/Cyl dicht,Oogmetingen/As dicht,Oogmetingen/Sfr ver.1,Oogmetingen/Cyl ver.1,Oogmetingen/As ver.1,Oogmetingen/Add.1,Oogmetingen/Sfr dicht.1,Oogmetingen/Cyl dicht.1,Oogmetingen/As dicht.1
0,__export__.res_partner_10972_f0a5feb4,Vrouw,1962-07-24 00:00:00,2020-05-18 00:00:00,-1.25,-0.5,145.0,,,-1.0,145.0,-0.75,-1.0,20.0,,,-0.5,20.0
1,,,,2017-12-27 00:00:00,-1.25,-0.5,155.0,0.0,0.0,0.0,0.0,-0.75,-0.5,20.0,0.0,0.0,0.0,0.0
2,,,,2012-02-11 00:00:00,-1.0,-0.5,155.0,0.0,0.0,0.0,0.0,-0.5,-0.5,20.0,0.0,0.0,0.0,0.0
3,__export__.res_partner_7722_2342b746,Man,1982-07-10 00:00:00,2020-05-16 00:00:00,-2.0,-1.5,90.0,,,,,-1.5,-1.5,90.0,,,,
4,,,,2008-12-13 00:00:00,-1.25,-0.75,85.0,0.0,0.0,0.0,0.0,-1.25,-0.75,90.0,0.0,0.0,0.0,0.0


In [52]:
# Forward-fill missing values for all records, because that is how the export is
# laid out: the patient-level fields (external ID, gender, birth date) are only
# filled in on a patient's first row, and the following rows must inherit them.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas releases.
# NOTE(review): this also forward-fills the measurement columns, so a missing
# measurement takes the value of the previous row, which may belong to a
# different patient. Confirm that this is intended.
dataset = dataset.ffill()

In [53]:
# Replace every external ID with a random 6-digit number so that records can no
# longer be linked back to the patients in the source system.
external_id_categories = dataset['Externe ID'].astype('category')
# One distinct random number per distinct external ID (sampling without replacement).
random_ids = random.sample(range(100000, 1000000), dataset['Externe ID'].nunique())
dataset['ID'] = external_id_categories.cat.rename_categories(random_ids)
# The original identifier is not kept.
dataset = dataset.drop('Externe ID', axis=1)
dataset.head()

Unnamed: 0,Geslacht,Geboortedatum,Oogmetingen/Datum,Oogmetingen/Sfr ver,Oogmetingen/Cyl ver,Oogmetingen/As ver,Oogmetingen/Add,Oogmetingen/Sfr dicht,Oogmetingen/Cyl dicht,Oogmetingen/As dicht,Oogmetingen/Sfr ver.1,Oogmetingen/Cyl ver.1,Oogmetingen/As ver.1,Oogmetingen/Add.1,Oogmetingen/Sfr dicht.1,Oogmetingen/Cyl dicht.1,Oogmetingen/As dicht.1,ID
0,Vrouw,1962-07-24 00:00:00,2020-05-18 00:00:00,-1.25,-0.5,145.0,,,-1.0,145.0,-0.75,-1.0,20.0,,,-0.5,20.0,656160
1,Vrouw,1962-07-24 00:00:00,2017-12-27 00:00:00,-1.25,-0.5,155.0,0.0,0.0,0.0,0.0,-0.75,-0.5,20.0,0.0,0.0,0.0,0.0,656160
2,Vrouw,1962-07-24 00:00:00,2012-02-11 00:00:00,-1.0,-0.5,155.0,0.0,0.0,0.0,0.0,-0.5,-0.5,20.0,0.0,0.0,0.0,0.0,656160
3,Man,1982-07-10 00:00:00,2020-05-16 00:00:00,-2.0,-1.5,90.0,0.0,0.0,0.0,0.0,-1.5,-1.5,90.0,0.0,0.0,0.0,0.0,237465
4,Man,1982-07-10 00:00:00,2008-12-13 00:00:00,-1.25,-0.75,85.0,0.0,0.0,0.0,0.0,-1.25,-0.75,90.0,0.0,0.0,0.0,0.0,237465


In [54]:
# Encode gender as an integer: Vrouw -> 0, Man -> 1, Overige -> 2.
# A literal 0 in the column is mapped to 2 as well.
gender_codes = {'Vrouw':0, 'Man':1, 'Overige':2, 0:2}
dataset['Geslacht'] = dataset['Geslacht'].replace(gender_codes)

In [56]:
# Parse both date columns as datetimes. errors='coerce' turns values that cannot
# be parsed, or that are out of bounds, into NaT instead of raising.
# Each column is assigned through a single label: assigning a Series to a list
# key such as dataset[['Geboortedatum']] raises "Columns must be same length as
# key" in recent pandas versions.
# infer_datetime_format is not passed because it is deprecated since pandas 2.0.
# This cell needs the raw date columns: re-running it after they have been
# dropped further down raises KeyError: 'Geboortedatum'.
for date_column in ('Geboortedatum', 'Oogmetingen/Datum'):
    dataset[date_column] = pd.to_datetime(dataset[date_column], errors='coerce')
# Drop the rows whose dates were coerced to NaT in the previous step.
dataset = dataset.dropna(subset=['Geboortedatum', 'Oogmetingen/Datum'])

KeyError: 'Geboortedatum'

In [None]:
# Age in whole years at the moment of the measurement, for easier correlation.
# The elapsed time is divided by an average Gregorian year (365.2425 days =
# 31556952 seconds) and floored, giving whole years stored as floats.
# Casting with .astype('<m8[Y]') is avoided: pandas 2.x only supports the
# s/ms/us/ns timedelta resolutions and raises on a year unit.
elapsed = dataset['Oogmetingen/Datum'] - dataset['Geboortedatum']
dataset['Leeftijd'] = np.floor(elapsed / pd.Timedelta(seconds=31556952))
# Drop records with an age above 100 so they do not confuse the model.
# NOTE(review): negative ages (measurement dated before the birth date) still
# pass this filter. Confirm whether those rows should be dropped too.
dataset = dataset[dataset['Leeftijd'] <= 100]

In [None]:
# Remove the two raw date columns; they are not needed any more.
dataset = dataset.drop(['Geboortedatum', 'Oogmetingen/Datum'], axis=1)
# Show the 20 rows with the lowest age.
youngest_first = dataset.sort_values(by='Leeftijd')
youngest_first.head(20)