In [1]:
#lets try to predict dementia using a longitudinal dataset 

In [2]:

import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

In [3]:
data  = pd.read_csv('oasis_longitudinal.csv')

In [4]:
print(data)

    Subject ID        Group  Visit  MR Delay M/F Hand  Age  EDUC  SES  MMSE  \
0    OAS2_0001  Nondemented      1         0   M    R   87    14  2.0  27.0   
1    OAS2_0001  Nondemented      2       457   M    R   88    14  2.0  30.0   
2    OAS2_0002     Demented      1         0   M    R   75    12  NaN  23.0   
3    OAS2_0002     Demented      2       560   M    R   76    12  NaN  28.0   
4    OAS2_0002     Demented      3      1895   M    R   80    12  NaN  22.0   
..         ...          ...    ...       ...  ..  ...  ...   ...  ...   ...   
368  OAS2_0185     Demented      2       842   M    R   82    16  1.0  28.0   
369  OAS2_0185     Demented      3      2297   M    R   86    16  1.0  26.0   
370  OAS2_0186  Nondemented      1         0   F    R   61    13  2.0  30.0   
371  OAS2_0186  Nondemented      2       763   F    R   63    13  2.0  30.0   
372  OAS2_0186  Nondemented      3      1608   F    R   65    13  2.0  30.0   

     CDR  eTIV   nWBV    ASF  
0    0.0  1987  0.69

In [5]:
#lets do some classification 

In [6]:
#dropping some unecessary values

In [7]:
data = data.drop(['Subject ID'], axis = 1)

In [8]:
#lets drop all null values
data.dropna()


Unnamed: 0,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
5,Nondemented,1,0,F,R,88,18,3.0,28.0,0.0,1215,0.710,1.444
6,Nondemented,2,538,F,R,90,18,3.0,27.0,0.0,1200,0.718,1.462
7,Nondemented,1,0,M,R,80,12,4.0,28.0,0.0,1689,0.712,1.039
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,Demented,2,842,M,R,82,16,1.0,28.0,0.5,1693,0.694,1.037
369,Demented,3,2297,M,R,86,16,1.0,26.0,0.5,1688,0.675,1.040
370,Nondemented,1,0,F,R,61,13,2.0,30.0,0.0,1319,0.801,1.331
371,Nondemented,2,763,F,R,63,13,2.0,30.0,0.0,1327,0.796,1.323


In [9]:
#lets try do classificatin using a decision tree! 

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
#need to create target variable now 

In [12]:
y = data['Group']
x = data.drop(['Group'], axis = 1 )

In [13]:
#lets do a little data preprocessing

In [14]:
#Num and Cat pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])


cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder(sparse=False))
    ])


num_attribs = ['Visit', 'Age', 'EDUC', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
cat_attribs = ['M/F', 'Hand']

In [15]:
preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [16]:
preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])



In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 24)

In [18]:
X_train = preprocess_pipeline.fit_transform(
    X_train[num_attribs + cat_attribs])




X_test = preprocess_pipeline.fit_transform(
    X_test[num_attribs + cat_attribs])


#saving for later
X_train_copy = X_train
X_test_copy = X_test

In [19]:
# to do a little parameter tuning 
model_params = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
from sklearn.model_selection import GridSearchCV

model =GridSearchCV(DecisionTreeClassifier(), model_params, cv = 10)

In [20]:
model.fit(X_train, y_train)
model.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [21]:
y_pred = model.predict(X_test)

In [22]:
#now lets evaluate the algorithm 

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 0  3  4]
 [ 0 34  0]
 [ 1  0 33]]
              precision    recall  f1-score   support

   Converted       0.00      0.00      0.00         7
    Demented       0.92      1.00      0.96        34
 Nondemented       0.89      0.97      0.93        34

    accuracy                           0.89        75
   macro avg       0.60      0.66      0.63        75
weighted avg       0.82      0.89      0.86        75



In [23]:
# not bad! our algorithm is decent at predicting if someone is demented or nondemented - not great at converted, though. 

#a quick reminder 

#more false negatives (such as predicting an individual with dementia is non-demendtia) is associated
#with a lower RECALL SCORE 

#More False positives (saying someone wihout dementia has dementia) is associated with a 
#lower PRECISION SCORE 


