Gabriel Ferreira Lima

## Imports

In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_curve, auc, recall_score, precision_score, roc_auc_score, accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Get the data

In [8]:
# Read the raw CSV and immediately discard the two columns that are not used
# as model inputs:
#   - DoctorInCharge: confidential, per the original author's note
#   - PatientID: judged not useful as a feature by the original author
df_alzheimers = pd.read_csv('alzheimers_disease_data.csv').drop(
    columns=['DoctorInCharge', 'PatientID']
)

In [9]:
# Show the frame after the two columns were dropped; the rendered output below
# is truncated by pandas (the "..." row and column).
df_alzheimers

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,6.045039,0,0,0.014691,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,7.535540,...,0.238667,0,0,4.492838,1,0,0,0,0,1
2145,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,8.555256,...,8.687480,0,1,9.204952,0,0,0,0,0,1
2146,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,5.769464,...,1.972137,0,0,5.036334,0,0,0,0,0,1
2147,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,8.322874,...,5.173891,0,0,3.785399,0,0,0,0,1,1


## Get the model and do GridSearch

In [10]:
# Seed for the SVC. With probability=True the classifier calibrates its
# probability estimates through an internal randomized cross-validation, so
# predict_proba (and the ROC AUC derived from it) changes between runs unless
# random_state is fixed.
SVM_RANDOM_STATE = 42

# Base estimator: class weights are balanced against class frequencies and
# probability estimates are enabled because predict_proba is used later.
svc = SVC(class_weight='balanced', probability=True, random_state=SVM_RANDOM_STATE)

# Hyper-parameter candidates: 4 values of C x 3 kernels = 12 combinations.
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid'],
}

# `model` is the grid-search wrapper (5-fold CV per candidate). No `scoring` is
# passed, so candidates are ranked with the estimator's default score.
# Calling model.fit() runs the whole search and refits the best candidate.
model = GridSearchCV(estimator=svc, param_grid=param_grid_svm, cv=5)

## Train the model and predict

In [11]:
# Shuffle all rows once and renumber the index. The fixed random_state makes
# the resulting order identical on every Restart & Run All.
df_alzheimers = df_alzheimers.sample(frac=1, random_state=42).reset_index(drop=True)

In [12]:
# Repeated hold-out evaluation: N_RUNS independent stratified 80/20 splits,
# each with its own grid search, summarized as mean and std per metric.
N_RUNS = 10
METRIC_COLUMNS = ['f1_score', 'recall', 'precision', 'accuracy', 'roc_auc']

# Features are every column except the target; the target is a NumPy array.
X = df_alzheimers.drop(columns='Diagnosis')
y = df_alzheimers['Diagnosis'].to_numpy()

# Collect one record per run and build the DataFrame once at the end instead of
# growing it row by row.
run_records = []
for run_seed in range(N_RUNS):
    # A distinct but fixed seed per run keeps the splits different from each
    # other while making the whole experiment reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, train_size=0.8, random_state=run_seed
    )

    # Fit the scaler on the training part only, then apply it to the test part,
    # so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Runs the full grid search on this split and refits the best candidate.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    run_records.append({
        'f1_score': f1_score(y_test, y_pred, average='weighted'),
        # NOTE(review): support-weighted recall is mathematically equal to
        # accuracy, so these two columns always match; consider reporting the
        # positive-class recall if that is what is of interest.
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'accuracy': accuracy_score(y_test, y_pred),
        # Column 1 holds the probability of the second class in model.classes_
        # (presumably Diagnosis == 1 -- confirm the label encoding).
        'roc_auc': roc_auc_score(y_test, y_pred_proba[:, 1]),
    })

# Per-run scores stay available for inspection; `results` is the summary table.
results_per_run = pd.DataFrame(run_records, columns=METRIC_COLUMNS)
results = results_per_run.agg(['mean', 'std']).round(2)

In [13]:
results

Unnamed: 0,f1_score,recall,precision,accuracy,roc_auc
mean,0.83,0.83,0.83,0.83,0.9
std,0.02,0.02,0.02,0.02,0.02
