# Download from API

In [1]:
!pip install kaggle



In [2]:
from google.colab import drive

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
from google.colab import files

files.upload() #this will prompt you to upload the kaggle.json



Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"","key":""}'}

In [5]:
!ls -lha kaggle.json

-rw-r--r-- 1 root root 24 Aug 19 12:32 kaggle.json


In [6]:
!pip install -q kaggle

In [7]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/


In [8]:
!chmod 600 /root/.kaggle/kaggle.json

In [9]:
!pwd

/content


In [10]:
!kaggle datasets list

401 - Unauthorized - Unauthenticated


In [11]:
!kaggle datasets download -d rabieelkharoua/alzheimers-disease-dataset

Dataset URL: https://www.kaggle.com/datasets/rabieelkharoua/alzheimers-disease-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading alzheimers-disease-dataset.zip to /content
  0% 0.00/268k [00:00<?, ?B/s]
100% 268k/268k [00:00<00:00, 69.1MB/s]


In [12]:
!unzip /content/alzheimers-disease-dataset.zip

Archive:  /content/alzheimers-disease-dataset.zip
  inflating: alzheimers_disease_data.csv  


# Data Analysis

In [139]:
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict

In [80]:
from sklearn.metrics import classification_report

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [64]:
# Import necessary libraries from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [23]:
data = pd.read_csv('alzheimers_disease_data.csv')

In [24]:
print(data['Diagnosis'].unique())

[0 1]


In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [26]:
data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [27]:
dataclean = data.drop(data.columns[range(23, 35)], axis=1)
dataclean


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,Diabetes,Depression,HeadInjury,Hypertension,SystolicBP,DiastolicBP,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,1,1,0,0,142,72,242.366840,56.150897,33.682563,162.189143
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,0,0,115,64,231.162595,193.407996,79.028477,294.630909
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,0,0,99,116,284.181858,153.322762,69.772292,83.638324
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,0,0,0,118,115,159.582240,65.366637,68.457491,277.577358
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0,0,94,117,237.602184,92.869700,56.874305,291.198780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,0,0,122,101,280.476824,94.870490,60.943092,234.520123
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,0,0,0,152,106,186.384436,95.410700,93.649735,367.986877
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,0,0,115,118,237.024558,156.267294,99.678209,294.802338
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,0,0,103,96,242.197192,52.482961,81.281111,145.253746


In [28]:
dataclean.isna().sum()

Unnamed: 0,0
PatientID,0
Age,0
Gender,0
Ethnicity,0
EducationLevel,0
BMI,0
Smoking,0
AlcoholConsumption,0
PhysicalActivity,0
DietQuality,0


# Machine Learning Models

## Min-Max Scaler

The **Min-Max Scaler** scales features to a specified range, typically [0, 1].

1. **Standardized Formula:**

   $$
   X_{\text{std}} = \frac{X - \min(X)}{\max(X) - \min(X)}
   $$

2. **Scaled Formula:**

   $$
   X_{\text{scaled}} = X_{\text{std}} \times (\text{max} - \text{min}) + \text{min}
   $$

---

## StandardScaler

The **StandardScaler** standardizes features by removing the mean and scaling to unit variance.

1. **Standardization Formula:**

   $$
   X_{\text{scaled}} = \frac{X - \mu}{\sigma}
   $$

In [48]:
# Initialize scalers
scalers = {
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler()
}

In [153]:
#Define Models
models = {
    'LinearSVC': LinearSVC(dual=False), #(number of samples is much larger than the number of features (23 features and 2149 samples)
    'KNeighborsClassifier' : KNeighborsClassifier(),
    'SVC': SVC( random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'DecisionTree Classifier': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(random_state=42)
}

In [154]:
def evaluate_models(X, y, scaler, models, k_folds=5):
    results = {}
    reports = {}

    # Scale the data
    X_scaled = scaler.fit_transform(X)

    for name, ml_model in models.items():
        print(f'Current Machine Learning Model: {name}')

        # Calculate cross-validation scores
        scores = cross_val_score(ml_model, X_scaled, y, cv=k_folds)
        results[name] = scores

        # Compute predictions for classification report
        predictions = cross_val_predict(ml_model, X_scaled, y, cv=k_folds)
        report = classification_report(y, predictions)
        reports[name] = report

        print(f"Cross-Validation Accuracy Scores: {scores}")
        print(f"Mean Accuracy: {np.mean(scores)*100:.4f}%")
        print(f"Classification Report:\n{report}")
        print("-"*50)

    return reports

In [155]:
X = dataclean
y = data['Diagnosis']

In [156]:
X.shape, y.shape

((2149, 23), (2149,))

In [157]:
# Evaluate models with MinMaxScaler
print("Evaluating with MinMaxScaler")
minmax_report = evaluate_models(X, y, scalers['MinMaxScaler'], models)

Evaluating with MinMaxScaler
Current Machine Learning Model: LinearSVC
Cross-Validation Accuracy Scores: [0.64651163 0.64186047 0.64651163 0.65116279 0.62703963]
Mean Accuracy: 64.2617%
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.98      0.78      1389
           1       0.39      0.02      0.04       760

    accuracy                           0.64      2149
   macro avg       0.52      0.50      0.41      2149
weighted avg       0.56      0.64      0.52      2149

--------------------------------------------------
Current Machine Learning Model: KNeighborsClassifier
Cross-Validation Accuracy Scores: [0.57674419 0.61395349 0.58837209 0.59069767 0.58741259]
Mean Accuracy: 59.1436%
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.79      0.72      1389
           1       0.37      0.22      0.28       760

    accuracy                           0.59      2149
   m

In [158]:
# Evaluate models with StandardScaler
print("Evaluating with StandardScaler")
standard_report = evaluate_models(X, y, scalers['StandardScaler'], models)

Evaluating with StandardScaler
Current Machine Learning Model: LinearSVC
Cross-Validation Accuracy Scores: [0.64651163 0.64186047 0.64651163 0.65116279 0.62703963]
Mean Accuracy: 64.2617%
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.98      0.78      1389
           1       0.39      0.02      0.04       760

    accuracy                           0.64      2149
   macro avg       0.52      0.50      0.41      2149
weighted avg       0.56      0.64      0.52      2149

--------------------------------------------------
Current Machine Learning Model: KNeighborsClassifier
Cross-Validation Accuracy Scores: [0.59069767 0.61627907 0.58837209 0.59069767 0.57808858]
Mean Accuracy: 59.2827%
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.79      0.72      1389
           1       0.38      0.23      0.29       760

    accuracy                           0.59      2149
  