In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [7]:
import kagglehub
import os

path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")

In [8]:
files = os.listdir(path)
print("Content of", files)

csv_file = files[0]
csv_path = os.path.join(path, csv_file)

# Load DataFrame
df = pd.read_csv(csv_path)
df = df.drop(columns=["DoctorInCharge"])  # Drop useless column


# Display the content of DataFrame
df.head().T

Content of ['alzheimers_disease_data.csv']


Unnamed: 0,0,1,2,3,4
PatientID,4751.0,4752.0,4753.0,4754.0,4755.0
Age,73.0,89.0,73.0,74.0,89.0
Gender,0.0,0.0,0.0,1.0,0.0
Ethnicity,0.0,0.0,3.0,0.0,0.0
EducationLevel,2.0,0.0,1.0,1.0,0.0
BMI,22.927749,26.827681,17.795882,33.800817,20.716974
Smoking,0.0,0.0,0.0,1.0,0.0
AlcoholConsumption,13.297218,4.542524,19.555085,12.209266,18.454356
PhysicalActivity,6.327112,7.619885,7.844988,8.428001,6.310461
DietQuality,1.347214,0.518767,1.826335,7.435604,0.795498


<!-- @format -->

# First evaluation


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = np.array(df.drop(columns=["Diagnosis"]))
y = np.array(df["Diagnosis"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train_scld = scaler.fit_transform(X_train)
X_test_scld = scaler.transform(X_test)

<!-- @format -->

## Logisitc Regression


In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


clf = LogisticRegression(max_iter=10000, solver="lbfgs")
clf.fit(X_train_scld, y_train)

y_predict = clf.predict(X_test_scld)

acc = accuracy_score(y_test, y_predict)

print(f"Accuracy: {acc:.4f}")
print(clf.__class__.__name__)

Accuracy: 0.8256
LogisticRegression


<!-- @format -->

Do zrobienia

- Dodanie opcji sprawdzenia wszystkich wyników


In [78]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)


C_values = [0.01, 0.1, 1, 10, 100]

param_grid = [
    {
        "C": C_values,
        "penalty": ["l1"],
        "solver": ["liblinear", "saga"],
    },
    {
        "C": C_values,
        "penalty": ["l2"],
        "solver": ["lbfgs", "liblinear", "saga", "newton-cg", "sag"],
    },
]

# Definicja metryk
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "f1": make_scorer(
        f1_score, average="macro"
    ),  # Macro, bo to klasyfikacja wieloklasowa
    "precision": make_scorer(precision_score, average="macro"),
    "recall": make_scorer(recall_score, average="macro"),
    "roc_auc": make_scorer(
        roc_auc_score, multi_class="ovr"
    ),  # OVR dla wieloklasowej klasyfikacji
}

grid_search = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid,
    cv=5,
    scoring=scoring,
    refit="accuracy",
)

grid_search.fit(X_train_scld, y_train)

print(f"✅ Najlepsze parametry: {grid_search.best_params_}")
print("📊 Wyniki dla najlepszej konfiguracji:")
for metric in scoring.keys():
    print(
        f"{metric.capitalize()}: {grid_search.cv_results_['mean_test_' + metric][grid_search.best_index_]:.4f}"
    )

✅ Najlepsze parametry: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
📊 Wyniki dla najlepszej konfiguracji:
Accuracy: 0.8423
F1: 0.8237
Precision: 0.8325
Recall: 0.8175
Roc_auc: 0.8175


<!-- @format -->


<!-- @format -->

## Suppor Vector Machines


In [None]:
from sklearn import svm

svm_model = svm.SVC(kernel="linear", random_state=42)
svm_model.fit(X_train_scld, y_train)

In [60]:
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

y_pred = svm_model.predict(X_test_scld)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Accuracy: 0.8163
