In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score, classification_report, make_scorer, average_precision_score, precision_recall_curve

In [None]:
# Load the dataset and discard a column that is not relevant for modelling
df = pd.read_csv("alzheimers_disease_data.csv")
alzheimer = df.drop("DoctorInCharge", axis=1)
alzheimer

In [None]:
# Baseline models with default hyperparameters.
# SVM, logistic regression and k-NN are wrapped in a pipeline that standardises
# the features first; the tree ensembles are fitted on the raw features.
# random_state is fixed on the stochastic estimators (the two tree ensembles)
# so that re-running the notebook reproduces the same metrics.
models = {
    "SVM": make_pipeline(StandardScaler(), SVC()),
    "Random Forest": RandomForestClassifier(random_state=1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1),
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression()),
    "K-Nearest Neighbors": make_pipeline(StandardScaler(), KNeighborsClassifier())
}

In [None]:
# Choose the explanatory variables and the target variable
target_variable = "Diagnosis"

# Every column except the target and the patient identifier is a feature
non_feature_columns = {target_variable, "PatientID"}
var = [col for col in alzheimer.columns if col not in non_feature_columns]

X = alzheimer.loc[:, var]
y = alzheimer[target_variable]

# Hold-out partition: 80% train / 20% test, with a fixed seed
holdout_split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Fit every baseline model on the training split and report its basic
# metrics on the test split, followed by its confusion matrix
results = {}

for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = dict(
        zip(["Accuracy", "Precision", "Recall", "F1-score"], [acc, prec, rec, f1])
    )

    print(f"\n{name} metrics:")
    for label, value in results[name].items():
        # "<label>:" is left-aligned in a 12-character column before the value
        print(f"{label + ':':<12}{value:.3f}")

    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(name)
    plt.show()

In [None]:
# Cross-validation of the baseline models (stratified 5-fold, shuffled, seeded)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scoring = ["accuracy", "precision", "recall", "f1"]

# Display label -> key under which cross_validate returns the per-fold scores
metric_keys = {
    "Accuracy": "test_accuracy",
    "Precision": "test_precision",
    "Recall": "test_recall",
    "F1-score": "test_f1",
}

cv_results = {}
for name, model in models.items():
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
    # Keep only the mean over the folds for each metric
    cv_results[name] = {
        label: np.mean(scores[key]) for label, key in metric_keys.items()
    }

for model_name, metrics in cv_results.items():
    print(f"\n{model_name} Cross-Validation Metrics:")
    for label, value in metrics.items():
        # "<label>:" is left-aligned in an 11-character column before the value
        print(f"{label + ':':<11}{value:.3f}")

In [None]:
# Stability of the F1-score across multiple random train/test partitions
testsize = 0.2
n_proves = 30
model_names = list(models.keys())
# f1_r[i, j] = F1 of model j on the test part of trial i
f1_r = np.zeros((n_proves, len(model_names)))

for i in range(n_proves):
    # One partition per trial, shared by every model, so that all models are
    # scored on the same data. The trial index seeds the split, which makes the
    # experiment reproducible. Trial-local names are used on purpose so the
    # global X_train / X_test / y_train / y_test hold-out split is not replaced.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=testsize, random_state=i
    )

    for j, name in enumerate(model_names):
        model = models[name]
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_te)

        f1_r[i, j] = f1_score(y_te, y_pred)


# Box plot of the per-model F1 distribution with the individual trials
# overlaid as red dots, horizontally jittered so they do not overlap
plt.figure(figsize=(15, 7.5))
plt.boxplot(f1_r)
for j in range(len(model_names)):
    xderiv = (j + 1) * np.ones(f1_r[:, j].shape) + (np.random.rand(n_proves,) - 0.5) * 0.1
    plt.plot(xderiv, f1_r[:, j], 'ro', alpha=0.3)

plt.xticks(range(1, len(model_names) + 1), model_names, fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel('F1_score', fontsize=16)
plt.title('F1_score (sense balancejar les dades)', fontsize=18)
plt.show()

# Mean and standard deviation over the trials, one value per model
mean_f1 = f1_r.mean(axis=0)
std_f1 = f1_r.std(axis=0)

for name, mean, std in zip(model_names, mean_f1, std_f1):
    print(f"{name:<20} F1: {mean:.3f} ± {std:.3f}")

In [None]:
# Pipelines used for the hyperparameter search. The estimator is always the
# step named "clf", which is why the search grids use the "clf__" prefix.

# Estimators that get a standardisation step ("scale") in front of them
scaled_estimators = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
}

# Tree ensembles: single-step pipelines, no scaling
unscaled_estimators = {
    "Random Forest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
}

models = {
    name: Pipeline([("scale", StandardScaler()), ("clf", estimator)])
    for name, estimator in scaled_estimators.items()
}
models.update(
    (name, Pipeline([("clf", estimator)]))
    for name, estimator in unscaled_estimators.items()
)

In [None]:
# Hyperparameter search spaces for GridSearch / RandomSearch
def _clf_grid(**params):
    """Return `params` with every key prefixed by "clf__" (the pipeline step name)."""
    return {f"clf__{param}": values for param, values in params.items()}


param_grids = {
    "Logistic Regression": _clf_grid(
        C=[0.001, 0.01, 0.1, 1, 10, 100],
        solver=["lbfgs", "liblinear"],
        max_iter=[300, 500, 1000],
    ),
    "KNN": _clf_grid(
        n_neighbors=[1, 3, 5, 7, 9, 15, 25],
        weights=["uniform", "distance"],
        p=[1, 2],  # Manhattan / Euclidean
    ),
    "SVM": _clf_grid(
        C=[0.01, 0.1, 1, 10, 100],
        kernel=["linear", "rbf"],
        gamma=["scale", "auto"],  # only used with the RBF kernel
    ),
    "Random Forest": _clf_grid(
        n_estimators=[100, 200, 400, 700],
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=["sqrt", "log2"],
    ),
    "GradientBoosting": _clf_grid(
        n_estimators=[100, 300, 500],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[2, 3, 5],
        subsample=[0.6, 0.8, 1.0],
    ),
}


In [None]:
# Scorer object that wraps the F1 metric
f1_scorer = make_scorer(score_func=f1_score)

**Grid Search**

Fem una primera cerca d’hiperparàmetres amb GridSearchCV. Aquesta funció prova totes les combinacions d’hiperparàmetres establertes al diccionari `param_grids` i, per tant, és un procés molt lent. Concretament, tarda aproximadament 62 minuts a executar-se. Per aquest motiu, hem deixat el codi comentat i hem inclòs en una taula els resultats d’una execució anterior.

In [None]:
# Exhaustive hyperparameter search, intentionally disabled: the whole cell is
# wrapped in a string literal, so none of the code below is executed.
# When enabled, it runs GridSearchCV (5-fold CV, F1 scoring) for every pipeline
# in `models` over `param_grids[name]`, fitting on the training part of the
# 80/20 split drawn with random_state=1, and appends to `best_results` the best
# parameters, the train/test F1 and the wall-clock fit time of each model.
"""
best_results = []

for name, model in models.items():
    grid = GridSearchCV(model, param_grids[name], cv=5, scoring='f1')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    start = time.time()
    grid.fit(X_train, y_train)
    end = time.time()
    
    best_results.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "Train F1": f1_score(y_train, grid.predict(X_train)),
        "Test F1": f1_score(y_test, grid.predict(X_test)),
        "Time (s)": round(end - start, 2)
    })
"""

In [None]:
# Disabled (string-wrapped) display cell. When enabled, it turns the
# `best_results` list into a DataFrame and prints it sorted by test F1,
# highest first, without the index column.
"""
print("Resultats del Grid Search: \n")
results_df = pd.DataFrame(best_results)
print(results_df.sort_values(by="Test F1", ascending=False).to_string(index=False))
"""

**Randomized Search**

Fem una segona cerca d’hiperparàmetres amb RandomizedSearchCV. Aquesta funció, a diferència de GridSearchCV, no prova totes les combinacions d’hiperparàmetres, sinó que selecciona combinacions aleatòries. Aquesta tècnica és més ràpida que l’anterior, però continua tenint un temps d’execució força elevat. Per aquest motiu, també hem deixat el codi comentat.

In [None]:
# Randomized hyperparameter search, intentionally disabled: the whole cell is
# wrapped in a string literal, so none of the code below is executed.
# When enabled, it runs RandomizedSearchCV (5-fold CV, F1 scoring) for every
# pipeline in `models`, sampling from `param_grids[name]`, and records the same
# fields as the Grid Search cell in `best_results` (which it re-initialises).
# NOTE(review): neither n_iter nor random_state is passed, so the number of
# sampled combinations is the library default and the sampled combinations are
# not fixed between runs — confirm before re-enabling.
"""best_results = []

for name, model in models.items():
    grid = RandomizedSearchCV(model, param_grids[name], cv=5, scoring='f1')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    start = time.time()
    grid.fit(X_train, y_train)
    end = time.time()
    
    best_results.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "Train F1": f1_score(y_train, grid.predict(X_train)),
        "Test F1": f1_score(y_test, grid.predict(X_test)),
        "Time (s)": round(end - start, 2)
    })"""

In [None]:
# Disabled (string-wrapped) display cell for the Randomized Search results.
# When enabled, it prints `best_results` as a table sorted by test F1, highest
# first. The header names the Randomized Search, since this cell follows the
# RandomizedSearchCV cell and not the Grid Search one.
"""print("Resultats del Randomized Search: \n")
results_df = pd.DataFrame(best_results)
print(results_df.sort_values(by="Test F1", ascending=False).to_string(index=False))"""

Resultat de la cerca d'hiperparàmetres amb Grid Search

| Model               | Best Params                                                                                                                                    | Train F1  | Test F1  | Time (s) |
|---------------------|------------------------------------------------------------------------------------------------------------------------------------------------|----------|----------|----------|
| GradientBoosting    | {'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__n_estimators': 500, 'clf__subsample': 1.0}                                             | 0.940883 | 0.914676 | 1223.48  |
| Random Forest       | {'clf__max_depth': 20, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 400}         | 1.000000 | 0.884211 | 2463.35  |
| SVM                 | {'clf__C': 10, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}                                                                                 | 0.788660 | 0.726027 | 72.01    |
| Logistic Regression | {'clf__C': 0.001, 'clf__max_iter': 300, 'clf__solver': 'liblinear'}                                                                            | 0.778063 | 0.710345 | 1.84     |
| KNN                 | {'clf__n_neighbors': 9, 'clf__p': 1, 'clf__weights': 'uniform'}                                                                                | 0.716763 | 0.516393 | 1.87     |

Una primera selecció d'hiperparàmetres (RandomizedSearch)

| Model               | Best Params                                                                                                                                                               | Train F1  | Test F1  | Time (s) |
|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|----------|----------|
| GradientBoosting    | {'clf__subsample': 0.8, 'clf__n_estimators': 100, 'clf__max_depth': 5, 'clf__learning_rate': 0.05}                                                                        | 0.975894 | 0.905405 | 174.62   |
| Random Forest       | {'clf__n_estimators': 400, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__max_depth': 20}                                   | 0.994200 | 0.891986 | 87.46    |
| SVM                 | {'clf__kernel': 'linear', 'clf__gamma': 'auto', 'clf__C': 100}                                                                                                            | 0.789338 | 0.726027 | 69.68    |
| Logistic Regression | {'clf__solver': 'liblinear', 'clf__max_iter': 500, 'clf__C': 0.001}                                                                                                       | 0.778063 | 0.710345 | 0.52     |
| KNN                 | {'clf__weights': 'uniform', 'clf__p': 1, 'clf__n_neighbors': 9}                                                                                                           | 0.716763 | 0.516393 | 0.79     |


In [None]:
# Models configured with the best hyperparameters found by the searches.
# random_state is fixed on every estimator that uses randomness with these
# settings (liblinear solver, SVC probability calibration, both tree ensembles)
# so that re-running the notebook reproduces the same metrics and curves.
# SVC is built with probability=True because the ROC / precision-recall cells
# call predict_proba on every model.
# NOTE(review): the Grid Search results table reports subsample=1.0 for
# GradientBoosting while 0.6 is used here — confirm which value is intended.
models = {
    "Logistic Regression": make_pipeline(
        StandardScaler(),
        LogisticRegression(solver='liblinear', max_iter=300, C=0.001, random_state=1),
    ),
    "KNN": make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(weights='uniform', p=1, n_neighbors=9),
    ),
    "SVM": make_pipeline(
        StandardScaler(),
        SVC(probability=True, kernel='linear', gamma='scale', C=10, random_state=1),
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=400,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='log2',
        max_depth=20,
        random_state=1,
    ),
    "GradientBoosting": GradientBoostingClassifier(
        subsample=0.6,
        n_estimators=500,
        max_depth=2,
        learning_rate=0.01,
        random_state=1,
    ),
}

In [None]:
# Cross-validation of the tuned models (stratified 5-fold, shuffled, seeded)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scoring = ["accuracy", "precision", "recall", "f1"]

# Display labels, in the same order as the entries of `scoring`
metric_labels = ["Accuracy", "Precision", "Recall", "F1-score"]

cv_results = {}
for name, model in models.items():
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
    # cross_validate stores each metric's per-fold scores under "test_<metric>"
    cv_results[name] = {
        label: np.mean(scores[f"test_{metric}"])
        for label, metric in zip(metric_labels, scoring)
    }

for model_name, metrics in cv_results.items():
    print(f"\n{model_name} Cross-Validation Metrics:")
    for label, value in metrics.items():
        # "<label>:" is left-aligned in an 11-character column before the value
        print(f"{label + ':':<11}{value:.3f}")

In [None]:
# Stability of the F1-score after hyperparameter optimisation
testsize = 0.2
n_proves = 30
model_names = list(models.keys())
# f1_r[i, j] = F1 of model j on the test part of trial i
f1_r = np.zeros((n_proves, len(model_names)))

for i in range(n_proves):
    # One partition per trial, shared by every model, so that all models are
    # scored on the same data. The trial index seeds the split, which makes the
    # experiment reproducible. Trial-local names are used on purpose so the
    # global X_train / X_test / y_train / y_test hold-out split is not replaced.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=testsize, random_state=i
    )

    for j, name in enumerate(model_names):
        model = models[name]
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_te)

        f1_r[i, j] = f1_score(y_te, y_pred)


# Box plot of the per-model F1 distribution with the individual trials
# overlaid as red dots, horizontally jittered so they do not overlap
plt.figure(figsize=(15, 7.5))
plt.boxplot(f1_r)
for j in range(len(model_names)):
    xderiv = (j + 1) * np.ones(f1_r[:, j].shape) + (np.random.rand(n_proves,) - 0.5) * 0.1
    plt.plot(xderiv, f1_r[:, j], 'ro', alpha=0.3)

plt.xticks(range(1, len(model_names) + 1), model_names, fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel('F1_score', fontsize=16)
plt.title('F1_score (sense balancejar les dades)', fontsize=18)
plt.show()

# Mean and standard deviation over the trials, one value per model
mean_f1 = f1_r.mean(axis=0)
std_f1 = f1_r.std(axis=0)

for name, mean, std in zip(model_names, mean_f1, std_f1):
    print(f"{name:<20} F1: {mean:.3f} ± {std:.3f}")


In [None]:
# ROC curves
# Re-create the fixed hold-out split explicitly so that the curves do not
# depend on whichever cell last assigned X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

plt.figure()
lw = 2

print("ROC Curve (AUC-ROC):")
for name, model in models.items():
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score of the second class in classes_
    # (the positive class, assuming 0/1 labels — TODO confirm)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    auc_roc = roc_auc_score(y_test, y_pred_proba)
    fpr, tpr, ths = roc_curve(y_test, y_pred_proba)

    plt.plot(fpr, tpr, label=f"{name} (AUC={auc_roc:.3f})", lw=lw)
    print(f"\t{name}: {auc_roc:.4f}")


# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

In [None]:
# Precision-Recall curves
# Re-create the fixed hold-out split explicitly so that the curves do not
# depend on whichever cell last assigned X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

plt.figure()
lw = 2

for name, model in models.items():
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score of the second class in classes_
    # (the positive class, assuming 0/1 labels — TODO confirm)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # The summary shown in the legend is the average precision (AP) of each
    # model, so it is labelled as such
    avg_precision = average_precision_score(y_test, y_pred_proba)
    precision, recall, ths = precision_recall_curve(y_test, y_pred_proba)

    plt.plot(recall, precision, label=f"{name} (AP={avg_precision:.3f})", lw=lw)


# Horizontal reference line at the mean of y_test
# (the positive-class proportion, assuming 0/1 labels — TODO confirm)
prop = np.mean(y_test)
plt.plot([0, 1], [prop, prop], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("Recall")
plt.ylabel("Precision")
# Per-model scores are in the legend; the title no longer embeds the score of
# the last model of the loop only
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()

In [None]:
# Feature importances according to GradientBoosting and RandomForest
def print_feature_importances(model, feature_names, header):
    """Print the feature importances of a fitted model, highest first.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`.
    feature_names : indexable sequence of names, aligned with the columns the
        model was fitted on.
    header : text printed before the ranking.
    """
    importances = model.feature_importances_
    # argsort is ascending; reverse it to list the most important feature first
    indices = np.argsort(importances)[::-1]

    print(header)
    for idx in indices:
        print(f"{feature_names[idx]:<25} {importances[idx]:.4f}")


feature_names = X.columns

# NOTE(review): the hyperparameters of both estimators below differ from the
# ones used in the tuned `models` dictionary — confirm this is intended.
# Both are fitted on the full dataset (X, y) and seeded so the ranking is
# reproducible.
gb_model = GradientBoostingClassifier(
    subsample=1.0,
    n_estimators=100,
    max_depth=2,
    learning_rate=0.05,
    random_state=1
)
gb_model.fit(X, y)

print_feature_importances(
    gb_model, feature_names, "Importància de les variables Gradient Boosting:"
)


rf_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=None,
    random_state=1
)
rf_model.fit(X, y)

print_feature_importances(
    rf_model, feature_names, "\nImportància de les variables Random Forest"
)