# FTML Project Exercice 5

For this exercise, we will perform a classification on the provided dataset.

## Imports and preprocessing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import optuna

from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    ExtraTreesClassifier,
    VotingClassifier
)
from sklearn.linear_model import (
    RidgeClassifier,
    LogisticRegression,
    ARDRegression,  # Not available for classification
      SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier
)
from sklearn.naive_bayes import (
    GaussianNB,
    MultinomialNB,
    BernoulliNB
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier


X_train = np.load("../data/classification/X_train.npy")
X_test = np.load("../data/classification/X_test.npy")
Y_train = np.load("../data/classification/y_train.npy")
Y_test = np.load("../data/classification/y_test.npy")

# Squeeze output arrays
Y_train = np.squeeze(Y_train)
Y_test = np.squeeze(Y_test)

## Start analyzing

Just like the previous analysis, classification analysis are very experimental and it is very difficult to know what classifier would be good for a dataset. This is why we chose to try many different classifiers and see what comes out of it.

In [None]:
def try_model(classifier) :
    model = make_pipeline(
            classifier,
        )
    
    model.fit(X_train, Y_train)
    
    Y_pred = model.predict(X_test)
    a_score = accuracy_score(Y_test, Y_pred)
    print(f"Accuracy Score for classifier {classifier}: {a_score}")

def try_model_scaled(classifier) :
    model = make_pipeline(
            StandardScaler(),
            classifier,
        )
    
    model.fit(X_train, Y_train)
    
    Y_pred = model.predict(X_test)
    a_score = accuracy_score(Y_test, Y_pred)
    print(f"Accuracy Score for classifier (scaled) {classifier}: {a_score}")

In [6]:
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    BaggingClassifier(),
    GradientBoostingClassifier(),
    HistGradientBoostingClassifier(),
    ExtraTreesClassifier(),
    RidgeClassifier(),
    LogisticRegression(max_iter=1000),
    SGDClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    GaussianNB(),
    BernoulliNB(),
    KNeighborsClassifier(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    MLPClassifier(max_iter=1000),
    VotingClassifier(
    estimators=[
        ('svc', SVC(probability=True)),
        ('rf', RandomForestClassifier()),
        ('hist_gb', HistGradientBoostingClassifier()),
        ('knn', KNeighborsClassifier()),
        ('gb', GradientBoostingClassifier()),
    ],
    voting='soft'
)
]

for classifier in classifiers :
    try_model(classifier)
    try_model_scaled(classifier)
    print("")

Accuracy Score for classifier RandomForestClassifier(): 0.781
Accuracy Score for classifier (scaled) RandomForestClassifier(): 0.792

Accuracy Score for classifier AdaBoostClassifier(): 0.7355
Accuracy Score for classifier (scaled) AdaBoostClassifier(): 0.7355

Accuracy Score for classifier BaggingClassifier(): 0.752
Accuracy Score for classifier (scaled) BaggingClassifier(): 0.757

Accuracy Score for classifier GradientBoostingClassifier(): 0.76
Accuracy Score for classifier (scaled) GradientBoostingClassifier(): 0.7605

Accuracy Score for classifier HistGradientBoostingClassifier(): 0.781
Accuracy Score for classifier (scaled) HistGradientBoostingClassifier(): 0.781

Accuracy Score for classifier ExtraTreesClassifier(): 0.7835
Accuracy Score for classifier (scaled) ExtraTreesClassifier(): 0.789

Accuracy Score for classifier RidgeClassifier(): 0.7445
Accuracy Score for classifier (scaled) RidgeClassifier(): 0.7445

Accuracy Score for classifier LogisticRegression(max_iter=1000): 0.74

All methods yeld very decent scores given that we were using default parameters. These scores range from 0.59 to 0.7975! It is important to mention that we are NOT using the f1 score, but the mean accuracy score instead, as requested by the exercise.

## KNeighbors

In [None]:
scores = []
ks = list(range(1, 100))

for k in ks:
    model = KNeighborsClassifier(n_neighbors = k, weights="distance", n_jobs=-1)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    a_score = accuracy_score(Y_test, Y_pred)
    
    scores.append(a_score)

plt.plot(ks, scores)
print("best K: ", ks[np.argmax(scores)])
print("f1_score: ", max(scores))

The best score is reached at level 43

# Hyperparameter tuning with optuna

We will try to optimize the 3 best models as we did for the previous exercise.

## HistGradientBoostingClassifier

In [10]:
def objective(trial):
    
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 15, 255),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-4, 10.0, log=True),
        'max_bins': trial.suggest_int('max_bins', 64, 255),
        'early_stopping': False,
        'random_state': 42
    }

    model = make_pipeline(HistGradientBoostingClassifier(**params))
    
    model.fit(X_train, Y_train.ravel())
    Y_pred = model.predict(X_test)
    return accuracy_score(Y_test, Y_pred)

HG_study = optuna.create_study(study_name="SVC hyperparameter optimisation", direction="maximize")
HG_study.optimize(objective, n_trials=300)

HG_study.best_params, HG_study.best_value

[I 2025-06-08 02:23:47,361] A new study created in memory with name: SVC hyperparameter optimisation
[I 2025-06-08 02:23:47,686] Trial 0 finished with value: 0.775 and parameters: {'learning_rate': 0.09892952017167873, 'max_iter': 109, 'max_leaf_nodes': 178, 'min_samples_leaf': 34, 'l2_regularization': 0.6336233594294929, 'max_bins': 113}. Best is trial 0 with value: 0.775.
[I 2025-06-08 02:23:47,848] Trial 1 finished with value: 0.7735 and parameters: {'learning_rate': 0.17955388842312128, 'max_iter': 112, 'max_leaf_nodes': 91, 'min_samples_leaf': 66, 'l2_regularization': 2.409523166432397, 'max_bins': 96}. Best is trial 0 with value: 0.775.
[I 2025-06-08 02:23:49,352] Trial 2 finished with value: 0.786 and parameters: {'learning_rate': 0.1332984468500747, 'max_iter': 780, 'max_leaf_nodes': 100, 'min_samples_leaf': 38, 'l2_regularization': 0.3085374734488011, 'max_bins': 128}. Best is trial 2 with value: 0.786.
[I 2025-06-08 02:23:49,567] Trial 3 finished with value: 0.7715 and parame

({'learning_rate': 0.06537071721477719,
  'max_iter': 969,
  'max_leaf_nodes': 182,
  'min_samples_leaf': 21,
  'l2_regularization': 0.02753735281785042,
  'max_bins': 64},
 0.7945)

### After optimisation, HistGradient reaches a very decent score nearing 0.8, which means we were able to gain about 1% precision after training.

## RamdomForestClassifier

In [7]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42
    }

    model = make_pipeline(
        # StandardScaler()
        RandomForestClassifier(**params)
    )
    
    model.fit(X_train, Y_train.ravel())
    Y_pred = model.predict(X_test)
    return accuracy_score(Y_test, Y_pred)

RF_study = optuna.create_study(study_name="Random Forest hyperparameter optimisation", direction="maximize")
RF_study.optimize(objective, n_trials=300)

RF_study.best_params, RF_study.best_value

[I 2025-06-08 01:51:58,807] A new study created in memory with name: SVC hyperparameter optimisation
[I 2025-06-08 01:52:18,132] Trial 0 finished with value: 0.735 and parameters: {'n_estimators': 472, 'max_depth': 39, 'min_samples_split': 15, 'min_samples_leaf': 20, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 0.735.
[I 2025-06-08 01:52:26,852] Trial 1 finished with value: 0.7725 and parameters: {'n_estimators': 418, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': True}. Best is trial 1 with value: 0.7725.
[I 2025-06-08 01:52:32,020] Trial 2 finished with value: 0.735 and parameters: {'n_estimators': 126, 'max_depth': 23, 'min_samples_split': 15, 'min_samples_leaf': 20, 'max_features': None, 'bootstrap': False}. Best is trial 1 with value: 0.7725.
[I 2025-06-08 01:52:48,301] Trial 3 finished with value: 0.782 and parameters: {'n_estimators': 440, 'max_depth': 24, 'min_samples_split': 9, 'min_samples_leaf': 4, 

({'n_estimators': 290,
  'max_depth': 40,
  'min_samples_split': 12,
  'min_samples_leaf': 3,
  'max_features': 'log2',
  'bootstrap': False},
 0.7995)

### Just like Hist Gradient, RandomForest peaks at a respectable 0.8, with about the same improvement. However, the optimisation process took significantly longer than HistGradient.

## SVC
SVC had the highest score by default so we will optimise this model.

Conclusion: SVC default parameters are the best we can expect.

In [8]:
def objective(trial):
    kernel = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])
    params = {
        'C': trial.suggest_float('C', 1e-3, 100, log=True),
        'kernel': kernel,
        'gamma': trial.suggest_float('gamma', 1e-4, 1.0, log=True) if kernel != 'linear' else 'auto',
        'degree': trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3,
        'probability': True,
        'random_state': 42
    }

    model = make_pipeline(
        # StandardScaler(),
        SVC(**params)
    )
    
    model.fit(X_train, Y_train.ravel())
    Y_pred = model.predict(X_test)
    return accuracy_score(Y_test, Y_pred)

SVC_study = optuna.create_study(study_name="SVC hyperparameter optimisation", direction="maximize")
SVC_study.optimize(objective, n_trials=300)

SVC_study.best_params, SVC_study.best_value

[I 2025-06-08 02:15:01,927] A new study created in memory with name: SVC hyperparameter optimisation
[I 2025-06-08 02:15:02,688] Trial 0 finished with value: 0.749 and parameters: {'kernel': 'sigmoid', 'C': 30.785907946440435, 'gamma': 0.0010490330003986642}. Best is trial 0 with value: 0.749.
[I 2025-06-08 02:15:03,318] Trial 1 finished with value: 0.586 and parameters: {'kernel': 'sigmoid', 'C': 0.37592226412192553, 'gamma': 0.03123355930649243}. Best is trial 0 with value: 0.749.
[I 2025-06-08 02:15:03,986] Trial 2 finished with value: 0.7555 and parameters: {'kernel': 'rbf', 'C': 9.925928752220011, 'gamma': 0.000415599845878747}. Best is trial 2 with value: 0.7555.
[I 2025-06-08 02:15:04,895] Trial 3 finished with value: 0.559 and parameters: {'kernel': 'rbf', 'C': 0.009418658788339933, 'gamma': 0.0023088701772461056}. Best is trial 2 with value: 0.7555.
[I 2025-06-08 02:15:05,571] Trial 4 finished with value: 0.7525 and parameters: {'kernel': 'rbf', 'C': 34.27063960228819, 'gamma'

({'kernel': 'poly',
  'C': 0.004692951967253321,
  'gamma': 0.1598562868324287,
  'degree': 3},
 0.907)

The SVC is our winner here, climbing up to an amazing 0.9 accuracy score, for a pretty short optimization time overall.

## The grand Finale

TO finish this, let us define a Voting Classifier with our 3 best models.

In [20]:
svc_params = SVC_study.best_params
svc_params['probability'] = True

model = VotingClassifier(
    estimators=[
        ('svc', SVC(**svc_params)),
        ('rf', RandomForestClassifier(**RF_study.best_params)),
        ('hist_gb', HistGradientBoostingClassifier(**HG_study.best_params)),
        # ('knn', KNeighborsClassifier(43))
    ],
    voting='soft'
)

model.fit(X_train, Y_train.ravel())
Y_pred = model.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.81

Heh... some things are just deceiving...