# FTML Project

In [21]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
import optuna

from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    ExtraTreesClassifier
)
from sklearn.linear_model import (
    RidgeClassifier,
    LogisticRegression,
    ARDRegression,  # Not available for classification
      SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier,
    VotingClassifier
)
from sklearn.naive_bayes import (
    GaussianNB,
    MultinomialNB,
    BernoulliNB
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier


X_train = np.load("../data/classification/X_train.npy")
X_test = np.load("../data/classification/X_test.npy")
Y_train = np.load("../data/classification/y_train.npy")
Y_test = np.load("../data/classification/y_test.npy")

# Squeeze output arrays
Y_train = np.squeeze(Y_train)
Y_test = np.squeeze(Y_test)

ImportError: cannot import name 'VotingClassifier' from 'sklearn.linear_model' (/home/alex/afs/Master/FTML_project/venv/lib/python3.13/site-packages/sklearn/linear_model/__init__.py)

In [18]:
def try_model(classifier) :
    model = make_pipeline(
            #PCA(n_components=0.1),
            classifier,
        )
    
    model.fit(X_train, Y_train)
    
    Y_pred = model.predict(X_test)
    f1 = f1_score(Y_test, Y_pred)
    print(f"Score for regressor {classifier}: {f1}")

In [20]:
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    BaggingClassifier(),
    GradientBoostingClassifier(),
    HistGradientBoostingClassifier(),
    ExtraTreesClassifier(),
    RidgeClassifier(),
    LogisticRegression(max_iter=1000),
    SGDClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    GaussianNB(),
    BernoulliNB(),
    KNeighborsClassifier(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    MLPClassifier(max_iter=1000),
    VotingClassifier(
    estimators=[
        ('svc', SVC(probability=True)),
        ('rf', RandomForestClassifier()),
        ('hist_gb', HistGradientBoostingClassifier()),
        ('knn', KNeighborsClassifier()),
        ('gb', GradientBoostingClassifier()),
    ],
    voting='soft'  # soft voting uses predicted probabilities
)
]

for classifier in classifiers :
    try_model(classifier)

NameError: name 'VotingClassifier' is not defined

## KNeighbors

In [None]:
scores = []
ks = list(range(1, 100))

for k in ks:
    model = KNeighborsClassifier(n_neighbors = k, weights="distance", n_jobs=-1)
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)
    
    scores.append(score)

plt.plot(ks, scores)
print("best K: ", ks[np.argmax(scores)])
print("score: ", max(scores))

## LogisticRegression

In [None]:
model = LogisticRegression()
model.fit(X_train, Y_train)
score = model.score(X_test, Y_test)
print("score: ", score)

## SVC

In [None]:
model = make_pipeline(StandardScaler(), SVC())
model.fit(X_train_scaled, Y_train)
score = model.score(X_test_scaled, Y_test)

print("score: ", score)

## MLPClassifier

In [None]:
model = MLPClassifier(max_iter=500, early_stopping=True)
model.fit(X_train, Y_train)
score = model.score(X_test, Y_test)
print("score: ", score)

## AdaBoostClassifier

## GradientBoostingClassifier

In [None]:
model = GradientBoostingClassifier()
model.fit(X_train, Y_train)
score = model.score(X_test, Y_test)
print("score: ", score)

In [None]:
model = AdaBoostClassifier()
model.fit(X_train, Y_train)
score = model.score(X_test, Y_test)
print("score: ", score)

# Hyperparameter tuning with optuna


## SVC
SVC had the highest score by default so we will optimise this model.

Conclusion: SVC default parameters are the best we can expect.

In [None]:
def objective(trial):
    C = trial.suggest_float('C', 0, 1)
    degree = trial.suggest_int('degree', 1, 10)
    kernel = trial.suggest_categorical('kernel',['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_categorical('gamma',['scale', 'auto'])
    
    
    model = make_pipeline(StandardScaler(), SVC(C=C, degree=degree, kernel=kernel, gamma=gamma))
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)
    return score

study = optuna.create_study(study_name="SVC hyperparameter optimisation v2", direction="maximize")
study.optimize(objective, n_trials=300)

study.best_params

## GradientBoostingClassifier

In [None]:
def objective(trial):
    subsample = trial.suggest_float('subsample', 0, 1)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 1)
    n_estimators = trial.suggest_int('n_estimators', 1, 300)
    loss = trial.suggest_categorical('loss',['log_loss', 'exponential'])
    
    
    model = GradientBoostingClassifier(subsample=subsample, learning_rate=learning_rate, n_estimators=n_estimators, loss=loss)
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)
    return score

study = optuna.create_study(study_name="GradientBoostingClassifier hyperparameter optimisation ", direction="maximize")
study.optimize(objective, n_trials=300)

study.best_params