In [1]:
#Imports
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time

In [2]:
#Load the dataset
df = pd.read_csv('complete_decimal_dataset.csv')

In [3]:
#Data Preprocessing
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

X = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y = df['specific_class_encoded']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
#Optimization function
def optimize_adaboost(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)

    clf = AdaBoostClassifier(
        n_estimators=n_estimators, 
        learning_rate=learning_rate,
        algorithm="SAMME",
        random_state=42
    )
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(clf, X_scaled, y, cv=kfold, scoring=scorer, n_jobs=-1).mean()
    return scores

In [5]:
#Optuna study with stratified cross-validation
study = optuna.create_study(direction='maximize')
study.optimize(optimize_adaboost, n_trials=10, n_jobs=-1)

[I 2025-01-27 20:28:58,749] A new study created in memory with name: no-name-91cb90dc-d0da-43dd-aaf2-a53cc39930b1
[I 2025-01-27 21:22:02,393] Trial 8 finished with value: 0.1549844804902303 and parameters: {'n_estimators': 56, 'learning_rate': 0.04827538542059558}. Best is trial 8 with value: 0.1549844804902303.
[I 2025-01-27 21:31:41,159] Trial 1 finished with value: 0.316185542857681 and parameters: {'n_estimators': 91, 'learning_rate': 0.868649991657305}. Best is trial 1 with value: 0.316185542857681.
[I 2025-01-27 21:33:42,120] Trial 2 finished with value: 0.1549844804902303 and parameters: {'n_estimators': 165, 'learning_rate': 0.07271337108050346}. Best is trial 1 with value: 0.316185542857681.
[I 2025-01-27 21:54:57,577] Trial 7 finished with value: 0.1549844804902303 and parameters: {'n_estimators': 202, 'learning_rate': 0.1629056548240236}. Best is trial 1 with value: 0.316185542857681.
[I 2025-01-27 21:58:03,795] Trial 0 finished with value: 0.1549844804902303 and parameters:

In [6]:
#Best parameters
print(f"Best parameters for AdaBoost: {study.best_params}")

Best parameters for AdaBoost: {'n_estimators': 91, 'learning_rate': 0.868649991657305}


In [7]:
#Final model training with optimized hyperparameters
best_params = study.best_params
optimized_adaboost = AdaBoostClassifier(**best_params, random_state=42, algorithm="SAMME")
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

start_time = time.time()
y_pred = cross_val_predict(optimized_adaboost, X_scaled, y, cv=kfold, n_jobs=-1)
end_time = time.time()


In [8]:
#Evaluate the time it takes to train the model
training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

Model training time: 495.53 seconds


In [9]:
#Classification report
report = classification_report(y, y_pred)
print(f"Classification Report:\n{report}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96   1223737
           1       0.89      1.00      0.94     74663
           2       0.00      0.00      0.00      9991
           3       0.00      0.00      0.00     54900
           4       0.00      0.00      0.00     24951
           5       0.00      0.00      0.00     19977

    accuracy                           0.92   1408219
   macro avg       0.30      0.33      0.32   1408219
weighted avg       0.85      0.92      0.88   1408219



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
