In [1]:
#Imports
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time

In [2]:
#Load the dataset
df = pd.read_csv('complete_decimal_dataset.csv')

In [3]:
#Data Preprocessing
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

X_full = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y_full = df['specific_class_encoded']

scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)

In [4]:
#Optimization function
def optimize_extra_trees(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 10, 50, log=True)
    
    et_model = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(et_model, X_full_scaled, y_full, cv=kfold, scoring=scorer, n_jobs=-1).mean()
    
    return scores

In [5]:
#Optuna study with stratified cross-validation
study_et = optuna.create_study(direction="maximize")
study_et.optimize(optimize_extra_trees, n_trials=10, n_jobs=-1)

[I 2025-01-27 22:31:07,374] A new study created in memory with name: no-name-1cc32f8e-dc2a-4795-9f9f-5ca50d506902
[I 2025-01-27 22:52:37,572] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 50, 'max_depth': 34}. Best is trial 0 with value: 1.0.
[I 2025-01-27 23:07:23,821] Trial 1 finished with value: 0.9999904829813717 and parameters: {'n_estimators': 239, 'max_depth': 14}. Best is trial 0 with value: 1.0.
[I 2025-01-27 23:10:51,272] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 55, 'max_depth': 36}. Best is trial 0 with value: 1.0.
[I 2025-01-27 23:12:54,916] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 71, 'max_depth': 32}. Best is trial 0 with value: 1.0.
[I 2025-01-27 23:17:30,994] Trial 6 finished with value: 0.9999518198164484 and parameters: {'n_estimators': 136, 'max_depth': 11}. Best is trial 0 with value: 1.0.
[I 2025-01-27 23:18:25,393] Trial 7 finished with value: 1.0 and parameters: {'n_estimators': 167, 'max_depth

In [6]:
#Best parameters
print(f"Best parameters for ExtraTrees: {study_et.best_params}")

Best parameters for ExtraTrees: {'n_estimators': 50, 'max_depth': 34}


In [7]:
#Final model training with optimized hyperparameters
best_et = ExtraTreesClassifier(**study_et.best_params, random_state=42)

#10-fold cross-validation with predictions
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
start_time = time.time()
y_pred = cross_val_predict(best_et, X_full_scaled, y_full, cv=kfold, n_jobs=-1)
end_time = time.time()

training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

Model training time: 220.58 seconds


In [8]:
#Classification report
report = classification_report(y_full, y_pred)
print(f"ExtraTrees Classification Report (Full Training):\n{report}")

ExtraTrees Classification Report (Full Training):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1223737
           1       1.00      1.00      1.00     74663
           2       1.00      1.00      1.00      9991
           3       1.00      1.00      1.00     54900
           4       1.00      1.00      1.00     24951
           5       1.00      1.00      1.00     19977

    accuracy                           1.00   1408219
   macro avg       1.00      1.00      1.00   1408219
weighted avg       1.00      1.00      1.00   1408219

