In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time

In [2]:
notebook_start_time = datetime.now()
print(f"Notebook started at: {notebook_start_time}")
df = pd.read_csv('complete_decimal_dataset.csv')

Notebook started at: 2025-01-21 16:05:45.267963


In [3]:
#Encoding target variable
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

#Preparing features and target
X_full = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y_full = df['specific_class_encoded']

#Scaling features
scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)

In [4]:
#Training and testing sets Splits (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_full_scaled, y_full, test_size=0.2, random_state=42)

#Optimization function for ExtraTrees
def optimize_extra_trees(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 10, 50, log=True)
    
    clf = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(clf, X_train, y_train, cv=kfold, scoring=scorer, n_jobs=-1)

    
    return scores.mean()

In [5]:
#hyperparameters optimization using Optuna
study_et = optuna.create_study(direction="maximize")
study_et.optimize(optimize_extra_trees, n_trials=10, n_jobs=-1)

[I 2025-01-21 16:05:47,030] A new study created in memory with name: no-name-17c71f4f-a5aa-425b-a39c-f93bd3696346
[I 2025-01-21 16:12:09,153] Trial 6 finished with value: 0.9999485626547765 and parameters: {'n_estimators': 142, 'max_depth': 11}. Best is trial 6 with value: 0.9999485626547765.
[I 2025-01-21 16:16:19,703] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 94, 'max_depth': 16}. Best is trial 3 with value: 1.0.
[I 2025-01-21 16:24:40,230] Trial 1 finished with value: 0.9999485626547765 and parameters: {'n_estimators': 292, 'max_depth': 11}. Best is trial 3 with value: 1.0.
[I 2025-01-21 16:26:48,680] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 184, 'max_depth': 45}. Best is trial 3 with value: 1.0.
[I 2025-01-21 16:27:34,733] Trial 0 finished with value: 0.9848411367251817 and parameters: {'n_estimators': 75, 'max_depth': 10}. Best is trial 3 with value: 1.0.
[I 2025-01-21 16:33:10,328] Trial 7 finished with value: 1.0 and parameters: {

In [6]:
print(f"Best parameters for ExtraTrees: {study_et.best_params}")

Best parameters for ExtraTrees: {'n_estimators': 94, 'max_depth': 16}


In [7]:
#Model with the best parameters from Optuna
best_et = ExtraTreesClassifier(**study_et.best_params, random_state=42)

In [8]:
start_time = time.time()
best_et.fit(X_train, y_train)
end_time = time.time()

training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

y_pred = best_et.predict(X_test)

#Classification report on test data
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Model training time: 46.53 seconds
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       1.00      1.00      1.00     10902
           4       1.00      1.00      1.00      4968
           5       1.00      1.00      1.00      3989

    accuracy                           1.00    281644
   macro avg       1.00      1.00      1.00    281644
weighted avg       1.00      1.00      1.00    281644



In [9]:
# Log the end time
notebook_end_time = datetime.now()
print(f"Notebook ended at: {notebook_end_time}")

# Calculate the total duration
notebook_duration = notebook_end_time - notebook_start_time
print(f"Total notebook runtime: {notebook_duration}")

Notebook ended at: 2025-01-21 16:36:54.630344
Total notebook runtime: 0:31:09.362381


In [10]:
from sklearn.metrics import classification_report,accuracy_score, make_scorer, precision_score, recall_score, f1_score

#Evaluate on training set
y_train_pred = best_et.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average="macro")

#Evaluate on test set
y_test_pred = best_et.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average="macro")


#Compare results
print(f"Training Accuracy: {train_accuracy:.2f}, Test Accuracy: {test_accuracy:.2f}")
print(f"Training F1 Score: {train_f1:.2f}, Test F1 Score: {test_f1:.2f}")

if train_accuracy > test_accuracy:
    print("The model might be overfitting.")
elif train_accuracy < test_accuracy:
    print("Test set performs better than training.")
else:
    print("Training and test performance are comparable.")

#Classification reports
print("\nClassification Report (Training):")
print(classification_report(y_train, y_train_pred))

print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 1.00, Test Accuracy: 1.00
Training F1 Score: 1.00, Test F1 Score: 1.00
Training and test performance are comparable.

Classification Report (Training):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    978947
           1       1.00      1.00      1.00     59642
           2       1.00      1.00      1.00      8017
           3       1.00      1.00      1.00     43998
           4       1.00      1.00      1.00     19983
           5       1.00      1.00      1.00     15988

    accuracy                           1.00   1126575
   macro avg       1.00      1.00      1.00   1126575
weighted avg       1.00      1.00      1.00   1126575

Classification Report (Test):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       1.00      1.00      