In [1]:
#Imports
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score, make_scorer, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time

In [2]:
#Load Dataset
df = pd.read_csv('complete_decimal_dataset.csv')

In [3]:
#Data Preprocessing
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

X_full = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y_full = df['specific_class_encoded']

scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)

In [4]:
#Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_full_scaled, y_full, test_size=0.2, random_state=42)

In [5]:
#Optimization Function
def optimize_random_forest(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_samples_split=min_samples_split, 
        random_state=42,
        n_jobs=-1
    )

    #5-Fold CV
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(rf_model, X_train, y_train, cv=stratified_kfold, scoring=scorer, n_jobs=-1).mean()
    return scores


In [6]:
#Optuna study with stratified cross-validation
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(optimize_random_forest, n_trials=10)

[I 2025-01-27 17:00:33,976] A new study created in memory with name: no-name-7ace95d7-0736-49c7-a7b1-50b74fda3190
[I 2025-01-27 17:03:59,623] Trial 0 finished with value: 0.9999801739325107 and parameters: {'n_estimators': 135, 'max_depth': 27, 'min_samples_split': 9}. Best is trial 0 with value: 0.9999801739325107.
[I 2025-01-27 17:06:10,909] Trial 1 finished with value: 0.8347057732161935 and parameters: {'n_estimators': 95, 'max_depth': 4, 'min_samples_split': 7}. Best is trial 0 with value: 0.9999801739325107.
[I 2025-01-27 17:08:15,024] Trial 2 finished with value: 0.9999485626547765 and parameters: {'n_estimators': 88, 'max_depth': 5, 'min_samples_split': 9}. Best is trial 0 with value: 0.9999801739325107.
[I 2025-01-27 17:11:01,704] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 112, 'max_depth': 25, 'min_samples_split': 2}. Best is trial 3 with value: 1.0.
[I 2025-01-27 17:14:38,708] Trial 4 finished with value: 0.4119650045953261 and parameters: {'n_estimato

In [7]:
#Best hyperparameters
print(f"Best parameters for Random Forest: {study_rf.best_params}")

Best parameters for Random Forest: {'n_estimators': 112, 'max_depth': 25, 'min_samples_split': 2}


In [9]:
#Final model training on the training set with optimized hyperparameters
best_params = study_rf.best_params
final_model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42
)

start_time = time.time()
#Train
final_model.fit(X_train, y_train)
end_time = time.time()


In [10]:
#Evaluate the time it takes to train the model
training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")


Model training time: 120.16 seconds


In [11]:
#Evaluate on training set
y_train_pred = final_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average="macro")

#Evaluate on test set
y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average="macro")


#Compare results
print(f"Training Accuracy: {train_accuracy:.2f}, Test Accuracy: {test_accuracy:.2f}")
print(f"Training F1 Score: {train_f1:.2f}, Test F1 Score: {test_f1:.2f}")

if train_accuracy > test_accuracy:
    print("The model might be overfitting.")
elif train_accuracy < test_accuracy:
    print("Test set performs better than training.")
else:
    print("Training and test performance are comparable.")

#Classification reports
print("\nClassification Report (Training):")
print(classification_report(y_train, y_train_pred))

print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 1.00, Test Accuracy: 1.00
Training F1 Score: 1.00, Test F1 Score: 1.00
Training and test performance are comparable.

Classification Report (Training):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    978947
           1       1.00      1.00      1.00     59642
           2       1.00      1.00      1.00      8017
           3       1.00      1.00      1.00     43998
           4       1.00      1.00      1.00     19983
           5       1.00      1.00      1.00     15988

    accuracy                           1.00   1126575
   macro avg       1.00      1.00      1.00   1126575
weighted avg       1.00      1.00      1.00   1126575

Classification Report (Test):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       1.00      1.00      