In [1]:
#Imports
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time

In [2]:
#Load the dataset
df = pd.read_csv('complete_decimal_dataset.csv')

In [3]:
#Data Preprocessing
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

X_full = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y_full = df['specific_class_encoded']

scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)

In [4]:
#Optimization Function
def optimize_random_forest(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_samples_split=min_samples_split, 
        random_state=42,
        n_jobs=-1
    )
    
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(rf_model, X_full_scaled, y_full, cv=stratified_kfold, scoring=scorer, n_jobs=-1).mean()
    
    return scores

In [5]:
#Optuna study with stratified cross-validation
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(optimize_random_forest, n_trials=10, n_jobs=-1)

[I 2025-01-27 19:45:47,980] A new study created in memory with name: no-name-86d28f6a-8566-4270-bb45-eccedf870dc1
[I 2025-01-27 19:57:26,317] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 57, 'max_depth': 8, 'min_samples_split': 6}. Best is trial 0 with value: 1.0.
[I 2025-01-27 20:00:42,990] Trial 8 finished with value: 1.0 and parameters: {'n_estimators': 113, 'max_depth': 17, 'min_samples_split': 5}. Best is trial 0 with value: 1.0.
[I 2025-01-27 20:02:41,540] Trial 6 finished with value: 0.1549844804902303 and parameters: {'n_estimators': 178, 'max_depth': 2, 'min_samples_split': 3}. Best is trial 0 with value: 1.0.
[I 2025-01-27 20:06:26,402] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 153, 'max_depth': 32, 'min_samples_split': 6}. Best is trial 0 with value: 1.0.
[I 2025-01-27 20:11:17,270] Trial 9 finished with value: 1.0 and parameters: {'n_estimators': 135, 'max_depth': 32, 'min_samples_split': 2}. Best is trial 0 with value: 1.0.
[I 2

In [6]:
#Best parameters
print(f"Best parameters for Random Forest: {study_rf.best_params}")

Best parameters for Random Forest: {'n_estimators': 57, 'max_depth': 8, 'min_samples_split': 6}


In [7]:
#Final model training with optimized hyperparameters
best_rf = RandomForestClassifier(**study_rf.best_params, random_state=42)
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

start_time = time.time()
y_pred = cross_val_predict(best_rf, X_full_scaled, y_full, cv=stratified_kfold, n_jobs=-1)
end_time = time.time()


In [8]:
#Evaluate the time it takes to train the model
training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

Model training time: 238.20 seconds


In [9]:
report = classification_report(y_full, y_pred)
print("Classification Report:\n")
print(report)

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1223737
           1       1.00      1.00      1.00     74663
           2       1.00      1.00      1.00      9991
           3       1.00      1.00      1.00     54900
           4       1.00      1.00      1.00     24951
           5       1.00      1.00      1.00     19977

    accuracy                           1.00   1408219
   macro avg       1.00      1.00      1.00   1408219
weighted avg       1.00      1.00      1.00   1408219

