In [13]:
#Imports
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time
from xgboost import XGBClassifier
import xgboost as xgb
import numpy as np
from xgboost.callback import EarlyStopping

In [2]:
#Load the dataset
df = pd.read_csv('complete_decimal_dataset.csv')

In [3]:
#Data Preprocessing
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

X = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y = df['specific_class_encoded']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
#Optimization Function
def optimize_xgboost(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
    
    #XGBoost model with trial parameters
    xgboost = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
        n_jobs=-1
    )
    
    #5-fold CV
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(xgboost, X_scaled, y, cv=kfold, scoring=scorer, n_jobs=-1).mean()
    
    return scores

In [5]:
#Optuna study with stratified cross-validation
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(optimize_xgboost, n_trials=10, n_jobs=-1)

#Best parameters
print(f"Best parameters for XGBoost: {study_xgb.best_params}")


[I 2025-01-27 17:49:06,644] A new study created in memory with name: no-name-6f539236-546e-4a7e-b4b2-1bcd87a9eb47
[I 2025-01-27 18:05:40,989] Trial 6 finished with value: 1.0 and parameters: {'n_estimators': 338, 'max_depth': 6, 'learning_rate': 0.12486607179583395, 'subsample': 0.8398820455022838, 'colsample_bytree': 0.8885000829639664}. Best is trial 6 with value: 1.0.
[I 2025-01-27 18:11:11,636] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 232, 'max_depth': 6, 'learning_rate': 0.05109384543150596, 'subsample': 0.7451503942767111, 'colsample_bytree': 0.6688872459787522}. Best is trial 6 with value: 1.0.
[I 2025-01-27 18:19:24,364] Trial 0 finished with value: 0.9999518198164484 and parameters: {'n_estimators': 160, 'max_depth': 10, 'learning_rate': 0.017788940342930517, 'subsample': 0.6819764651416846, 'colsample_bytree': 0.7241590597216233}. Best is trial 6 with value: 1.0.
[I 2025-01-27 18:25:50,664] Trial 7 finished with value: 1.0 and parameters: {'n_estimato

Best parameters for XGBoost: {'n_estimators': 338, 'max_depth': 6, 'learning_rate': 0.12486607179583395, 'subsample': 0.8398820455022838, 'colsample_bytree': 0.8885000829639664}


In [6]:
#Prepare parameters from Optuna
best_params = study_xgb.best_params
best_params.update({
    'objective': 'multi:softmax',
    'num_class': len(y.unique()),
    'eval_metric': 'mlogloss',
    'random_state': 42
})

In [18]:
#Final model training with optimized hyperparameters

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

#Initialize predictions
y_pred = np.zeros(len(y))
start_time = time.time()

#Manual cross-validation
for train_idx, val_idx in stratified_kfold.split(X_scaled, y):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Convert data to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Train model with early stopping
    xgboost = xgb.train(
        best_params,
        dtrain,
        num_boost_round=best_params.get("n_estimators"),
        evals=[(dval, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=False
)
       
    #Predictions for the validation set
    y_pred[val_idx] = xgboost.predict(dval)


end_time = time.time()


Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.



In [19]:
#Evaluate the time it takes to train the model
training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

Model training time: 1532.28 seconds


In [20]:
#Evaluate the model
print("Classification Report:\n", classification_report(y, y_pred))
f1 = f1_score(y, y_pred, average='macro')
print("F1 Score:", f1)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1223737
           1       1.00      1.00      1.00     74663
           2       1.00      1.00      1.00      9991
           3       1.00      1.00      1.00     54900
           4       1.00      1.00      1.00     24951
           5       1.00      1.00      1.00     19977

    accuracy                           1.00   1408219
   macro avg       1.00      1.00      1.00   1408219
weighted avg       1.00      1.00      1.00   1408219

F1 Score: 1.0
