In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time
from xgboost import XGBClassifier
import xgboost as xgb

In [2]:
notebook_start_time = datetime.now()
print(f"Notebook started at: {notebook_start_time}")

#Load the dataset
df = pd.read_csv('complete_decimal_dataset.csv')

#Encode target variable
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

#Prepare features and target
X = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y = df['specific_class_encoded']

Notebook started at: 2025-01-20 09:21:34.214107


In [3]:
#Apply scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [4]:
#Training and testing sets Splits (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

def optimize_xgboost(trial):
    #hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
    
    #XGBoost model with trial parameters
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    )
    
    #5-fold stratified cross-validation
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(model,X_train, y_train, cv=kfold, scoring=scorer, n_jobs=-1)
    
    return scores.mean()

In [5]:
#study for XGBoost
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(optimize_xgboost, n_trials=10, n_jobs=-1)

#best parameters
print(f"Best parameters for XGBoost: {study_xgb.best_params}")


[I 2025-01-20 09:21:36,220] A new study created in memory with name: no-name-533439d9-9d4c-44b2-9ce9-d8bf367fd0e8
[I 2025-01-20 09:31:42,291] Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 346, 'max_depth': 4, 'learning_rate': 0.08257351184613305, 'subsample': 0.820421224719867, 'colsample_bytree': 0.6442582375605579}. Best is trial 5 with value: 1.0.
[I 2025-01-20 09:42:58,601] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 158, 'max_depth': 3, 'learning_rate': 0.12508639816210868, 'subsample': 0.7527043695523635, 'colsample_bytree': 0.7081917037834583}. Best is trial 5 with value: 1.0.
[I 2025-01-20 09:43:08,817] Trial 7 finished with value: 1.0 and parameters: {'n_estimators': 444, 'max_depth': 9, 'learning_rate': 0.15145476659553136, 'subsample': 0.6099935488933788, 'colsample_bytree': 0.761153229242018}. Best is trial 5 with value: 1.0.
[I 2025-01-20 09:46:34,436] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 170, 'max_dept

Best parameters for XGBoost: {'n_estimators': 346, 'max_depth': 4, 'learning_rate': 0.08257351184613305, 'subsample': 0.820421224719867, 'colsample_bytree': 0.6442582375605579}


In [6]:
#Prepare parameters from Optuna
best_params = study_xgb.best_params
best_params.update({
    'objective': 'multi:softmax',
    'num_class': len(y.unique()),
    'eval_metric': 'mlogloss',
    'random_state': 42
})

In [7]:
#Convert train and test sets to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

start_time = time.time()
#Train the model with early stopping
model = xgb.train(
    best_params,
    dtrain,
    num_boost_round=best_params.get("n_estimators"),
    evals=[(dtest, 'eval')],
    early_stopping_rounds=10,
    verbose_eval=False
)
end_time = time.time()

training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

y_pred = model.predict(dtest).astype(int)

#Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='macro')
print("F1 Score:", f1)


Parameters: { "n_estimators" } are not used.



Model training time: 105.84 seconds
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       1.00      1.00      1.00     10902
           4       1.00      1.00      1.00      4968
           5       1.00      1.00      1.00      3989

    accuracy                           1.00    281644
   macro avg       1.00      1.00      1.00    281644
weighted avg       1.00      1.00      1.00    281644

F1 Score: 1.0


In [8]:
# Log the end time
notebook_end_time = datetime.now()
print(f"Notebook ended at: {notebook_end_time}")

# Calculate the total duration
notebook_duration = notebook_end_time - notebook_start_time
print(f"Total notebook runtime: {notebook_duration}")

Notebook ended at: 2025-01-20 09:53:43.644423
Total notebook runtime: 0:32:09.430316
