In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict, train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time

In [2]:
notebook_start_time = datetime.now()
print(f"Notebook started at: {notebook_start_time}")

#Decimal Dataset
df = pd.read_csv('complete_decimal_dataset.csv')

Notebook started at: 2025-01-19 17:42:54.406728


In [3]:
#Encoding target variable
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

#Preparing features and target
X = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y = df['specific_class_encoded']

#Scaling features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
#Training and testing sets Splits (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Optimization function for AdaBoost
def optimize_adaboost(trial):
    # Hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)

    clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  
    #scores = cross_val_score(clf, X_train, y_train, cv=kfold, scoring='accuracy', n_jobs=-1)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(clf, X_train, y_train, cv=kfold, scoring=scorer, n_jobs=-1)

    return scores.mean()
            

In [5]:
#AdaBoost hyperparameters optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(optimize_adaboost, n_trials=10, n_jobs=-1)

[I 2025-01-19 17:42:56,354] A new study created in memory with name: no-name-c6df9de9-5ae8-47e9-a43a-e865a6472a0f
[I 2025-01-19 18:14:23,599] Trial 9 finished with value: 0.5872203250271897 and parameters: {'n_estimators': 55, 'learning_rate': 0.14440915076113775}. Best is trial 9 with value: 0.5872203250271897.
[I 2025-01-19 18:35:50,704] Trial 8 finished with value: 0.7411036966675759 and parameters: {'n_estimators': 344, 'learning_rate': 0.07088026608515903}. Best is trial 8 with value: 0.7411036966675759.
[I 2025-01-19 18:39:40,377] Trial 0 finished with value: 0.3934949992002785 and parameters: {'n_estimators': 322, 'learning_rate': 0.017243093905740182}. Best is trial 8 with value: 0.7411036966675759.
[I 2025-01-19 18:51:51,087] Trial 1 finished with value: 0.8605197572935183 and parameters: {'n_estimators': 152, 'learning_rate': 0.7049253235310126}. Best is trial 1 with value: 0.8605197572935183.
[I 2025-01-19 19:01:55,748] Trial 7 finished with value: 0.8960872887784042 and par

In [6]:
print(f"Best parameters for AdaBoost: {study.best_params}")

Best parameters for AdaBoost: {'n_estimators': 169, 'learning_rate': 0.22031826639728547}


In [7]:
#Train and evaluate optimized AdaBoost model
#best_params = study.best_params
#optimized_adaboost = AdaBoostClassifier(**best_params, random_state=42)

In [8]:
#best model training on the entire training set and evaluate on the test set
best_params = study.best_params

final_model = AdaBoostClassifier(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    random_state=42
)
start_time = time.time()
final_model.fit(X_train, y_train)
end_time = time.time()

training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

y_pred = final_model.predict(X_test)

#Classification report on test data
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))



Model training time: 191.41 seconds
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       1.00      0.91      0.95     10902
           4       0.84      1.00      0.91      4968
           5       0.00      0.00      0.00      3989

    accuracy                           0.98    281644
   macro avg       0.80      0.82      0.81    281644
weighted avg       0.97      0.98      0.98    281644



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Log the end time
notebook_end_time = datetime.now()
print(f"Notebook ended at: {notebook_end_time}")

# Calculate the total duration
notebook_duration = notebook_end_time - notebook_start_time
print(f"Total notebook runtime: {notebook_duration}")


Notebook ended at: 2025-01-19 19:19:56.546362
Total notebook runtime: 1:37:02.139634


In [11]:
from sklearn.metrics import classification_report,accuracy_score, make_scorer, precision_score, recall_score, f1_score

#Evaluate on training set
y_train_pred = final_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average="macro")

#Evaluate on test set
y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average="macro")


#Compare results
print(f"Training Accuracy: {train_accuracy:.2f}, Test Accuracy: {test_accuracy:.2f}")
print(f"Training F1 Score: {train_f1:.2f}, Test F1 Score: {test_f1:.2f}")

if train_accuracy > test_accuracy:
    print("The model might be overfitting.")
elif train_accuracy < test_accuracy:
    print("Test set performs better than training.")
else:
    print("Training and test performance are comparable.")

#Classification reports
print("\nClassification Report (Training):")
print(classification_report(y_train, y_train_pred))

print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 0.98, Test Accuracy: 0.98
Training F1 Score: 0.81, Test F1 Score: 0.81
Test set performs better than training.

Classification Report (Training):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99    978947
           1       1.00      1.00      1.00     59642
           2       1.00      1.00      1.00      8017
           3       1.00      0.91      0.95     43998
           4       0.83      1.00      0.91     19983
           5       0.00      0.00      0.00     15988

    accuracy                           0.98   1126575
   macro avg       0.80      0.82      0.81   1126575
weighted avg       0.97      0.98      0.98   1126575

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       1.00      0.91      0.95     10902
           4       0.84      1.00      0.91      4968
           5       0.00      0.00      0.00      3989

    accuracy                           0.98    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
