In [1]:
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

# Silence every Python warning for the rest of the session. This is a blanket
# filter: it also hides warnings issued by the libraries imported above, so
# remove it temporarily when debugging unexpected behaviour.
warnings.filterwarnings('ignore')

In [2]:
# Read the preprocessed dataset from disk.
df = pd.read_csv("../data/processed/dataset_processed.csv")

# Columns are split by position: everything before index 51 (minus the
# 'file_path' column) is used as model input, everything from index 51 onward
# is used as the multi-column target.
label_start = 51
X = df.iloc[:, :label_start].drop(columns=['file_path'])
Y = df.iloc[:, label_start:]

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [3]:
# Report the size of the full frame and of each split's feature matrix.
print(f"Dataset shape:  {df.shape}")
print(
    f"✓ Training set: {X_train.shape}",
    f"✓ Test set: {X_test.shape}",
    sep="\n",
)

Dataset shape:  (3088, 64)
✓ Training set: (2470, 50)
✓ Test set: (618, 50)


In [4]:
# Baseline: an XGBoost classifier with default hyperparameters.
#
# Constructing an XGBClassifier only stores its parameters, so a missing or
# broken GPU cannot surface at construction time. The fit therefore runs
# inside the try block: if training on CUDA raises, the model is rebuilt
# without the device override and trained on the CPU instead. The success
# message is printed only after the GPU fit has actually completed.
# NOTE(review): some XGBoost builds only log a warning and silently train on
# the CPU when no GPU is visible, so reaching the success message is not a
# hard guarantee that CUDA was used - confirm for the installed version.
xgb_common_params = {'random_state': 42, 'n_jobs': -1, 'eval_metric': 'logloss'}

try:
    xgb_baseline = xgb.XGBClassifier(**xgb_common_params, device='cuda')
    xgb_baseline.fit(X_train, Y_train)
    print("✓ GPU is available and working!")
except Exception as e:
    print("❌ GPU not available")
    print(f"Error: {e}")
    print("→ Falling back to CPU")
    xgb_baseline = xgb.XGBClassifier(**xgb_common_params)
    xgb_baseline.fit(X_train, Y_train)

# NOTE(review): this wrapper is created but never fitted in this cell; the
# model that is trained and evaluated as the baseline is the bare
# XGBClassifier above (fitted directly on the multi-column Y), while the tuned
# model later in the notebook is a MultiOutputClassifier. The baseline and
# tuned scores therefore come from two different modelling setups - confirm
# whether `multi_baseline.fit(...)` was the intent.
multi_baseline = MultiOutputClassifier(xgb_baseline)
print("✓ XGBoost model trained!")

✓ GPU is available and working!
✓ XGBoost model trained!


In [5]:
# Run the fitted baseline over both splits (train first, then test) so that
# training and held-out performance can be compared.
y_pred_baseline_train, y_pred_baseline_test = (
    xgb_baseline.predict(features) for features in (X_train, X_test)
)

In [6]:
# Score the baseline on each split. With a multi-column target,
# accuracy_score counts a row as correct only when every label matches
# (subset accuracy), so it is a strict metric.
train_acc, test_acc = (
    accuracy_score(truth, predicted)
    for truth, predicted in (
        (Y_train, y_pred_baseline_train),
        (Y_test, y_pred_baseline_test),
    )
)

print(f"  Training Accuracy: {train_acc:.4f}\n  Test Accuracy: {test_acc:.4f}")

  Training Accuracy: 0.9996
  Test Accuracy: 0.9029


In [7]:
# Hyperparameter Tuning with GridSearchCV
# Candidate values per XGBoost hyperparameter. The full Cartesian product is
# 3 * 2 * 2 * 3 * 3 * 3 * 3 = 972 combinations.
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.1, 0.3],
    'n_estimators': [100, 200],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}

# Constructing an XGBClassifier only stores its parameters, so it cannot fail
# because of a missing GPU. The search itself is fitted in a later cell, so a
# throwaway one-tree model is trained on a tiny toy problem here to exercise
# the CUDA device before the GPU estimator is selected.
# NOTE(review): some XGBoost builds only log a warning and silently train on
# the CPU when no GPU is visible; in that case the probe succeeds as well -
# confirm for the installed version.
xgb_common_params = {'random_state': 42, 'n_jobs': -1, 'eval_metric': 'logloss'}

try:
    xgb.XGBClassifier(n_estimators=1, device='cuda').fit(
        np.array([[0.0], [1.0], [0.0], [1.0]]),
        np.array([0, 1, 0, 1]),
    )
    xgb_grid = xgb.XGBClassifier(**xgb_common_params, device='cuda')
    print("✓ GPU is available and working!")
except Exception as e:
    print("❌ GPU not available")
    print(f"Error: {e}")
    print("→ Falling back to CPU")
    xgb_grid = xgb.XGBClassifier(**xgb_common_params)

# One independent copy of the classifier is trained per target column.
multi_multi_grid = MultiOutputClassifier(xgb_grid)

✓ GPU is available and working!


In [None]:
# MultiOutputClassifier exposes the wrapped model's hyperparameters under the
# "estimator__" prefix, so every key of param_grid is re-addressed before it
# is handed to the search.
prefixed_grid = {
    f'estimator__{name}': candidates for name, candidates in param_grid.items()
}

# Exhaustive search, 3-fold cross-validation, ranked by micro-averaged F1.
# NOTE(review): the search runs with n_jobs=-1 while the wrapped classifier
# was also built with n_jobs=-1 (and possibly device='cuda'); nested
# parallelism can oversubscribe the CPU or a single GPU - consider
# parallelising at one level only.
search_options = dict(scoring='f1_micro', cv=3, verbose=1, n_jobs=-1)
grid_search = GridSearchCV(multi_multi_grid, prefixed_grid, **search_options)

grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits


In [None]:
# Strip the "estimator__" routing prefix from the winning combination so the
# values can be passed straight to an XGBClassifier constructor.
best_params = {
    prefixed_name.replace('estimator__', ''): chosen_value
    for prefixed_name, chosen_value in grid_search.best_params_.items()
}

# Header line followed by one indented "name: value" line per hyperparameter.
print(
    "Best Parameters:",
    *(f"  {param}: {value}" for param, value in best_params.items()),
    sep="\n",
)

In [None]:
print("RETRAINING TUNED MODEL")
print("="*60)

# Best hyperparameters from the grid search plus the fixed settings used for
# every model in this notebook.
final_params = {**best_params, 'random_state': 42, 'n_jobs': -1, 'eval_metric': 'logloss'}

# Constructing an XGBClassifier only stores its parameters, so a missing GPU
# cannot surface at construction time. The fit runs inside the try block: if
# training on CUDA raises, the model is rebuilt without the device override
# and trained on the CPU instead. The success message is printed only after
# the GPU fit has completed.
# NOTE(review): some XGBoost builds only log a warning and silently train on
# the CPU when no GPU is visible - confirm for the installed version.
try:
    xgb_final = xgb.XGBClassifier(**final_params, device='cuda')
    tuned_model = MultiOutputClassifier(xgb_final)
    tuned_model.fit(X_train, Y_train)
    print("✓ GPU is available and working!")
except Exception as e:
    print("❌ GPU not available")
    print(f"Error: {e}")
    print("→ Falling back to CPU")
    xgb_final = xgb.XGBClassifier(**final_params)
    tuned_model = MultiOutputClassifier(xgb_final)
    tuned_model.fit(X_train, Y_train)

# Keep the fitted estimator as the cell's displayed value.
tuned_model

In [None]:
# Predict with the retrained, tuned model on both splits (train first, then
# test) for the evaluation cells that follow.
y_pred_tuned_train, y_pred_tuned_test = (
    tuned_model.predict(features) for features in (X_train, X_test)
)

In [None]:
# Score the tuned model on each split. With a multi-column target,
# accuracy_score counts a row as correct only when every label matches
# (subset accuracy).
train_acc_tuned, test_acc_tuned = (
    accuracy_score(truth, predicted)
    for truth, predicted in (
        (Y_train, y_pred_tuned_train),
        (Y_test, y_pred_tuned_test),
    )
)

print(f"  Training Accuracy: {train_acc_tuned:.4f}\n  Test Accuracy: {test_acc_tuned:.4f}")

In [None]:
# Header followed by a 60-character rule.
print("Classification Report:\n" + "-" * 60)

# Per-label precision / recall / F1 on the held-out split, with rows named
# after the target columns. zero_division=0 reports 0 (instead of warning)
# for labels where a metric's denominator is zero.
report_text = classification_report(
    Y_test,
    y_pred_tuned_test,
    target_names=Y.columns,
    zero_division=0,
)
print(report_text)

In [None]:
print("Tuned Model Confusion Matrices")
# One 2x2 matrix per target column, computed on the held-out split.
mcm_tuned = multilabel_confusion_matrix(Y_test, y_pred_tuned_test)

# Size the subplot grid from the number of labels instead of hard-coding 3x5,
# so no label is silently dropped when there are more than 15 targets. Each
# panel keeps a 4x4 inch footprint, which yields the same 20x12 figure as
# before for 11-15 labels.
label_names = list(Y.columns)
n_cols = 5
n_rows = max(1, int(np.ceil(len(label_names) / n_cols)))

# squeeze=False guarantees a 2-D array of axes even for a single row, so
# ravel() is always valid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col, cm in zip(axes, label_names, mcm_tuned):
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Greens', cbar=False)
    ax.set_title(f'{col}')
    ax.set_ylabel('True')
    ax.set_xlabel('Predicted')

# Hide the unused panels in the last row.
for ax in axes[len(label_names):]:
    ax.axis('off')

fig.suptitle('Tuned Model - Confusion Matrices', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# The original header used "\F", an invalid escape sequence that printed a
# literal backslash; a leading newline was intended.
print("\nFeature Importance Analysis")

# Extract first estimator from the tuned MultiOutputClassifier.
# MultiOutputClassifier fits one model per target column, so these
# importances describe only the model for the first target (Y.columns[0]),
# not the multi-label model as a whole.
first_estimator = tuned_model.estimators_[0]
importance = first_estimator.feature_importances_

# Pair each input column with its importance score, highest first.
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importance
}).sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importance_df.head(15).to_string(index=False))

# Horizontal bar chart of the 20 highest-scoring features; the y axis is
# inverted so the most important feature appears at the top.
fig, ax = plt.subplots(figsize=(12, 8))
top_features = feature_importance_df.head(20)
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance Score')
ax.set_title('Top 20 Feature Importance')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
print("\n[Saving Model]")

# Create the output directory first: open() does not create missing parent
# directories, and failing here would discard the result of the expensive
# grid search and retraining.
model_path = Path('../models/dataset_xgboost_tuned.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# The model is stored with pickle; only load this file back from a trusted
# location, since unpickling can execute arbitrary code.
with model_path.open('wb') as f:
    pickle.dump(tuned_model, f)

print("✓ Saved 'dataset_xgboost_tuned.pkl' (Ready for future use)")