In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Add SRC to sys.path
sys.path.append("/Users/nedim/Desktop/MY_REPOS/mids-w207-section1-team1-finalproject/src")

from data_prep_dropmissing import load_and_prepare_data_dropmissing

import xgboost as xgb
print("XGBoost Version:", xgb.__version__)

XGBoost Version: 1.7.6


In [2]:
csv_path = "../data/external/FPA_FOD_Plus.csv"

print("Loading data (drop missing + SMOTENC)...")
X_train, Y_train, X_val, Y_val, X_test, Y_test = load_and_prepare_data_dropmissing(
    csv_path,
    smote_strategy="smotenc",    # oversampling for imbalance
    sample_size=500_000          # restricting sample size
)

print("\nShapes:")
print("Train:", X_train.shape, Y_train.shape)
print("Val  :", X_val.shape, Y_val.shape)
print("Test :", X_test.shape, Y_test.shape)


Loading data (drop missing + SMOTENC)...

Loading with Polars...
âž¡ Loaded: (2302521, 308)

ðŸ§¹ Dropping rows with ANY missing value...
âž¡ Removed 1,312,209 rows â€” Remaining: 990,312
âœ‚ Training down-sampled âžœ (500000, 45)

Applying SMOTENC...

Shapes:
Train: (1948208, 48) (1948208,)
Val  : (198063, 48) (198063,)
Test : (198063, 48) (198063,)


In [3]:
model_xgb = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    objective="multi:softprob",
    num_class=4,
    tree_method="hist",     # faster on my cpu
    eval_metric="mlogloss",
    random_state=42
)

print("\nXGBoost model created.")


XGBoost model created.


In [4]:
print("\nTraining XGBoost with Early Stopping...\n")

eval_set = [(X_train, Y_train), (X_val, Y_val)]

model_xgb.fit(
    X_train, Y_train,
    eval_set=eval_set,
    early_stopping_rounds=15,
    verbose=True
)

print("\nTraining Complete")


Training XGBoost with Early Stopping...





[0]	validation_0-mlogloss:1.34706	validation_1-mlogloss:1.33588
[1]	validation_0-mlogloss:1.31051	validation_1-mlogloss:1.29099
[2]	validation_0-mlogloss:1.27600	validation_1-mlogloss:1.24832
[3]	validation_0-mlogloss:1.24400	validation_1-mlogloss:1.21004
[4]	validation_0-mlogloss:1.21266	validation_1-mlogloss:1.17379
[5]	validation_0-mlogloss:1.18328	validation_1-mlogloss:1.13937
[6]	validation_0-mlogloss:1.15734	validation_1-mlogloss:1.10820
[7]	validation_0-mlogloss:1.13186	validation_1-mlogloss:1.07920
[8]	validation_0-mlogloss:1.10811	validation_1-mlogloss:1.05194
[9]	validation_0-mlogloss:1.08507	validation_1-mlogloss:1.02772
[10]	validation_0-mlogloss:1.06340	validation_1-mlogloss:1.00403
[11]	validation_0-mlogloss:1.04256	validation_1-mlogloss:0.98147
[12]	validation_0-mlogloss:1.02236	validation_1-mlogloss:0.96009
[13]	validation_0-mlogloss:1.00315	validation_1-mlogloss:0.93882
[14]	validation_0-mlogloss:0.98441	validation_1-mlogloss:0.91943
[15]	validation_0-mlogloss:0.96609	

In [5]:
print("\nEvaluating on Validation Set...\n")

preds = np.argmax(model_xgb.predict_proba(X_val), axis=1)

print("Accuracy:", accuracy_score(Y_val, preds))
print("\nClassification Report:\n", classification_report(Y_val, preds))
print("\nConfusion Matrix:\n", confusion_matrix(Y_val, preds))


Evaluating on Validation Set...

Accuracy: 0.9323599056865745

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.97    192935
           1       0.12      0.21      0.15      4768
           2       0.02      0.13      0.04       282
           3       0.02      0.09      0.03        78

    accuracy                           0.93    198063
   macro avg       0.28      0.35      0.30    198063
weighted avg       0.96      0.93      0.94    198063


Confusion Matrix:
 [[183609   7675   1298    353]
 [  3565   1013    159     31]
 [   189     48     37      8]
 [    56      8      7      7]]
