# 05 - Advanced Models

**Goal:** Train Random Forest and XGBoost on the new dataset.
**Output:** Per-model classification reports and ROC AUC scores, saved to `results/new_dataset_analysis/metrics/05_advanced_models_detailed_results.json`.

In [1]:
import pandas as pd
import json
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# --- Configuration -----------------------------------------------------------
# Both locations are relative paths, so they resolve against the kernel's
# current working directory.
PROCESSED_DIR = Path('../data/processed/new_analysis')
METRICS_DIR = Path('../results/new_dataset_analysis/metrics')
# Create the metrics folder (and any missing parents) up front; exist_ok makes
# re-running this cell harmless.
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# --- Load the pre-split train/test data --------------------------------------
print("Loading Data...")
X_train = pd.read_csv(PROCESSED_DIR / 'X_train_scaled.csv')
X_test = pd.read_csv(PROCESSED_DIR / 'X_test_scaled.csv')
# squeeze("columns") turns a single-column frame into a Series. Unlike a bare
# squeeze(), it never collapses a one-row file all the way down to a scalar.
y_train = pd.read_csv(PROCESSED_DIR / 'y_train.csv').squeeze("columns")
y_test = pd.read_csv(PROCESSED_DIR / 'y_test.csv').squeeze("columns")

# Fail fast on misaligned splits instead of surfacing a less obvious error
# later during model fitting or scoring.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"
print("Data Loaded.")

Loading Data...
Data Loaded.


In [2]:
def _score_predictions(y_true, y_pred, y_score):
    """Bundle the evaluation metrics stored for each model.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : hard class predictions for the same rows.
    y_score : positive-class scores for the same rows (used for ROC AUC).

    Returns
    -------
    dict
        ``'report'`` holds ``classification_report`` as a nested dict and
        ``'roc_auc'`` holds the ROC AUC as a plain ``float`` so the whole
        structure is JSON-serialisable.
    """
    return {
        'report': classification_report(y_true, y_pred, output_dict=True),
        'roc_auc': float(roc_auc_score(y_true, y_score)),
    }


# Model name -> metrics dict; written to disk at the end of this cell.
results = {}

# 1. Random Forest
print("Training Random Forest...")
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Column 1 of predict_proba is the score for the second entry of rf.classes_.
results['RandomForest'] = _score_predictions(
    y_test, y_pred_rf, rf.predict_proba(X_test)[:, 1]
)

# 2. XGBoost
print("Training XGBoost...")
# Ratio of negatives to positives, passed as scale_pos_weight.
# NOTE(review): this assumes y_train is encoded as 0/1 so that sum() counts the
# positive class, and that at least one positive exists -- confirm upstream.
scale_pos = (len(y_train) - y_train.sum()) / y_train.sum()
# use_label_encoder was dropped from this call: the installed XGBoost ignores
# it and emits a "Parameters ... are not used" warning on every fit.
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    scale_pos_weight=scale_pos,
    eval_metric='logloss',
    random_state=42,
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
results['XGBoost'] = _score_predictions(
    y_test, y_pred_xgb, xgb.predict_proba(X_test)[:, 1]
)

# Persist both models' metrics side by side in one JSON file.
with open(METRICS_DIR / '05_advanced_models_detailed_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Advanced Results Saved.")

Training Random Forest...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Advanced Results Saved.
