# 2. Model Training (Production)
This notebook loads the processed data and trains the final **XGBoost Classifier** using the optimal hyperparameters found during Phase 1 optimization.

**Model Accuracy Target**: ~73.8%

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
# 1. Load Processed Data
# Reads the processed CSV, separates the "cardio" target from the features,
# then produces a stratified 80/20 train/test split with standardized features.
data_path = "../data/processed/final_dataset.csv"
df = pd.read_csv(data_path)

X = df.drop("cardio", axis=1)
y = df["cardio"]

# Split BEFORE scaling so that test-set statistics never influence the scaler
# (fitting the scaler on the full dataset leaks test information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize (Optional for XGBoost but good practice).
# The scaler learns mean/std from the training rows only; the test rows are
# transformed with those same statistics. The original row index is kept so
# the feature frames stay aligned with y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Kept for backward compatibility with any code that still reads X_scaled:
# the full feature matrix transformed with the train-fitted scaler.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

print(f"Training Samples: {X_train.shape[0]}")
print(f"Test Samples:     {X_test.shape[0]}")

Training Samples: 55869
Test Samples:     13968


In [3]:
# 2. Define Champion Model (Phase 1 Winner)
# Phase 1 reported ~73.78% accuracy with no overfitting for these parameters;
# that figure is not re-verified in this cell.

params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_child_weight': 3,
    'gamma': 0.5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
    'random_state': 42,
    # NOTE: 'use_label_encoder' was removed from this dict. Current XGBoost
    # no longer consumes it and only emits a
    # 'Parameters: { "use_label_encoder" } are not used.' warning during fit.
}

# Fit the classifier on the training split produced by the data-loading cell.
model = XGBClassifier(**params)
print("Training Model...")
model.fit(X_train, y_train)
print("Training Complete.")

Training Model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Complete.


In [4]:
# 3. Validation
# Predict on the test split, then report overall accuracy followed by the
# per-class precision / recall / f1 table.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
accuracy_pct = acc * 100
report_text = classification_report(y_test, y_pred)

print(f"\nFINAL TEST ACCURACY: {accuracy_pct:.2f}%")
print("\nClassification Report:")
print(report_text)


FINAL TEST ACCURACY: 73.15%

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.76      0.74      6990
           1       0.74      0.71      0.72      6978

    accuracy                           0.73     13968
   macro avg       0.73      0.73      0.73     13968
weighted avg       0.73      0.73      0.73     13968



In [5]:
# 4. Save Model Artifact
# Writes the trained booster (JSON) and the fitted feature scaler (pickle)
# into the models directory.
models_dir = "../models"
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, "xgboost_champion.json")
model.save_model(model_path)
print(f"Model saved to: {model_path}")

# The model was trained on standardized features, so the fitted scaler must be
# persisted alongside it; without it, raw inputs cannot be transformed the same
# way at inference time.
# NOTE(review): only unpickle this file from a trusted location.
scaler_path = os.path.join(models_dir, "scaler.pkl")
with open(scaler_path, "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Scaler saved to: {scaler_path}")

Model saved to: ../models/xgboost_champion.json
