# 02 - Model Training

This notebook handles model training using XGBoost to predict disease risk from patient data.


## Imports

In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn later in this notebook.
sns.set(style='whitegrid')


## Load Preprocessed Data

In [None]:
# Load the feature matrix and the target labels from CSV.
X = pd.read_csv('../data/features.csv')

# Collapse the single-column frame into a Series along the column axis only.
# A bare squeeze() also collapses the row axis, so a one-row labels file
# would come back as a scalar instead of a Series.
y = pd.read_csv('../data/labels.csv').squeeze("columns")

# Fail fast with a readable message if the two files are out of sync,
# rather than failing later inside the train/test split.
assert len(X) == len(y), (
    f"features ({len(X)} rows) and labels ({len(y)} rows) are misaligned"
)

print(X.shape, y.shape)


## Split the Data

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## Train the Model

In [None]:
# Gradient-boosted tree classifier with library-default hyperparameters,
# evaluated on log loss.
#
# `use_label_encoder` is intentionally not passed: it is deprecated since
# XGBoost 1.6 and no longer a recognised parameter in 2.x, where supplying it
# only produces an unused-parameter warning.
# NOTE(review): if this project is pinned to XGBoost < 1.6, restore
# `use_label_encoder=False` to silence the old deprecation warning.
#
# random_state is set explicitly (same seed as the train/test split) so the
# fitted model is reproducible even if sampling options are enabled later.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)


## Evaluate the Model

In [None]:
# Predict class labels for the held-out rows.
y_pred = model.predict(X_test)

# Print the heading on its own line. Passing the report as a second print()
# argument inserts the separator space right after "\n", which shifts the
# report's header row one column out of alignment with the rows below it.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


## Visualize Confusion Matrix

In [None]:
# Count matrix of actual (rows) versus predicted (columns) classes.
cm = confusion_matrix(y_test, y_pred)

# The same display names label both axes.
# NOTE(review): assumes a binary target whose sorted class order is
# (no disease, disease) - confirm against the label encoding.
class_names = ['No Disease', 'Disease']

# Draw on an explicit Axes instead of relying on pyplot's implicit current axes.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


## Save the model

In [None]:
# Persist the fitted classifier to disk.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout without `models/`
# fails with FileNotFoundError after the model has already been trained.
model_path = Path('../models/xgboost_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)


✅ **Next step:** Proceed to `03_model_explainability.ipynb` to visualize and interpret the model predictions using SHAP.
