In [2]:
# Day5_XGBoost_Baseline.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier, plot_importance
import matplotlib.pyplot as plt
import joblib

# ---------- Load Data ----------
data = pd.read_csv("../data/cleaned_resumes.csv")

print("Dataset shape:", data.shape)
print("Columns:", data.columns.tolist())

# ---------- Features & Labels ----------
X = data.drop(columns=["hire_label", "candidate_id"])
y = data["hire_label"]

# One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

print("Feature shape after encoding:", X.shape)

# ---------- Train-Test Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ---------- Train XGBoost ----------
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=1,
    use_label_encoder=False
)

xgb.fit(X_train, y_train)

# ---------- Predict on test ----------
y_pred = xgb.predict(X_test)

# ---------- Evaluation ----------
print("\n=== XGBoost Model Performance ===")
print("Accuracy :", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ---------- Feature Importance ----------
plt.figure(figsize=(10,6))
plot_importance(xgb, max_num_features=10, importance_type="weight")
plt.title("Top 10 Feature Importances (XGBoost)")
plt.show()

# ---------- Save Model ----------
joblib.dump(xgb, "../models/xgb_baseline.pkl")
print("Model saved at ../models/xgb_baseline.pkl")


Dataset shape: (2000, 10)
Columns: ['candidate_id', 'gender', 'ethnicity', 'education', 'years_experience', 'skills', 'label', 'skills_list', 'num_skills', 'skills_text']


KeyError: "['hire_label'] not found in axis"