# XGBoost Classification for Student Performance
This notebook predicts student performance based on gameplay event data using XGBoost.
Data Source: [Kaggle Competition](https://www.kaggle.com/competitions/predict-student-performance-from-game-play)

## 1. Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's "ggplot" style sheet so every figure below shares one look.
plt.style.use("ggplot")


## 2. Load and Inspect Data

In [None]:
# Read the raw train and test tables from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Sanity checks: table dimensions, a peek at the first rows, and the
# relative frequency of each value of the `correct` target.
print(f"Train shape: {train.shape}")
print(train.head())
target_share = train["correct"].value_counts(normalize=True)
print(target_share)


## 3. Feature Engineering

In [None]:
# Collapse the event rows into one feature row per session. Named aggregation
# produces the flat "<column>_<statistic>" names directly, so no MultiIndex
# flattening step is required afterwards.
agg_train = (
    train.groupby("session_id")
    .agg(
        elapsed_time_mean=("elapsed_time", "mean"),
        elapsed_time_max=("elapsed_time", "max"),
        elapsed_time_std=("elapsed_time", "std"),
        event_code_nunique=("event_code", "nunique"),
        level_group_nunique=("level_group", "nunique"),
    )
    .reset_index()
)

# Target per session: the first non-null `correct` value found in that session.
labels = train.groupby("session_id")["correct"].first().reset_index()

# Attach the target to the feature table (inner join on the session key).
agg_train = agg_train.merge(labels, on="session_id")

# No one-hot encoding step: level_group only survives as a per-session count of
# distinct values, so there is no categorical column left to encode.


## 4. Prepare Train/Test Data

In [None]:
# Separate the target from the predictors; session_id is a key, not a feature.
y = agg_train["correct"]
X = agg_train.drop(columns=["session_id", "correct"])

# Hold out 20% of the sessions for validation. Stratifying on y keeps the
# class proportions the same in both partitions; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


## 5. Train XGBoost Model

In [None]:
# Hyperparameters gathered in one place so they are easy to scan and tune.
xgb_params = {
    "n_estimators": 100,        # number of boosting rounds
    "learning_rate": 0.1,
    "max_depth": 5,
    "subsample": 0.8,           # fraction of rows sampled per tree
    "colsample_bytree": 0.8,    # fraction of columns sampled per tree
    # NOTE(review): use_label_encoder is deprecated in newer XGBoost releases;
    # confirm against the installed version before removing it.
    "use_label_encoder": False,
    "eval_metric": "logloss",
}

model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)


## 6. Evaluate Model

In [None]:
# Hard class labels drive the threshold-dependent metrics (accuracy, report).
y_pred = model.predict(X_val)

# ROC AUC measures how well the model *ranks* samples, so it must be computed
# from continuous scores. Passing hard 0/1 labels reduces the ROC curve to a
# single threshold and misreports the score.
# Column 1 of predict_proba is the probability of the larger class label;
# this assumes `correct` is a binary target -- TODO confirm.
y_proba = model.predict_proba(X_val)[:, 1]

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))
print("ROC AUC Score:", roc_auc_score(y_val, y_proba))

# Confusion matrix
ConfusionMatrixDisplay.from_estimator(model, X_val, y_val)
plt.title("Confusion Matrix")
plt.show()


## 7. Feature Importance

In [None]:
# plot_importance returns the matplotlib Axes it drew on; set the title on
# that Axes directly rather than through the implicit pyplot "current axes".
importance_ax = xgb.plot_importance(model)
importance_ax.set_title("Feature Importance")
plt.show()


## 8. Predict on Test Set (Optional Submission Format)

In [None]:
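# NOTE: Left commented out because the test-set aggregation is not implemented yet.
# Before uncommenting, build `test_agg` by applying to `test` the same per-session
# aggregation used for `agg_train`, so it has `session_id` plus the exact feature
# columns the model was trained on. Adapt this if the test set is structured differently.
# test_agg = <same session-level aggregation, applied to `test`>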
# test_preds = model.predict(test_agg.drop("session_id", axis=1))
# submission = pd.DataFrame({"session_id": test_agg["session_id"], "correct": test_preds})
# submission.to_csv("submission.csv", index=False)
