# Predicting Student Exam Scores  
### Kaggle Playground Series S6E1

The goal is to predict students' exam scores using tabular data. 

Models used:
- XGBoost (tree-based gradient boosting)
- TabM (deep learning model for tabular data)

In [1]:
# Core libraries: data handling, numerics, plotting, and typing helpers
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from typing import List

# Load the competition training and test datasets (Playground Series S6E1)
train = pd.read_csv("/kaggle/input/playground-series-s6e1/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s6e1/test.csv")

In [None]:
# Load the original exam-score dataset (a separate Kaggle dataset, not the
# competition files). It is appended to the training rows of every fold in
# the cross-validation loop further down.
orig = pd.read_csv("/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv")
print("Original data shape:", orig.shape )
orig.head(1)

In [None]:
# Inspect the size and the first rows of the competition training set
print("Train Shape: ", train.shape)
train.head()

In [None]:
def _style_count_axis(ax, col, color):
    """Label the primary (left) axis of a panel as the count axis for ``col``."""
    ax.set_xlabel(col)
    ax.set_ylabel("Count", color=color)
    ax.tick_params(axis="y", colors=color)


def _add_mean_line(ax, xs, means, target_col, color):
    """Overlay the per-group target mean on a twin y-axis and return that axis."""
    ax2 = ax.twinx()
    ax2.plot(xs, means, marker="o", color=color)
    ax2.set_ylabel(f"Mean {target_col}", color=color)
    ax2.tick_params(axis="y", colors=color)
    return ax2


def plot_features_dual_axis(
    df: pd.DataFrame,
    cols: List[str],
    target_col: str,
    n_bins: int = 10,
    n_wide: int = 3,
    figsize_per_plot: tuple[float, float] = (5, 4),
    int_as_cat_unique_max: int | None = 20,
    cat_order: dict[str, list] | None = None,
):
    """Plot each feature's distribution (bars) against the mean target (line).

    One panel is drawn per entry of ``cols`` on a grid ``n_wide`` panels wide.
    The left axis shows row counts, the right (twin) axis shows the mean of
    ``target_col`` for the same groups. Rows with a missing feature or target
    value are dropped per feature before plotting.

    How a column is grouped depends on its content:

    * object / string columns: one bar per category;
    * numeric columns with at most ``int_as_cat_unique_max`` distinct values:
      one evenly spaced bar per distinct value;
    * numeric columns with fewer than ``n_bins`` distinct values: one bar per
      value, positioned at the value itself;
    * every other numeric column: ``n_bins`` equal-width histogram bins.

    Args:
        df: Data containing ``cols`` and ``target_col``.
        cols: Feature columns to plot, in panel order. Nothing is drawn when
            this is empty.
        target_col: Numeric column whose per-group mean is drawn as the line.
        n_bins: Number of histogram bins; also caps the number of tick labels
            shown in the per-value layout.
        n_wide: Number of panels per grid row.
        figsize_per_plot: ``(width, height)`` of a single panel.
        int_as_cat_unique_max: Distinct-value threshold for the evenly spaced
            per-value layout; ``None`` disables that layout.
        cat_order: Optional ``{column: [category, ...]}`` display order for
            categorical columns. Categories not listed are appended after the
            listed ones; columns without an entry are sorted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    BAR_COLOR = "tab:blue"
    LINE_COLOR = "tab:orange"

    def is_categorical(s: pd.Series) -> bool:
        return s.dtype == "object" or pd.api.types.is_string_dtype(s)

    n_cols = len(cols)
    if n_cols == 0:
        # Nothing to draw, and a grid with zero rows cannot be created.
        return

    n_rows = int(np.ceil(n_cols / n_wide))

    fig, axes = plt.subplots(
        n_rows,
        n_wide,
        figsize=(figsize_per_plot[0] * n_wide, figsize_per_plot[1] * n_rows),
    )

    # plt.subplots returns a scalar, 1-D or 2-D result depending on the grid;
    # flatten it so panels can be addressed in reading order.
    axes = np.atleast_1d(axes).ravel()

    for ax1, col in zip(axes, cols):
        # The column name is rendered in bold mathtext, where "_" means subscript.
        col_safe = col.replace("_", r"\_")

        n_nan = df[col].isna().sum()
        n_unique = df[col].nunique(dropna=True)

        tmp = df[[col, target_col]].dropna()
        if tmp.empty:
            ax1.set_title(
                rf"$\bf{{{col_safe}}}$"
                f"\n(empty after dropna; {n_unique} unique, {n_nan} nan)"
            )
            continue

        x = tmp[col]
        y = tmp[target_col]

        # True Categorical
        if is_categorical(x):
            type_str = "categorical"

            counts = x.value_counts()
            mean_y = tmp.groupby(col)[target_col].mean()

            if cat_order is not None and col in cat_order:
                desired = list(cat_order[col])
                ordered = [c for c in desired if c in counts.index]
                remaining = [c for c in counts.index if c not in ordered]
                final_order = ordered + remaining
            else:
                final_order = sorted(counts.index)

            counts = counts.loc[final_order]
            mean_y = mean_y.loc[final_order]

            xpos = np.arange(len(final_order))

            ax1.bar(xpos, counts.values, alpha=0.6, color=BAR_COLOR)
            _style_count_axis(ax1, col, BAR_COLOR)
            ax1.set_xticks(xpos)
            ax1.set_xticklabels(final_order, rotation=45, ha='right')

            _add_mean_line(ax1, xpos, mean_y.values, target_col, LINE_COLOR)

            ax1.set_title(
                rf"$\bf{{{col_safe}}}$: Count vs Mean {target_col}"
                f"\n({type_str} with {n_unique} unique and {n_nan} nan)"
            )
            continue

        type_str = "numeric"

        xvals = x.values
        yvals = y.values

        # Drop +/-inf, which dropna() keeps.
        mask = np.isfinite(xvals) & np.isfinite(yvals)
        xvals = xvals[mask]
        yvals = yvals[mask]

        if len(xvals) == 0:
            ax1.set_title(
                rf"$\bf{{{col_safe}}}$"
                f"\n({type_str} with {n_unique} unique and {n_nan} nan)"
            )
            continue

        # np.unique already returns sorted values; the counts come for free.
        unique_vals, value_counts = np.unique(xvals, return_counts=True)
        n_unique_eff = len(unique_vals)

        # --- Case 1: int-as-categorical ---
        if int_as_cat_unique_max is not None and n_unique_eff <= int_as_cat_unique_max:
            mean_y = np.array([yvals[xvals == v].mean() for v in unique_vals])

            xpos = np.arange(n_unique_eff)

            ax1.bar(xpos, value_counts, alpha=0.6, color=BAR_COLOR)
            _style_count_axis(ax1, col, BAR_COLOR)

            # Thin the tick labels so at most about n_bins of them are shown.
            rotate = 45 if n_unique_eff > n_bins else 0
            step = max(int(np.ceil(n_unique_eff / n_bins)), 1)
            tick_idx = np.arange(0, n_unique_eff, step)

            if pd.api.types.is_integer_dtype(x):
                tick_labels = unique_vals[tick_idx].astype(int)
            else:
                tick_labels = unique_vals[tick_idx]

            ax1.set_xticks(tick_idx)
            ax1.set_xticklabels(
                tick_labels,
                rotation=rotate,
                ha="right" if rotate else "center",
            )

            _add_mean_line(ax1, xpos, mean_y, target_col, LINE_COLOR)

            ax1.set_title(
                rf"$\bf{{{col_safe}}}$: Per-Value Count vs Mean {target_col}"
                f"\n({type_str} with {n_unique} unique and {n_nan} nan)"
            )

        # --- Case 2: low-cardinality numeric bins ---
        elif n_unique_eff < n_bins:
            mean_y = np.array([yvals[xvals == v].mean() for v in unique_vals])

            # Bars sit at the real values, so size them from the smallest gap.
            width = 0.8 * (np.min(np.diff(unique_vals)) if n_unique_eff > 1 else 1.0)

            ax1.bar(unique_vals, value_counts, width=width, alpha=0.6, color=BAR_COLOR)
            _style_count_axis(ax1, col, BAR_COLOR)

            _add_mean_line(ax1, unique_vals, mean_y, target_col, LINE_COLOR)

            ax1.set_title(
                rf"$\bf{{{col_safe}}}$: Per-Value Count vs Mean {target_col}"
                f"\n({type_str} with {n_unique} unique and {n_nan} nan)"
            )

        # --- Case 3: regular histogram ---
        else:
            bins = np.linspace(xvals.min(), xvals.max(), n_bins + 1)
            bin_centers = 0.5 * (bins[:-1] + bins[1:])

            counts, _ = np.histogram(xvals, bins=bins)

            # np.digitize treats every bin as half-open, so rows equal to the
            # maximum would get index n_bins and drop out of the means, while
            # np.histogram counts them in the (closed) last bin. Clip so the
            # line and the bars describe the same rows.
            bin_idx = np.clip(np.digitize(xvals, bins) - 1, 0, n_bins - 1)

            mean_y = np.full(n_bins, np.nan)
            for j in range(n_bins):
                in_bin = bin_idx == j
                if in_bin.any():
                    mean_y[j] = yvals[in_bin].mean()

            ax1.bar(
                bin_centers,
                counts,
                width=(bins[1] - bins[0]),
                alpha=0.6,
                color=BAR_COLOR,
            )
            _style_count_axis(ax1, col, BAR_COLOR)

            _add_mean_line(ax1, bin_centers, mean_y, target_col, LINE_COLOR)

            ax1.set_title(
                rf"$\bf{{{col_safe}}}$: Histogram vs Mean {target_col}"
                f"\n({type_str} with {n_unique} unique and {n_nan} nan)"
            )

    # Hide the grid cells left over when n_cols is not a multiple of n_wide.
    for unused_ax in axes[n_cols:]:
        unused_ax.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
# Candidate feature columns: every column except the first and the last one
# (presumably the row id and the target -- confirm against train.head()).
FEATURES = list(train.columns[1:-1])
print(f"There are {len(FEATURES)} features")
print(FEATURES)

In [None]:
# Display order for the ordinal categoricals, so their bars follow the listed
# ranking instead of the alphabetical order used for other categorical columns.
ordinal_order = {"sleep_quality":["poor","average","good"],
    "facility_rating":["low","medium","high"],
    "exam_difficulty":["easy","moderate","hard"],}

# One panel per feature: row counts (bars) against mean exam_score (line).
plot_features_dual_axis(
    train,
    cols=FEATURES,
    target_col="exam_score",
    n_bins=10,
    n_wide=3,
    int_as_cat_unique_max=20,
    cat_order = ordinal_order,
)

In [None]:
# !pip install pytabkit

# XGBoost with Pseudo Labels

In [None]:
# Import cross-validation, metric, and model libraries (scikit-learn, XGBoost, PyTorch, pytabkit)
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from xgboost import plot_importance

import torch
from pytabkit import TabM_D_Regressor

In [None]:
# Report the installed library versions for reproducibility
import xgboost as xgb
print(f"XGBoost version: {xgb.__version__}")

import importlib.metadata
# TabM_D_Regressor is imported from pytabkit, so the pytabkit package version is what gets reported here
print(f"TabM version: {importlib.metadata.version('pytabkit')}")

In [None]:
# Peek at a single training row before feature engineering
train.head(1)

In [None]:
# Column configuration used by all following cells.
# TARGET: column to predict; CATS: string-valued columns that get integer-encoded below.
TARGET = "exam_score"
CATS = ["gender","course","internet_access","study_method","sleep_quality","facility_rating","exam_difficulty"]
# Explicit feature list; replaces the FEATURES derived from train.columns earlier in the notebook.
FEATURES = ['age', 'gender', 'course', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']
print(f"There are {len(FEATURES)} features.")

In [None]:
def formula(df):
    """Return a hand-crafted linear score built from study and lifestyle columns.

    The score is a weighted sum of ``study_hours``, ``class_attendance`` and
    ``sleep_hours`` plus a fixed bonus or penalty for specific levels of
    ``sleep_quality``, ``study_method`` and ``facility_rating``. The categorical
    columns are compared against their string labels, so ``df`` must still
    hold the raw (un-encoded) values.
    """
    score = 6 * df.study_hours + 0.35 * df.class_attendance + 1.5 * df.sleep_hours

    # (column, level, weight) adjustments, applied in this exact order so the
    # floating-point result does not depend on how the terms are grouped.
    level_weights = (
        ("sleep_quality", "good", 5),
        ("sleep_quality", "poor", -5),
        ("study_method", "coaching", 10),
        ("study_method", "mixed", 5),
        ("study_method", "group study", 2),
        ("study_method", "online videos", 1),
        ("facility_rating", "high", 4),
        ("facility_rating", "low", -4),
    )
    for column, level, weight in level_weights:
        score = score + weight * (df[column] == level)

    return score

# Add the hand-crafted score as a feature on every dataset.
# This has to run before the categorical columns are integer-encoded, because
# formula() compares them against their raw string labels.
for df in [train, test, orig]:
    df['formula'] = formula(df)

In [None]:
# Integer codes for every categorical column (one mapping per entry of CATS).
ordinal_maps = {
    "gender" : {"male":0, "female":1, "other":2},
    "internet_access" : {"no":0, "yes":1},
    "sleep_quality" : {"poor":0, "average":1, "good":2},
    "facility_rating" : {"low":0, "medium":1, "high":2},
    "exam_difficulty" : {"easy":0, "moderate":1, "hard":2},
    "course" : {"ba":0, "b.sc":1, "diploma":2, "b.tech":3, "b.com":4, "bca":5, "bba":6},
    "study_method" : {"self-study":0, "online videos":1, "group study":2, "mixed":3, "coaching":4},
}

# Encode the categorical columns in place on all three datasets.
# Labels missing from the mapping (and NaN) become -1.
# NOTE: the columns are overwritten, so re-running this cell maps the already
# encoded integers again and turns every value into -1.
for df in [train, test, orig]:
    for c in CATS:
        df[c] = df[c].map(ordinal_maps[c]).fillna(-1).astype('int32')

In [None]:
# Build a label-encoded "CAT_<feature>" twin of every feature. The codes are
# factorized over train + test + orig together, so a given value receives the
# same code in all three datasets.
NEW_CATS = [f"CAT_{c}" for c in FEATURES]

n_train, n_test = len(train), len(test)
for c, n in zip(FEATURES, NEW_CATS):
    combine = pd.concat([train[c], test[c], orig[c]], axis=0)
    v = combine.factorize()[0].astype("int32")

    # Split the joint codes back into their source frames by position.
    train[n] = v[:n_train]
    test[n] = v[n_train:n_train + n_test]
    orig[n] = v[n_train + n_test:]

# Extend the feature list in place with the formula score and the new columns.
FEATURES.extend(['formula', *NEW_CATS])
print(f"Now there are {len(FEATURES)} features.")

In [None]:
# Hyperparameters for the two models used in the cross-validation loop.
# Both are configured with device="cuda", so a GPU runtime is assumed.

# XGBoost: passed as keyword arguments to XGBRegressor(**xgb_params).
# enable_categorical is set because the CAT_* columns are cast to the pandas
# "category" dtype before fitting.
xgb_params = {
    "n_estimators": 10_000,
    "learning_rate": 0.01,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.35,
    "min_child_weight": 5,
    "early_stopping_rounds": 100,
    "eval_metric": "rmse",
    "enable_categorical": True,
    "device": "cuda",
    "random_state": 42
}

# TabM: passed as keyword arguments to pytabkit's TabM_D_Regressor(**tabm_params).
tabm_params = {
    "verbosity": 0,
    "arch_type": "tabm-mini-normal",
    "tabm_k": 24, 
    "num_emb_type": "pwl", 
    "d_embedding": 8,
    "batch_size": 256,
    "lr": 1e-3, 
    "n_epochs": 100, 
    "dropout": 0.11, 
    "d_block": 256, 
    "n_blocks": 5, 
    "patience": 4, 
    "weight_decay": 1e-2, 
    "device": "cuda",
    "random_state": 42,
}

In [None]:
# Cross-validated training: TabM generates pseudo-labels, XGBoost is fit on them
import gc  # needed for the per-fold memory cleanup at the end of the loop

N_SPLITS = 10

X = train[FEATURES]
X_test = test[FEATURES]
X_orig = orig[FEATURES]

y = train[TARGET].values
y_orig = orig[TARGET].values

# Out-of-fold predictions for the training rows and fold-averaged test predictions.
oof_preds_xgb = np.zeros(len(train))
test_preds_xgb = np.zeros(len(test))

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state = 42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print("#" * 25)
    print(f"### Fold {fold}")
    print("#" * 25)

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Every fold also trains on the full original dataset.
    X_train = pd.concat([X_train, X_orig], axis=0)
    y_train = np.concatenate([y_train, y_orig], axis=0)

    # STAGE 1: TABM PRODUCES PSEUDO LABELS FOR THE VALIDATION AND TEST ROWS
    tabm_model = TabM_D_Regressor(**tabm_params)
    tabm_model.fit(X_train, y_train, X_val, y_val, cat_col_names=NEW_CATS)

    tabm_val_preds = tabm_model.predict(X_val)
    tabm_test_preds = tabm_model.predict(X_test)
    del tabm_model  # release TabM before XGBoost is trained

    # XGBoost sees the real labels for the training rows and TabM's
    # predictions as labels for the validation and test rows.
    # NOTE(review): the validation rows are therefore part of the XGBoost
    # training set (with pseudo labels) and are also its eval_set, and TabM
    # was fitted with (X_val, y_val) as its validation data, so the fold RMSE
    # below is not a fully independent hold-out estimate.
    X_train_xgb = pd.concat([X_train, X_val, X_test], axis=0)
    y_train_xgb = np.concatenate([y_train, tabm_val_preds, tabm_test_preds], axis=0)

    # Cast on the stacked frame so train/val/test share identical categories.
    for c in NEW_CATS:
        X_train_xgb[c] = X_train_xgb[c].astype('category')
    n_fit, n_val = len(X_train), len(X_val)
    X_val_xgb = X_train_xgb.iloc[n_fit:n_fit + n_val]
    X_test_xgb = X_train_xgb.iloc[n_fit + n_val:]

    # STAGE 2: TRAIN XGB WITH PSEUDO LABELS
    # `model` keeps the XGBoost regressor; after the loop it holds the last
    # fold's model, which the feature-importance cell relies on.
    model = XGBRegressor(**xgb_params)
    model.fit(
        X_train_xgb,
        y_train_xgb,
        eval_set=[(X_val_xgb, y_val)],
        verbose=200,
    )
    oof_preds = model.predict(X_val_xgb)
    oof_preds_xgb[val_idx] = oof_preds

    rmse_xgb = mean_squared_error(y_val, oof_preds)**0.5
    print(f"Fold {fold} RMSE XGB w/ Pseudo: {rmse_xgb:.5f}")

    # Average the test predictions over the folds.
    test_preds = model.predict(X_test_xgb)
    test_preds_xgb += test_preds/N_SPLITS

    # CLEAR MEMORY
    gc.collect()
    torch.cuda.empty_cache()

# OVERALL CV SCORE
print("-" * 40)
rmse_xgb = mean_squared_error(y, oof_preds_xgb)**0.5
print(f"OOF RMSE XGB w/ Pseudo:       {rmse_xgb:.5f}")
print()

In [None]:
# Plot gain-based feature importance of the XGBoost model.
# `model` is the regressor left over from the last cross-validation fold.
fig, ax = plt.subplots(figsize=(8,8))
plot_importance(
    model, ax=ax, importance_type="gain", max_num_features=40, height=0.5
)
ax.set_title("XGB Feature Importance")
fig.tight_layout()
plt.show()

In [None]:
# Recompute and print the overall out-of-fold RMSE of the XGBoost predictions
rmse_xgb = mean_squared_error(y, oof_preds_xgb)**0.5
print(f"OOF RMSE XGB w/ Pseudo:       {rmse_xgb:.5f}")


In [None]:
# Save the out-of-fold and test predictions to disk
# (np.save appends the ".npy" extension to these file names).
np.save("oof_xgb",oof_preds_xgb)
np.save("test_preds_xgb",test_preds_xgb)


# Create Submission CSV

In [None]:
# Build the submission file from the fold-averaged XGBoost test predictions.
sub = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")
# Predictions are clipped to [19.6, 100] -- presumably the observed range of
# exam_score in the training data; TODO confirm.
sub['exam_score'] = np.clip(test_preds_xgb,19.6,100)
sub.to_csv("submission.csv",index=False)
sub.head()


In [None]:
# Sanity check: distribution of the submitted (clipped) test predictions
plt.hist(sub['exam_score'],bins=100)
plt.title("XGB w/ Pseudo Test Preds")
plt.show()


## Final Notes

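- XGBoost is used as a strong tree-based baseline for tabular data.
- TabM is included to capture smoother non-linear feature interactions; in this notebook its predictions on the validation and test rows serve as pseudo-labels for training XGBoost.
- These models often complement each other well in ensembles.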

This notebook is structured to be understandable for:
- Kaggle participants
- Interviewers
- Anyone learning applied machine learning on tabular data
