In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error
from sklearn.metrics import ndcg_score
import xgboost as xgb
import joblib

# Project layout is resolved relative to the notebook's working directory:
# the parent of the current directory is used as the project root.
ROOT = Path.cwd().parents[0]
DATA = ROOT.joinpath("models")  # the input CSVs are read from this folder
OUT = ROOT.joinpath("models")   # artifacts are written to the same folder
OUT.mkdir(parents=True, exist_ok=True)

# Load the train split first, then the test split.
train, test = (
    pd.read_csv(DATA.joinpath(csv_name))
    for csv_name in ("train_core.csv", "test_core.csv")
)

print(f"Dataset sizes: train={len(train)}, test={len(test)}")

Dataset sizes: train=54890, test=110


In [2]:
# Columns every split must provide before feature building.
required = {"student_id", "interests", "core_program", "program_match"}

# Validate and normalise both splits in place (train/test are mutated).
for name, df in (("train", train), ("test", test)):
    missing = required.difference(df.columns)
    if missing:
        raise ValueError(f"[{name}] missing columns: {missing}")

    # Identifier-like columns are handled as strings.
    for col in ("student_id", "core_program"):
        df[col] = df[col].astype(str)

    # Missing interests become the empty string.
    df["interests"] = df["interests"].fillna("")

    # Targets that cannot be parsed as numbers are coerced to NaN and then
    # replaced by 0.0.
    numeric_match = pd.to_numeric(df["program_match"], errors="coerce")
    df["program_match"] = numeric_match.fillna(0.0)

In [3]:
def to_list(s: str) -> list:
    """Split a ';'-separated string into a list of clean, lower-cased tokens.

    Each token is stripped of surrounding whitespace and lower-cased; empty
    tokens (e.g. from ';;' or a trailing ';') are dropped.

    Parameters
    ----------
    s : str
        Raw field value. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as "no value".

    Returns
    -------
    list of str
        The tokens in their original order; ``[]`` for empty/missing input.
    """
    # Without this guard str(None) / str(nan) would yield the bogus tokens
    # "none" / "nan", which would then become one-hot feature columns.
    # `s != s` is only true for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return []
    return [x.strip().lower() for x in str(s).split(";") if x.strip()]

# One multi-hot encoder per text column, each producing sparse output.
mlb_int = MultiLabelBinarizer(sparse_output=True)
mlb_tag = MultiLabelBinarizer(sparse_output=True)

# Fit the encoders on the training split only ...
train_interest_block = mlb_int.fit_transform(train["interests"].map(to_list))
train_program_block = mlb_tag.fit_transform(train["core_program"].map(to_list))
X_train = hstack([train_interest_block, train_program_block], format="csr")

# ... and reuse the fitted encoders for the test split so both matrices
# share the same column layout.
test_interest_block = mlb_int.transform(test["interests"].map(to_list))
test_program_block = mlb_tag.transform(test["core_program"].map(to_list))
X_test = hstack([test_interest_block, test_program_block], format="csr")

# Regression targets.
y_train = train["program_match"].to_numpy()
y_test = test["program_match"].to_numpy()

print("XGBoost feature shapes:", X_train.shape, X_test.shape)

XGBoost feature shapes: (54890, 35) (110, 35)


In [4]:
# Wrap the sparse feature matrices and their targets in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

# Booster configuration: squared-error regression, monitored with RMSE.
# Row and column subsampling are both set to 0.8; the seed is fixed at 42.
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "max_depth": 10,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # NOTE(review): -1 presumably means "use all available threads" — confirm
    # against the installed XGBoost version.
    "nthread": -1,
    "seed": 42
}

print("Training XGBoostRegressor ...")
# Both splits are passed as watchlist entries so their RMSE is logged every
# 50 rounds. No early stopping is requested, so all 300 rounds are trained.
evals = [(dtrain, "train"), (dtest, "test")]
model = xgb.train(params, dtrain, num_boost_round=300, evals=evals, verbose_eval=50)


Training XGBoostRegressor ...
[0]	train-rmse:0.11777	test-rmse:0.12644
[50]	train-rmse:0.02316	test-rmse:0.02156
[100]	train-rmse:0.01406	test-rmse:0.01522
[150]	train-rmse:0.01075	test-rmse:0.01300
[200]	train-rmse:0.00895	test-rmse:0.01162
[250]	train-rmse:0.00770	test-rmse:0.01035
[299]	train-rmse:0.00683	test-rmse:0.00925


In [5]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true``.

    The result is the square root of scikit-learn's ``mean_squared_error``,
    converted to a plain Python ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

# Score the test DMatrix with the trained booster and report the RMSE of the
# predictions against the test targets (printed with four decimals).
test_pred = model.predict(dtest)
test_rmse = rmse(y_test, test_pred)
print(f"[TEST] RMSE = {test_rmse:.4f}")

[TEST] RMSE = 0.0092


In [6]:
# Persist the trained booster. The file name and format are unchanged so
# existing consumers of the pickle keep working.
model_path = OUT / "xgb_program_match_regressor.pkl"
joblib.dump(model, model_path)
print(f"Model saved: {model_path}")

# The model's input columns are defined by the fitted binarizers (their
# learned classes fix the column order). Without them the saved booster
# cannot be applied to new data, so they are stored next to the model.
encoders_path = OUT / "xgb_program_match_encoders.pkl"
joblib.dump({"interests": mlb_int, "core_program": mlb_tag}, encoders_path)
print(f"Encoders saved: {encoders_path}")

# Export the test rows together with their predictions (rounded to 4 decimals).
out_df = test.copy()
out_df["pred_program_match"] = np.round(test_pred, 4)
out_df.to_csv(OUT / "xgb_core_test.csv", index=False)
print("Predicted results saved (xgb_core_test.csv).")

Model saved: /Users/zyh/Desktop/HDG_group/models/xgb_program_match_regressor.pkl
Predicted results saved (xgb_core_test.csv).


In [7]:

# Per-student ranking quality: nDCG@3 of the predicted scores against the
# true program_match values, computed within each student's rows.
val_df = test[["student_id", "program_match"]].copy()
# test_pred is aligned with `test` row-by-row, so positional assignment is safe.
val_df["pred_program_match"] = np.asarray(test_pred, dtype=float)

rows = []
for sid, g in val_df.groupby("student_id", sort=True):
    if len(g) < 2:
        # A ranking metric is undefined for a single candidate, and recent
        # scikit-learn versions raise ValueError in that case. Record NaN so
        # the student is still listed but excluded from the mean below.
        rows.append({"student_id": sid, "nDCG@3": float("nan")})
        continue
    # ndcg_score expects 2-D input of shape (n_queries, n_documents).
    y_true = g["program_match"].to_numpy().reshape(1, -1)
    y_pred = g["pred_program_match"].to_numpy().reshape(1, -1)
    rows.append({"student_id": sid, "nDCG@3": float(ndcg_score(y_true, y_pred, k=3))})

# groupby(sort=True) already yields students in sorted order, so no further
# sort is needed. Explicit columns keep the frame well-formed if `rows` is empty.
ndcg_XGB = pd.DataFrame(rows, columns=["student_id", "nDCG@3"])
# Series.mean() skips NaN, so undefined groups do not affect the average.
mean_ndcg_XGB = float(ndcg_XGB["nDCG@3"].mean())

display(ndcg_XGB)
print("Mean nDCG@3", round(mean_ndcg_XGB, 6))

Unnamed: 0,student_id,nDCG@3
0,1285,0.983341
1,1508,1.0
2,229,0.990713
3,2297,1.0
4,2500,0.996954
5,2537,1.0
6,2826,1.0
7,327,1.0
8,3275,1.0
9,3940,1.0


Mean nDCG@3 0.997101
