In [None]:
# Imports, project paths, and data loading
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from sklearn.metrics import ndcg_score
import xgboost as xgb
import joblib

# Project root is taken to be the parent of the current working directory.
ROOT = Path.cwd().parents[0]
# Input CSVs and output artefacts share the same "models" folder.
DATA = ROOT / "models"
OUT = ROOT / "models"
OUT.mkdir(parents=True, exist_ok=True)

# Read the train split first, then the test split.
train, test = (
    pd.read_csv(DATA / csv_name)
    for csv_name in ("train_program.csv", "test_program.csv")
)

print(f"Dataset sizes: train={len(train)}, test={len(test)}")

Dataset sizes: train=233158, test=99592


In [None]:
# Feature engineering: multi-hot encoding of interests and field_tags
def to_list(s: str):
    """Split a ';'-separated cell into a list of normalised tokens.

    Each token is stripped of surrounding whitespace and lower-cased;
    empty tokens (e.g. from ';;' or a trailing ';') are dropped.

    Args:
        s: Raw cell value. Normally a string; non-string values are
            converted with ``str()`` before splitting.

    Returns:
        A list of cleaned tokens. Missing values (``None`` or a float NaN,
        which is what ``pd.read_csv`` yields for empty cells) produce an
        empty list instead of a spurious ``"nan"`` / ``"none"`` token.
    """
    # Without this guard str(NaN) == "nan" would be encoded as a real label.
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return []
    return [x.strip().lower() for x in str(s).split(";") if x.strip()]

# One binarizer per multi-valued column; both emit sparse matrices.
mlb_int = MultiLabelBinarizer(sparse_output=True)
mlb_tag = MultiLabelBinarizer(sparse_output=True)

# Regression targets for each split.
y_train = train["label_match"].values
y_test  = test["label_match"].values

# The vocabularies are learned from the training split only ...
X_train = hstack(
    [
        mlb_int.fit_transform(train["interests"].map(to_list)),
        mlb_tag.fit_transform(train["field_tags"].map(to_list)),
    ],
    format="csr",
)

# ... and then re-used to encode the test split with the same column layout.
X_test = hstack(
    [
        binarizer.transform(test[column].map(to_list))
        for binarizer, column in ((mlb_int, "interests"), (mlb_tag, "field_tags"))
    ],
    format="csr",
)

print("size", X_train.shape, X_test.shape)

XGBoost feature shapes: (233158, 48) (99592, 48)


In [None]:
# XGBoost parameters
params = dict(
    objective="reg:squarederror",   # squared-error regression on label_match
    eval_metric="rmse",             # metric reported for each eval set
    max_depth=10,
    eta=0.1,                        # learning rate
    subsample=0.8,                  # row sampling ratio per tree
    colsample_bytree=0.8,           # column sampling ratio per tree
    nthread=-1,
    seed=42,                        # fixed seed for reproducible sampling
)

Training XGBoostRegressor ...
[0]	train-rmse:0.08830	test-rmse:0.08980
[50]	train-rmse:0.03639	test-rmse:0.03830
[100]	train-rmse:0.02519	test-rmse:0.02736
[150]	train-rmse:0.02049	test-rmse:0.02296
[200]	train-rmse:0.01799	test-rmse:0.02073
[250]	train-rmse:0.01634	test-rmse:0.01931
[299]	train-rmse:0.01516	test-rmse:0.01831


In [None]:
# Wrap the sparse feature matrices and their targets in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

print("Training XGBoostRegressor")
# Both splits are registered as eval sets so their metric is logged every
# 50 rounds; no early stopping is configured, so all 300 rounds are trained.
evals = [(dtrain, "train"), (dtest, "test")]
# The first argument must be the hyperparameter dict `params`; the builtin
# `max` was passed here by mistake, so the configured settings were not used.
model = xgb.train(params, dtrain, num_boost_round=300, evals=evals, verbose_eval=50)

# Predicted label_match score for every row of the test DMatrix.
test_pred = model.predict(dtest)


In [20]:

# Destinations for the three artefacts written by this cell.
model_json_path = OUT / "xgb_program_labelmatch_regressor.json"
bundle_path = OUT / "xgb_program_labelmatch_regressor.pkl"
pred_csv_path = OUT / "xgb_program_test.csv"

# 1) Booster on its own, in XGBoost's JSON format.
model.save_model(model_json_path)
print(f"Model saved: {model_json_path}")

# 2) Booster together with the fitted binarizers needed to encode new rows.
bundle = dict(model=model, mlb_int=mlb_int, mlb_tag=mlb_tag)
joblib.dump(bundle, bundle_path)
print(f"Bundle saved: {bundle_path}")

# 3) Test rows with the prediction (rounded to 4 decimals) appended as a column.
out_df = test.assign(pred_label_match=np.round(test_pred, 4))
out_df.to_csv(pred_csv_path, index=False)
print("Predicted results saved xgb_program_test.csv")


Model saved: /Users/zyh/Desktop/HDG_group/models/xgb_program_labelmatch_regressor.json
Bundle saved: /Users/zyh/Desktop/HDG_group/models/xgb_program_labelmatch_regressor.pkl
Predicted results saved xgb_program_test.csv


In [21]:

# Pair the true relevance with the model score for every test row.
val_df = test[["student_id", "label_match"]].assign(
    pred_label_match=np.asarray(test_pred, dtype=float)
)

# Compute nDCG@3 separately for each student's rows.
# NOTE(review): ndcg_score may reject a group that contains a single row —
# confirm every student has at least two rows in the test split.
rows = []
for sid, g in val_df.groupby("student_id", sort=True):
    # ndcg_score expects 2-D input of shape (n_queries, n_documents).
    y_true = g["label_match"].to_numpy()[np.newaxis, :]
    y_pred = g["pred_label_match"].to_numpy()[np.newaxis, :]
    score = ndcg_score(y_true, y_pred, k=3)
    rows.append({"student_id": sid, "nDCG@3": float(score)})

ndcg_XGB = (
    pd.DataFrame(rows)
    .sort_values("student_id")
    .reset_index(drop=True)
)
mean_ndcg_XGB = float(ndcg_XGB["nDCG@3"].mean())

display(ndcg_XGB)
print("Mean nDCG@3", round(mean_ndcg_XGB, 6))

Unnamed: 0,student_id,nDCG@3
0,10,0.986383
1,1001,0.994730
2,1004,1.000000
3,1006,0.959101
4,1008,0.962909
...,...,...
1495,990,0.993832
1496,991,0.989798
1497,992,0.978485
1498,993,0.997970


Mean nDCG@3 0.982534
