In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error
from sklearn.metrics import ndcg_score
import xgboost as xgb
import joblib

# The project root is taken to be the parent of the current working directory.
ROOT = Path.cwd().parents[0]

# NOTE(review): the input CSVs are read from the same "models" folder that
# receives the outputs below — confirm this is the intended data location.
DATA = ROOT.joinpath("models")
OUT = ROOT.joinpath("models")
OUT.mkdir(parents=True, exist_ok=True)

# Load the two splits; both are plain CSV files inside DATA.
train = pd.read_csv(DATA.joinpath("train.csv"))
test = pd.read_csv(DATA.joinpath("test.csv"))

print(f"Dataset sizes: train={len(train)}, test={len(test)}")

Dataset sizes: train=664189, test=1311


In [2]:
# Columns every split must provide before features can be built.
required = {"student_id", "program_id", "interests", "field_tags", "label_match"}

for name, df in (("train", train), ("test", test)):
    # Fail fast and name the split that is missing columns.
    missing = required.difference(df.columns)
    if missing:
        raise ValueError(f"[{name}] missing columns: {missing}")

    # Identifiers are handled as strings from here on.
    for id_col in ("student_id", "program_id"):
        df[id_col] = df[id_col].astype(str)

    # Missing tag strings become "" so they later split into an empty tag list.
    for text_col in ("interests", "field_tags"):
        df[text_col] = df[text_col].fillna("")

    # Labels that cannot be parsed as numbers are coerced to NaN, then set to 0.0.
    labels = pd.to_numeric(df["label_match"], errors="coerce")
    df["label_match"] = labels.fillna(0.0)

In [3]:
def to_list(s):
    """Split a ';'-separated tag string into a list of normalised tags.

    Each piece is stripped of surrounding whitespace and lower-cased; pieces
    that are empty after stripping are dropped.

    Args:
        s: The raw tag string. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as "no tags". Any other non-string value
            is converted with ``str()`` before splitting.

    Returns:
        A list of tag strings, in their original order (possibly empty).
    """
    # Without this guard str(None) / str(nan) would yield the bogus tags
    # "none" / "nan". NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return []
    return [x.strip().lower() for x in str(s).split(";") if x.strip()]

# One binarizer per multi-valued text column; both emit sparse matrices.
mlb_int = MultiLabelBinarizer(sparse_output=True)
mlb_tag = MultiLabelBinarizer(sparse_output=True)


def _encode_split(frame, fit):
    """Build the CSR feature matrix (interest columns, then tag columns) for one split.

    With ``fit=True`` the two binarizers learn their vocabulary from ``frame``;
    otherwise the already-fitted binarizers are only applied.
    """
    encode_int = mlb_int.fit_transform if fit else mlb_int.transform
    encode_tag = mlb_tag.fit_transform if fit else mlb_tag.transform
    blocks = [
        encode_int(frame["interests"].map(to_list)),
        encode_tag(frame["field_tags"].map(to_list)),
    ]
    return hstack(blocks, format="csr")


# The vocabulary is learned on the training split only and reused for test.
X_train = _encode_split(train, fit=True)
X_test = _encode_split(test, fit=False)

# Regression targets.
y_train = train["label_match"].values
y_test = test["label_match"].values

print("XGBoost feature shapes:", X_train.shape, X_test.shape)

XGBoost feature shapes: (664189, 48) (1311, 48)


In [4]:
# Booster configuration: squared-error regression, reporting RMSE.
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "max_depth": 10,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "nthread": -1,
    "seed": 42
}

# Wrap the sparse feature matrices and their targets for XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Both splits are only monitored during boosting; no early stopping is
# requested, so every one of the 300 rounds is run.
evals = [(dtrain, "train"), (dtest, "test")]

print("Training XGBoostRegressor ...")
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=evals,
    verbose_eval=50,  # log the eval metrics every 50 rounds
)


Training XGBoostRegressor ...
[0]	train-rmse:0.08875	test-rmse:0.08830
[50]	train-rmse:0.03714	test-rmse:0.03821
[100]	train-rmse:0.02565	test-rmse:0.02755
[150]	train-rmse:0.02117	test-rmse:0.02367
[200]	train-rmse:0.01875	test-rmse:0.02124
[250]	train-rmse:0.01711	test-rmse:0.01967
[299]	train-rmse:0.01596	test-rmse:0.01862


In [5]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The RMSE as a built-in ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

# Score the test DMatrix with the trained booster and report its RMSE.
test_pred = model.predict(dtest)
test_rmse = rmse(y_true=y_test, y_pred=test_pred)
print(f"[TEST] RMSE = {test_rmse:.4f}")

[TEST] RMSE = 0.0186


In [6]:
# Persist the trained booster (pickle via joblib, as before).
model_path = OUT / "xgb_labelmatch_regressor.pkl"
joblib.dump(model, model_path)
print(f"Model saved: {model_path}")

# Also persist the fitted binarizers: they define which feature column each
# interest / field tag maps to, so the booster cannot score new rows without them.
joblib.dump(
    {"mlb_int": mlb_int, "mlb_tag": mlb_tag},
    OUT / "xgb_labelmatch_encoders.pkl",
)

# Pickles are tied to the library versions that wrote them; XGBoost's native
# JSON format is the documented way to keep a model loadable across versions.
model.save_model(str(OUT / "xgb_labelmatch_regressor.json"))

# Export the test rows together with their (rounded) predictions.
out_df = test.copy()
out_df["pred_label_match"] = np.round(test_pred, 4)
out_df.to_csv(OUT / "xgb_test.csv", index=False)
print("Predicted results saved (xgb_test.csv).")

Model saved: /Users/zyh/Desktop/HDG_group/models/xgb_labelmatch_regressor.pkl
Predicted results saved (xgb_test.csv).


In [7]:

# Rank cut-off used for the per-student nDCG.
NDCG_K = 3
metric_col = f"nDCG@{NDCG_K}"

# Pair every test row's true label with its prediction (positional alignment
# with `test`, which is the frame the predictions were computed from).
val_df = test[["student_id", "label_match"]].copy()
val_df["pred_label_match"] = np.asarray(test_pred, dtype=float)

rows = []
for sid, g in val_df.groupby("student_id", sort=True):
    if len(g) < 2:
        # A ranking metric is undefined for a single candidate, and recent
        # scikit-learn versions raise ValueError in that case. Record NaN so
        # the student still appears in the table but is skipped by the mean.
        score = float("nan")
    else:
        # ndcg_score expects 2-D input: one row per query (here: one student).
        y_true = g["label_match"].to_numpy().reshape(1, -1)
        y_pred = g["pred_label_match"].to_numpy().reshape(1, -1)
        score = float(ndcg_score(y_true, y_pred, k=NDCG_K))
    rows.append({"student_id": sid, metric_col: score})

# Explicit columns keep the frame well-formed (and sortable) even when there
# are no groups at all.
ndcg_XGB = (
    pd.DataFrame(rows, columns=["student_id", metric_col])
    .sort_values("student_id")
    .reset_index(drop=True)
)
# Series.mean() skips NaN by default, so single-row students do not affect it.
mean_ndcg_XGB = float(ndcg_XGB[metric_col].mean())

display(ndcg_XGB)
print(f"Mean {metric_col}", round(mean_ndcg_XGB, 6))

Unnamed: 0,student_id,nDCG@3
0,1043,0.973822
1,1245,0.974242
2,1340,0.995441
3,1680,0.942864
4,1745,0.98805
5,209,1.0
6,2134,0.994696
7,294,0.932086
8,3805,0.990734
9,4645,1.0


Mean nDCG@3 0.979193
