In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import joblib

# Project layout: the notebook is expected to run from a sub-folder of the
# project, so the project root is the parent of the current working directory.
ROOT = Path.cwd().parents[0]

# Input CSVs and output artifacts both live in the same "models" folder.
DATA = ROOT / "models"
OUT  = ROOT / "models"
OUT.mkdir(parents=True, exist_ok=True)

# Load the pre-split train / test tables.
train_csv = DATA / "train.csv"
test_csv  = DATA / "test.csv"
train = pd.read_csv(train_csv)
test  = pd.read_csv(test_csv)

print(f"Dataset sizes: train={len(train)}, test={len(test)}")

Dataset sizes: train=273049, test=68263


In [3]:
# Columns every split must provide before feature building can start.
required = {"student_id", "program_id", "interests", "field_tags", "label_match"}

# Validate and normalise both splits in place (the loop variable `df` is the
# very same object as `train` / `test`, so the assignments below stick).
for name, df in (("train", train), ("test", test)):
    missing = required.difference(df.columns)
    if missing:
        raise ValueError(f"[{name}] missing columns: {missing}")

    # Identifiers are treated as opaque strings.
    for id_col in ("student_id", "program_id"):
        df[id_col] = df[id_col].astype(str)

    # Missing tag strings become empty strings.
    for text_col in ("interests", "field_tags"):
        df[text_col] = df[text_col].fillna("")

    # NOTE(review): labels that are missing or not parseable as numbers are
    # replaced by 0.0 and therefore still used as targets — confirm intended.
    numeric_label = pd.to_numeric(df["label_match"], errors="coerce")
    df["label_match"] = numeric_label.fillna(0.0)

In [4]:
def to_list(s: str):
    """Split a ';'-separated tag string into clean, lower-cased tokens.

    Parameters
    ----------
    s : str
        Raw cell value such as ``"AI; Math"``. Non-string values are
        converted with ``str()`` before splitting. ``None`` and float NaN
        are treated as "no tags".

    Returns
    -------
    list of str
        Tokens in their original order, stripped of surrounding whitespace
        and lower-cased. Empty pieces (e.g. from ``";;"``) are dropped.
    """
    # Without this guard str(None) / str(nan) would yield the bogus tokens
    # "none" / "nan" and end up as phantom categories downstream.
    if s is None or (isinstance(s, float) and s != s):
        return []
    pieces = (piece.strip() for piece in str(s).split(";"))
    return [piece.lower() for piece in pieces if piece]

# One binarizer per multi-valued column; each emits a sparse indicator matrix.
mlb_int = MultiLabelBinarizer(sparse_output=True)
mlb_tag = MultiLabelBinarizer(sparse_output=True)

# Tokenise the ';'-separated strings once per split.
train_interest_tokens = train["interests"].map(to_list)
train_tag_tokens      = train["field_tags"].map(to_list)
test_interest_tokens  = test["interests"].map(to_list)
test_tag_tokens       = test["field_tags"].map(to_list)

# The vocabularies are learned from the training split only ...
train_interest_block = mlb_int.fit_transform(train_interest_tokens)
train_tag_block      = mlb_tag.fit_transform(train_tag_tokens)
X_train = hstack([train_interest_block, train_tag_block], format="csr")

# ... and the test split is encoded with those already-fitted vocabularies.
test_interest_block = mlb_int.transform(test_interest_tokens)
test_tag_block      = mlb_tag.transform(test_tag_tokens)
X_test = hstack([test_interest_block, test_tag_block], format="csr")

# Regression targets.
y_train = train["label_match"].values
y_test  = test["label_match"].values

print("XGBoost feature shapes:", X_train.shape, X_test.shape)

XGBoost feature shapes: (273049, 48) (68263, 48)


In [5]:
# Booster hyper-parameters: squared-error regression, monitored with RMSE.
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    max_depth=10,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=-1,
    seed=42,
)

# Wrap the sparse feature matrices and targets in XGBoost's own container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

print("Training XGBoostRegressor ...")

# Both splits are only watched during training (RMSE reported every 50
# rounds); no early stopping is configured, so all 300 rounds are run.
evals = [(dtrain, "train"), (dtest, "test")]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=evals,
    verbose_eval=50,
)


Training XGBoostRegressor ...
[0]	train-rmse:0.08790	test-rmse:0.08755
[50]	train-rmse:0.03597	test-rmse:0.03637
[100]	train-rmse:0.02492	test-rmse:0.02564
[150]	train-rmse:0.02016	test-rmse:0.02119
[200]	train-rmse:0.01764	test-rmse:0.01893
[250]	train-rmse:0.01600	test-rmse:0.01751
[299]	train-rmse:0.01481	test-rmse:0.01646


In [6]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error of `y_pred` against `y_true` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

# Score the held-out split with the trained booster ...
test_pred = model.predict(dtest)

# ... and report the error against the true labels.
test_rmse = rmse(y_true=y_test, y_pred=test_pred)
print(f"[TEST] RMSE = {test_rmse:.4f}")

[TEST] RMSE = 0.0165


In [7]:
# --- Persist the trained model -------------------------------------------
# The joblib pickle is kept under its original file name so existing loaders
# continue to work.
model_path = OUT / "xgb_labelmatch_regressor.pkl"
joblib.dump(model, model_path)

# A pickled Booster is tied to the library versions that wrote it. XGBoost's
# native format is the documented way to store a model for later reuse, so a
# copy is written in that format as well.
model.save_model(str(OUT / "xgb_labelmatch_regressor.json"))

# The booster only understands the binarized feature columns. Store the
# fitted binarizers next to it so new data can be encoded the same way
# without re-fitting on the training CSV.
joblib.dump(
    {"interests": mlb_int, "field_tags": mlb_tag},
    OUT / "xgb_labelmatch_encoders.pkl",
)
print(f"Model saved: {model_path}")

# --- Persist the test-set predictions ------------------------------------
# Copy so the in-memory `test` frame is not modified; predictions are
# rounded to 4 decimals for the CSV.
out_df = test.copy()
out_df["pred_label_match"] = np.round(test_pred, 4)
out_df.to_csv(OUT / "xgb_test.csv", index=False)
print("Predicted results saved (xgb_test.csv).")

Model saved: /Users/zyh/Desktop/HDG_group/models/xgb_labelmatch_regressor.pkl
Predicted results saved (xgb_test.csv).
