In [7]:
# Import paths
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.sparse import hstack
from sklearn.metrics import ndcg_score

import joblib

# Inputs are read from, and outputs written to, the current working directory.
OUT = Path.cwd()
DATA = OUT
OUT.mkdir(parents=True, exist_ok=True)
print("DATA =", DATA, "OUT =", OUT)

# Load the two pre-split program datasets.
train_csv = DATA / "train_program.csv"
test_csv = DATA / "test_program.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

print(f"Dataset sizes: train={len(train)}, test={len(test)}")


DATA = /Users/zyh/Desktop/HDG_group/models OUT = /Users/zyh/Desktop/HDG_group/models
Dataset sizes: train=233158, test=99592


In [8]:
# Feature engineering: MLB one-hot on interests and field_tags
def to_list(s: str):
    """Split a semicolon-separated cell into normalized tokens.

    Parameters
    ----------
    s : str
        Raw cell value such as ``"AI; Robotics"``. Other non-missing values
        are converted with ``str()`` before splitting.

    Returns
    -------
    list of str
        Stripped, lower-cased, non-empty tokens in their original order.
        ``None`` and float NaN yield an empty list instead of the spurious
        tokens ``"none"`` / ``"nan"`` that ``str()`` would otherwise produce.
    """
    # NaN is the only float value that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return []
    return [tok.strip().lower() for tok in str(s).split(";") if tok.strip()]

# One multi-hot encoder per semicolon-separated column. Each encoder is fitted
# on the training split and then reused as-is to transform the test split.
mlb_int = MultiLabelBinarizer(sparse_output=True)
mlb_tag = MultiLabelBinarizer(sparse_output=True)

train_interest_mat = mlb_int.fit_transform(train["interests"].map(to_list))
train_tag_mat = mlb_tag.fit_transform(train["field_tags"].map(to_list))
X_tr_sparse = hstack([train_interest_mat, train_tag_mat], format="csr")

test_interest_mat = mlb_int.transform(test["interests"].map(to_list))
test_tag_mat = mlb_tag.transform(test["field_tags"].map(to_list))
X_te_sparse = hstack([test_interest_mat, test_tag_mat], format="csr")

# Regression targets for both splits.
y_train = train["label_match"].values
y_test = test["label_match"].values

# Displayed as the cell output: (train shape, test shape).
(X_tr_sparse.shape, X_te_sparse.shape)


((233158, 48), (99592, 48))

In [9]:
# KNN regressor configuration: 5 neighbours, distance-based weighting,
# cosine metric, n_jobs=-1 for parallel neighbour search.
knn = KNeighborsRegressor(
    metric="cosine",
    n_neighbors=5,
    weights="distance",
    n_jobs=-1,
)

In [10]:
# Fit on the training features, then score the test split.
# `fit` returns the estimator itself, so the two steps can be chained.
test_pred = knn.fit(X_tr_sparse, y_train).predict(X_te_sparse)


In [11]:
# Save artifacts
def save_scored(df_split, preds, fname, out_dir=None):
    """Write a copy of ``df_split`` with a rounded prediction column to CSV.

    Parameters
    ----------
    df_split : pandas.DataFrame
        Rows that were scored. The frame itself is not modified.
    preds : array-like
        One prediction per row of ``df_split``, in row order.
    fname : str
        File name of the CSV to create.
    out_dir : path-like, optional
        Directory to write into. Defaults to the notebook-level ``OUT``.

    Raises
    ------
    ValueError
        Raised by pandas if ``len(preds)`` differs from ``len(df_split)``.
    """
    out = df_split.copy()
    # Coerce to a plain ndarray so the assignment is positional. Assigning a
    # pandas Series would align on index and silently insert NaN wherever the
    # indexes of `preds` and `df_split` differ.
    out["pred_label_match"] = np.round(np.asarray(preds), 4)
    target_dir = OUT if out_dir is None else Path(out_dir)
    out.to_csv(target_dir / fname, index=False)

# Persist the test split together with its KNN predictions.
save_scored(
    df_split=test,
    preds=test_pred,
    fname="knn_program_test.csv",
)

In [12]:

# Ranking quality per student: nDCG@3 of the predicted scores against the
# true match labels, computed over each student's rows in the test split.
val_df = test.loc[:, ["student_id", "label_match"]].copy()
val_df["pred_label_match"] = np.asarray(test_pred, dtype=float)

# ndcg_score expects 2-D (n_queries, n_documents) inputs, so each student's
# rows are passed as a single query of shape (1, n_rows).
rows = [
    {
        "student_id": sid,
        "nDCG@3": float(
            ndcg_score(
                grp["label_match"].to_numpy()[np.newaxis, :],
                grp["pred_label_match"].to_numpy()[np.newaxis, :],
                k=3,
            )
        ),
    }
    for sid, grp in val_df.groupby("student_id", sort=True)
]

ndcg_KNN = pd.DataFrame(rows).sort_values("student_id").reset_index(drop=True)
mean_ndcg_KNN = float(ndcg_KNN["nDCG@3"].mean())

display(ndcg_KNN)
print("Mean nDCG@3", round(mean_ndcg_KNN, 6))


Unnamed: 0,student_id,nDCG@3
0,2,0.987274
1,3,0.969386
2,10,0.986164
3,12,0.941887
4,13,0.935630
...,...,...
1495,4983,0.951029
1496,4988,1.000000
1497,4989,0.992543
1498,4990,0.970543


Mean nDCG@3 0.954456
