In [13]:
# Imports, working paths, and data loading
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.sparse import hstack
from sklearn.metrics import ndcg_score

import joblib

# Input and output both live in the current working directory.
OUT = Path.cwd()
DATA = OUT
OUT.mkdir(parents=True, exist_ok=True)
print("DATA =", DATA, "OUT =", OUT)

# Load the train and test splits from CSV files in DATA.
train_path, test_path = DATA / "train.csv", DATA / "test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print(f"Dataset sizes: train={len(train)}, test={len(test)}")


DATA = /Users/zyh/Desktop/HDG_group/models OUT = /Users/zyh/Desktop/HDG_group/models
Dataset sizes: train=340618, test=694


In [14]:
# Feature engineering: MLB one-hot on interests and field_tags
def to_list(s: str):
    """Split a semicolon-delimited string into cleaned, lowercase tokens.

    Parameters
    ----------
    s : str
        Raw cell value such as ``"AI; Robotics"``. ``None`` and float NaN
        (the value pandas uses for an empty CSV cell) are treated as
        "no labels".

    Returns
    -------
    list of str
        Stripped, lowercased, non-empty tokens in their original order.
    """
    # A missing value must not be stringified: str(nan) == "nan" would be
    # emitted as a bogus label and end up as its own one-hot column.
    # `s != s` is the import-free NaN test (NaN is the only float unequal to itself).
    if s is None or (isinstance(s, float) and s != s):
        return []
    stripped = (piece.strip() for piece in str(s).split(";"))
    return [tok.lower() for tok in stripped if tok]

# One binarizer per multi-label column; both emit sparse matrices.
mlb_int = MultiLabelBinarizer(sparse_output=True)
mlb_tag = MultiLabelBinarizer(sparse_output=True)

# Train split: fit each binarizer and encode in one step.
train_interest_ohe = mlb_int.fit_transform(train["interests"].map(to_list))
train_tag_ohe = mlb_tag.fit_transform(train["field_tags"].map(to_list))
X_tr_sparse = hstack([train_interest_ohe, train_tag_ohe], format="csr")

# Test split: encode with the already-fitted binarizers (no refit).
test_interest_ohe = mlb_int.transform(test["interests"].map(to_list))
test_tag_ohe = mlb_tag.transform(test["field_tags"].map(to_list))
X_te_sparse = hstack([test_interest_ohe, test_tag_ohe], format="csr")

# Regression targets.
y_train = train["label_match"].values
y_test = test["label_match"].values

# Cell output: (train shape, test shape).
X_tr_sparse.shape, X_te_sparse.shape


((340618, 48), (694, 48))

In [15]:
# KNN regressor configuration: 5 neighbours, cosine metric, and
# distance-based weighting of the neighbours' targets.
knn = KNeighborsRegressor(
    metric="cosine",
    weights="distance",
    n_neighbors=5,
)

In [16]:
# Train, predict on the test split, and compute regression metrics
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values; both are passed straight to
        ``sklearn.metrics.mean_squared_error``.

    Returns
    -------
    float
        Square root of the mean squared error, as a plain Python float.
    """
    # Uses the notebook-level `np` and `mean_squared_error` imports instead of
    # re-importing them on every call.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# fit() returns the estimator itself, so fitting and predicting can be chained.
test_pred = knn.fit(X_tr_sparse, y_train).predict(X_te_sparse)

# Test-set regression metrics, stored as plain Python floats.
metrics = dict(
    test_rmse=rmse(y_test, test_pred),
    test_mae=float(mean_absolute_error(y_test, test_pred)),
    test_r2=float(r2_score(y_test, test_pred)),
)
metrics


{'test_rmse': 0.02843740589972959,
 'test_mae': 0.018499140177731396,
 'test_r2': 0.8985210310644552}

In [17]:
# Save artifacts
def save_scored(df_split, preds, fname):
    """Write ``df_split`` plus a ``pred_label_match`` column to ``OUT / fname``.

    The input frame is not modified: the predictions, rounded to 4 decimals,
    are attached to a copy, which is written as CSV without the index.
    """
    scored = df_split.assign(pred_label_match=np.round(preds, 4))
    scored.to_csv(OUT / fname, index=False)

# Scored test rows as CSV.
save_scored(test, test_pred, "knn_test.csv")

# Persist the fitted model together with both fitted binarizers in one pickle.
artifact_bundle = {"model": knn, "mlb_int": mlb_int, "mlb_tag": mlb_tag}
joblib.dump(artifact_bundle, OUT / "model_KNN.pkl")

# Metrics as indented JSON.
metrics_json = json.dumps(metrics, indent=2)
with open(OUT / "knn_results.json", "w") as fh:
    fh.write(metrics_json)

print("Artifacts saved to", OUT)

Artifacts saved to /Users/zyh/Desktop/HDG_group/models


In [18]:

# Per-student ranking quality: nDCG@3 of the predicted scores against the
# true match labels, computed over each student's rows in the test split.
val_df = test[["student_id", "label_match"]].assign(
    pred_label_match=np.asarray(test_pred, dtype=float)
)

# ndcg_score expects 2-D (n_queries, n_docs) inputs, so each student's scores
# are lifted to a single-row matrix.
# NOTE(review): recent scikit-learn releases may reject groups with a single
# row — confirm every student has at least two test rows.
rows = [
    {
        "student_id": student,
        "nDCG@3": float(
            ndcg_score(
                grp["label_match"].to_numpy()[None, :],
                grp["pred_label_match"].to_numpy()[None, :],
                k=3,
            )
        ),
    }
    for student, grp in val_df.groupby("student_id", sort=True)
]

ndcg_KNN = pd.DataFrame(rows).sort_values("student_id").reset_index(drop=True)
mean_ndcg_KNN = float(ndcg_KNN["nDCG@3"].mean())

display(ndcg_KNN)
print("Mean nDCG@3", round(mean_ndcg_KNN, 6))


Unnamed: 0,student_id,nDCG@3
0,427,0.987842
1,878,0.994123
2,1234,0.932392
3,1848,0.937314
4,2024,0.960822
5,2064,1.0
6,3023,0.988593
7,3228,0.945687
8,3678,0.972527
9,3984,1.0


Mean nDCG@3 0.97193
