In [6]:
# 0) Imports & paths
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.sparse import hstack

import joblib

# Inputs are read from, and artifacts are written to, the current working
# directory; DATA and OUT are two names for the same Path.
DATA = Path.cwd()
OUT = DATA
OUT.mkdir(parents=True, exist_ok=True)
print("DATA =", DATA, "OUT =", OUT)

# Load the two splits that live next to the notebook.
train, test = (pd.read_csv(DATA / csv_name) for csv_name in ("train.csv", "test.csv"))

print(f"Dataset sizes: train={len(train)}, test={len(test)}")


DATA = /Users/zyh/Desktop/HDG_group/models OUT = /Users/zyh/Desktop/HDG_group/models
Dataset sizes: train=273049, test=68263


In [7]:
# 1) Feature engineering: MLB one-hot on interests & field_tags
def to_list(s: str):
    """Split a ``;``-separated string into a list of normalised tokens.

    Every token is stripped of surrounding whitespace and lower-cased; empty
    tokens (from ``"a;;b"``, a trailing ``;`` or an empty string) are dropped.

    Missing values yield an empty list: ``None`` and float NaN (which is what
    ``pd.read_csv`` produces for an empty cell by default) return ``[]``.
    Without this guard ``str(s)`` would turn them into the literal tokens
    ``"none"`` / ``"nan"``, and the binarizer would learn those as a category.

    Parameters
    ----------
    s : str
        Raw cell value. Non-string values other than None/NaN are converted
        with ``str`` before splitting.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates are kept).
    """
    # `s != s` is true only for NaN, so no math/numpy import is needed here.
    if s is None or (isinstance(s, float) and s != s):
        return []
    return [token.strip().lower() for token in str(s).split(";") if token.strip()]

# One binarizer per column, so each keeps its own vocabulary.
mlb_int = MultiLabelBinarizer(sparse_output=True)
mlb_tag = MultiLabelBinarizer(sparse_output=True)


def _encode(frame, encode_interests, encode_tags):
    """Binarize both token columns of ``frame`` and stack them side by side as CSR."""
    interest_block = encode_interests(frame["interests"].map(to_list))
    tag_block = encode_tags(frame["field_tags"].map(to_list))
    return hstack([interest_block, tag_block], format="csr")


# Vocabularies are learned on the training split only; the test split is
# encoded with those already-fitted binarizers.
X_tr_sparse = _encode(train, mlb_int.fit_transform, mlb_tag.fit_transform)
X_te_sparse = _encode(test, mlb_int.transform, mlb_tag.transform)

# Regression targets for each split.
y_train = train["label_match"].values
y_test = test["label_match"].values

X_tr_sparse.shape, X_te_sparse.shape


((273049, 48), (68263, 48))

In [8]:
# 2) Choose KNN variant
# Hyperparameters kept in one place: 5 neighbours, distance-weighted
# averaging of their targets, cosine distance between feature rows.
knn_params = dict(n_neighbors=5, weights="distance", metric="cosine")
knn = KNeighborsRegressor(**knn_params)

In [9]:
# 3) Train & evaluate on the test split
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_true, y_pred)`` as a built-in
        ``float`` (so it serialises cleanly with ``json``).
    """
    # Relies on the notebook's top-level `np` and `mean_squared_error` imports;
    # re-importing them on every call was redundant.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# Fit on the training matrix, then predict the held-out test rows.
knn.fit(X_tr_sparse, y_train)
test_pred = knn.predict(X_te_sparse)

# Collect the scores as built-in floats so the dict can be dumped to JSON.
metrics = {}
metrics["test_rmse"] = rmse(y_test, test_pred)
metrics["test_mae"] = float(mean_absolute_error(y_test, test_pred))
metrics["test_r2"] = float(r2_score(y_test, test_pred))
metrics


{'test_rmse': 0.034214340476725856,
 'test_mae': 0.023618959181638807,
 'test_r2': 0.8564126061946524}

In [10]:
# 4) Save artifacts
def save_scored(df_split, preds, fname):
    """Write ``df_split`` plus a rounded prediction column to ``OUT / fname``.

    Parameters
    ----------
    df_split : pandas.DataFrame
        Rows that were scored; the frame passed in is not modified.
    preds : array-like
        One prediction per row of ``df_split``; rounded to 4 decimals.
    fname : str
        File name of the CSV, created inside the module-level ``OUT`` directory.
    """
    rounded_preds = np.round(preds, 4)
    # `assign` returns a new frame, so the caller's frame stays untouched.
    scored = df_split.assign(pred_label_match=rounded_preds)
    destination = OUT / fname
    scored.to_csv(destination, index=False)

# Scored test rows as CSV.
save_scored(test, test_pred, "knn_test.csv")

# The fitted model is bundled with both binarizers so that new rows can be
# encoded with the same vocabularies before prediction.
model_bundle = {"model": knn, "mlb_int": mlb_int, "mlb_tag": mlb_tag}
joblib.dump(model_bundle, OUT / "model_KNN.pkl")

# Evaluation metrics as pretty-printed JSON.
(OUT / "knn_results.json").write_text(json.dumps(metrics, indent=2))

print("Artifacts saved to", OUT)

Artifacts saved to /Users/zyh/Desktop/HDG_group/models
