# Setup & Imports

In [1]:
# 1) Imports
import os, math, json, numpy as np, pandas as pd
from typing import Dict, List, Tuple
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Config & Data Loading

In [2]:
# 2) Config & paths
# The data directory can be overridden with the ANIME_DATA_DIR environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset the original local path is used unchanged.
DATA_DIR   = os.environ.get(
    "ANIME_DATA_DIR",
    r"C:\Users\Dewald\Documents\GitHub\2501PTDS-Unsupervised-Learning\Data",
)
ANIME_PATH = os.path.join(DATA_DIR, "anime.csv")
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")

# Column names shared by every cell below.
USER_COL, ITEM_COL, RATE_COL = "user_id", "anime_id", "rating"

# Toggle this for faster trial runs (recommended first).
# NOTE(review): in the cells shown, SUBSET_DEMO only changes K, the TF-IDF
# vocabulary size and validation sampling; no visible cell applies
# TOP_USERS / TOP_ITEMS to shrink the matrix — confirm whether that is intended.
SUBSET_DEMO = False         # True = use top users/items to shrink the matrix; False = full data
TOP_USERS   = 2000          # used only when SUBSET_DEMO=True
TOP_ITEMS   = 3000

# 2.1 Load the three CSV inputs.
anime_df = pd.read_csv(ANIME_PATH)
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# 2.2 Clean ratings: drop rows whose rating is not numeric (coerced to NaN),
# cast the column to float, then drop negative ratings such as -1.
train_df = train_df[pd.to_numeric(train_df[RATE_COL], errors="coerce").notna()].copy()
train_df[RATE_COL] = train_df[RATE_COL].astype(float)
train_df = train_df[train_df[RATE_COL] >= 0].copy()

# Rating scale bounds (used later to clip predictions) and the global mean
# (used as the last-resort fallback prediction).
rating_min = float(train_df[RATE_COL].min())
rating_max = float(train_df[RATE_COL].max())
global_mean = float(train_df[RATE_COL].mean())
print(f"Train: {train_df.shape}, Test: {test_df.shape}, Rating range [{rating_min}, {rating_max}], mean={global_mean:.3f}")


Train: (5703555, 3), Test: (633686, 2), Rating range [1.0, 10.0], mean=7.809


# Content Text (TF-IDF features)

In [3]:
# Build a light text field from whichever metadata columns are available.
meta_cols = [c for c in ["genre", "type", "name", "episodes"] if c in anime_df.columns]
_meta = anime_df[meta_cols]
# Blank out missing values: astype(str) alone would turn NaN into the literal
# token "nan", which TF-IDF would then treat as a word shared between items.
anime_df["__text__"] = _meta.astype(str).where(_meta.notna(), "").agg(" ".join, axis=1)

# Keep only items we have in train, in sorted-id order. This order defines the
# row order of the TF-IDF matrix and must match item_to_index built later.
item_ids = pd.Index(sorted(train_df[ITEM_COL].unique()))
# Keep one metadata row per item: a duplicated anime_id would fan out the left
# merge and shift every following row out of alignment with item_ids.
anime_meta = pd.DataFrame({ITEM_COL: item_ids}).merge(
    anime_df.drop_duplicates(subset=ITEM_COL)[[ITEM_COL, "__text__"]],
    on=ITEM_COL,
    how="left",
).fillna({"__text__": ""})
assert len(anime_meta) == len(item_ids), "anime_meta must have exactly one row per training item"


# ID Maps & Sparse Matrix

In [4]:
# Map raw user/item IDs to contiguous 0-based row/column indices.
user_ids = pd.Index(sorted(train_df[USER_COL].unique()))
user_to_index = {uid: i for i, uid in enumerate(user_ids)}
item_to_index = {iid: i for i, iid in enumerate(item_ids)}

n_users, n_items = len(user_ids), len(item_ids)
print(f"Users={n_users}, Items={n_items}")

# Build the CSR user-item ratings matrix.
# Converting COO to CSR sums entries that share the same (row, col), so a
# (user, item) pair appearing more than once in train_df would be stored as the
# SUM of its ratings. Average repeated pairs first; when there are no
# duplicates this leaves the data unchanged.
ratings_unique = (
    train_df.groupby([USER_COL, ITEM_COL], sort=False)[RATE_COL].mean().reset_index()
)
rows = ratings_unique[USER_COL].map(user_to_index).values
cols = ratings_unique[ITEM_COL].map(item_to_index).values
vals = ratings_unique[RATE_COL].values
R = sparse.coo_matrix((vals, (rows, cols)), shape=(n_users, n_items)).tocsr()

def csr_means(csr: sparse.csr_matrix, axis: int = 1) -> np.ndarray:
    """Mean of the stored entries along each row (axis=1) or column (otherwise).

    Only explicitly stored values are averaged; implicit zeros are ignored.
    A row/column with no stored entries receives the module-level
    ``global_mean``. Any ``axis`` other than 1 is handled by transposing and
    averaging the rows of the transpose.
    """
    if axis != 1:
        return csr_means(csr.T.tocsr(), axis=1)

    out = np.zeros(csr.shape[0], dtype=np.float64)
    # indptr[k]:indptr[k+1] delimits the stored values of row k.
    for row, (lo, hi) in enumerate(zip(csr.indptr[:-1], csr.indptr[1:])):
        out[row] = csr.data[lo:hi].mean() if hi > lo else global_mean
    return out

# Mean stored rating per user (rows of R) and per item (columns of R).
user_mean = csr_means(R, axis=1)
item_mean = csr_means(R, axis=0)

# One {item_index: rating} dict per user, in row order of R, so the predictors
# can look up a user's rating for a neighbour item in O(1).
user_rdict: List[Dict[int, float]] = [
    {int(item): float(rating) for item, rating in zip(R.indices[lo:hi], R.data[lo:hi])}
    for lo, hi in zip(R.indptr[:-1], R.indptr[1:])
]


Users=69481, Items=9838


# Neighbors (kNN) for CF and CB

In [5]:
# Number of neighbours kept per item.
K = 30 if not SUBSET_DEMO else 25
# Query one extra neighbour so the item itself can be removed afterwards;
# never ask for more neighbours than there are items.
N_QUERY = min(K + 1, n_items)


def _drop_self_neighbor(inds: np.ndarray, sims: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Remove each row's own index from its neighbour list.

    ``inds[r]`` / ``sims[r]`` hold the neighbours of item ``r``, nearest first.
    The item itself is usually in column 0, but that is not guaranteed: items
    with tied distances (e.g. exact-duplicate vectors) can be ordered ahead of
    it. So self is located by index rather than by position. If a row does not
    contain itself at all, its last (farthest) neighbour is dropped instead so
    every row keeps the same width.

    Returns arrays of shape ``(n_rows, width - 1)``.
    """
    n_rows, width = inds.shape
    if width == 0:
        return inds, sims
    drop = inds == np.arange(n_rows)[:, None]
    drop[~drop.any(axis=1), -1] = True
    keep = ~drop  # exactly one column is dropped per row
    return inds[keep].reshape(n_rows, width - 1), sims[keep].reshape(n_rows, width - 1)


def _item_neighbors(features):
    """Fit a brute-force cosine kNN on ``features`` (one row per item) and query it with itself.

    Returns ``(fitted_knn, raw_distances, neighbour_indices, neighbour_sims)``
    where the last two exclude the item itself.
    """
    knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=N_QUERY)
    knn.fit(features)
    dists, inds = knn.kneighbors(features, n_neighbors=N_QUERY, return_distance=True)
    # Both feature spaces are non-negative (ratings, TF-IDF), so true cosine
    # similarity lies in [0, 1]; clipping removes float noise such as -1e-16,
    # which would otherwise flip the sign of a single-neighbour weighted mean.
    sims = np.clip(1.0 - dists, 0.0, 1.0)
    inds, sims = _drop_self_neighbor(inds, sims)
    return knn, dists, inds, sims


# 6.1 Collaborative: item vectors are the columns of R -> fit kNN once on R.T
cf_knn, cf_dists, cf_inds, cf_sims = _item_neighbors(R.T)

# 6.2 Content: TF-IDF on the metadata text (rows follow anime_meta order)
tfidf = TfidfVectorizer(max_features=40000 if not SUBSET_DEMO else 20000, ngram_range=(1,2), min_df=3)
tfidf_item = tfidf.fit_transform(anime_meta["__text__"])

cb_knn, cb_dists, cb_inds, cb_sims = _item_neighbors(tfidf_item)


# Predictors (CF, CB, Hybrid)

In [6]:
def predict_from_neighbors(uidx: int, iidx: int, neigh_idx: np.ndarray, neigh_sim: np.ndarray) -> float:
    """Similarity-weighted mean of the user's ratings on the item's neighbours.

    Args:
        uidx: row index of the user (into ``user_rdict`` / ``user_mean``).
        iidx: index of the target item (row of ``neigh_idx`` / ``neigh_sim``).
        neigh_idx: per-item array of neighbour item indices.
        neigh_sim: per-item array of similarities aligned with ``neigh_idx``.

    Returns:
        The weighted mean over neighbours the user has rated, weights being the
        similarities (normalised by their absolute values). When no rated
        neighbour carries positive total weight, the average of the user's
        mean and the item's mean is returned instead.
    """
    rated_by_user = user_rdict[uidx]
    weighted_sum = 0.0
    weight_total = 0.0
    for neighbour, sim in zip(neigh_idx[iidx], neigh_sim[iidx]):
        rating = rated_by_user.get(int(neighbour))
        if rating is None:
            continue  # user has not rated this neighbour
        weighted_sum += sim * rating
        weight_total += abs(sim)

    if not weight_total > 0:
        # Back-off: nothing usable among the neighbours.
        return float(0.5 * user_mean[uidx] + 0.5 * item_mean[iidx])
    return float(weighted_sum / weight_total)

def predict_hybrid(uidx: int, iidx: int, alpha: float) -> float:
    """Blend the collaborative and content-based neighbour predictions.

    ``alpha`` is the weight of the collaborative estimate; ``1 - alpha`` is the
    weight of the content-based one. Both indices must already be valid.
    """
    collaborative = predict_from_neighbors(uidx, iidx, cf_inds, cf_sims)
    content_based = predict_from_neighbors(uidx, iidx, cb_inds, cb_sims)
    blended = alpha * collaborative + (1.0 - alpha) * content_based
    return float(blended)

def predict_single(uid: int, iid: int, alpha: float) -> float:
    """Predict a rating for raw ids, with cold-start fallbacks.

    Known user and item -> hybrid prediction; known user only -> that user's
    mean; known item only -> that item's mean; neither -> the global mean.
    The result is clipped to ``[rating_min, rating_max]``.
    """
    uidx = user_to_index.get(uid)
    iidx = item_to_index.get(iid)
    user_known = uidx is not None
    item_known = iidx is not None

    if user_known and item_known:
        estimate = predict_hybrid(uidx, iidx, alpha)
    elif user_known:
        estimate = float(user_mean[uidx])
    elif item_known:
        estimate = float(item_mean[iidx])
    else:
        estimate = global_mean

    clipped = np.clip(estimate, rating_min, rating_max)
    return float(clipped)


# Tune α on a Validation Split

In [7]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

def rmse_score(y_true, y_pred):
    """Root-mean-squared error between true and predicted values.

    Computed as ``sqrt(mean_squared_error(...))``, which behaves the same on
    every scikit-learn version. The previous implementation tried the
    ``squared=False`` keyword first; that keyword is deprecated in newer
    scikit-learn (emitting a warning on each call) and later removed, so the
    result depended on catching a ``TypeError``.

    Args:
        y_true: sequence of observed ratings.
        y_pred: sequence of predicted ratings, same length as ``y_true``.

    Returns:
        The RMSE as a float.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

In [8]:
# Hold out 10% of the observed ratings for tuning alpha (fixed seed).
# NOTE(review): R, the neighbour tables and the user/item means used by
# predict_single were built from the full train_df in earlier cells, so the
# held-out rows were seen when fitting and the validation error is likely
# optimistic. train_part is not used by any cell shown here — confirm whether
# the model was meant to be refit on it.
train_part, val_part = train_test_split(
    train_df[[USER_COL, ITEM_COL, RATE_COL]],
    test_size=0.10,
    random_state=42,
)

# Full-data mode: cap the validation set at 50,000 rows to keep tuning fast.
if not SUBSET_DEMO:
    val_part = (
        val_part
        .sample(n=min(50000, len(val_part)), random_state=42)
        .reset_index(drop=True)
    )

def batch_metrics(df, alpha: float):
    """Score the hybrid predictor on every (user, item, rating) row of ``df``.

    Args:
        df: frame containing the USER_COL, ITEM_COL and RATE_COL columns.
        alpha: blend weight passed through to ``predict_single``.

    Returns:
        ``(rmse, mae)`` of the predictions against the observed ratings.
    """
    records = list(df[[USER_COL, ITEM_COL, RATE_COL]].itertuples(index=False))
    predicted = [predict_single(user, item, alpha) for user, item, _ in records]
    observed = [float(rating) for _, _, rating in records]
    return rmse_score(observed, predicted), mean_absolute_error(observed, predicted)

# Evaluate 11 evenly spaced blend weights in [0, 1] on the validation sample.
alphas = np.linspace(0.0, 1.0, 11)
grid = [(a, *batch_metrics(val_part, a)) for a in alphas]

# Keep the first candidate with the strictly lowest RMSE.
best = (None, float("inf"), None)
for candidate in grid:
    if candidate[1] < best[1]:
        best = candidate

best_alpha, best_val_rmse, best_val_mae = best
print(f"Best α={best_alpha:.2f} | Val RMSE={best_val_rmse:.4f} | Val MAE={best_val_mae:.4f}")

# Show the five best settings, lowest RMSE first.
val_results = pd.DataFrame(grid, columns=["alpha","rmse","mae"]).sort_values("rmse")
display(val_results.head(5))


Best α=0.70 | Val RMSE=1.1851 | Val MAE=0.8701


Unnamed: 0,alpha,rmse,mae
7,0.7,1.185081,0.870059
6,0.6,1.186879,0.871523
8,0.8,1.190078,0.872965
5,0.5,1.195441,0.877153
9,0.9,1.201784,0.880196


# Predict test.csv and Save

In [9]:
# Predict every (user, item) pair in test.csv with the tuned alpha, working
# through the frame in blocks of BATCH rows. If full mode is too slow, lower
# BATCH or switch SUBSET_DEMO=True first.
BATCH = 200_000  # rows per block; tweak as needed for memory/time
preds = []
for start in range(0, len(test_df), BATCH):
    stop = start + BATCH
    block = test_df.iloc[start:stop]
    block_pred = []
    for u, i in zip(block[USER_COL], block[ITEM_COL]):
        block_pred.append(predict_single(u, i, best_alpha))
    preds += block_pred

# Attach the predictions as the rating column and write the result to disk.
sub = test_df.assign(**{RATE_COL: preds[:len(test_df)]})
SUB_PATH = os.path.join(DATA_DIR, "predictions.csv")
sub.to_csv(SUB_PATH, index=False)
print(f"Saved: {SUB_PATH}")


Saved: C:\Users\Dewald\Documents\GitHub\2501PTDS-Unsupervised-Learning\Data\predictions.csv


# (Optional) Save a Lightweight Artifact

In [10]:
# 10) Save lightweight artifact safely
import json, numpy as np

def save_csr(csr):
    """Convert a CSR matrix into a JSON-friendly dict of plain Python lists.

    The result has the keys ``data`` (floats), ``indices`` and ``indptr``
    (ints) and ``shape`` (two ints), in that order.
    """
    values = csr.data.astype(float).tolist()
    column_indices = csr.indices.astype(int).tolist()
    row_pointers = csr.indptr.astype(int).tolist()
    dims = list(map(int, csr.shape))
    return {
        "data": values,
        "indices": column_indices,
        "indptr": row_pointers,
        "shape": dims,
    }

def np_encoder(obj):
    """``json.dump(default=...)`` hook that converts NumPy values to plain Python.

    NumPy scalars become the matching Python scalar and NumPy arrays become
    (nested) lists.

    Raises:
        TypeError: for any other type, naming the offending type so a failed
            dump is easy to diagnose.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# Gather everything needed to rebuild the rating matrix and fallbacks later:
# scalar settings, the id orderings, the mean vectors and the CSR matrix itself.
artifact = {
    "global_mean": float(global_mean),
    "rating_min": float(rating_min),
    "rating_max": float(rating_max),
    "best_alpha": float(best_alpha),
    "user_ids": list(map(int, user_ids)),
    "item_ids": list(map(int, item_ids)),
    "user_mean": list(map(float, user_mean)),
    "item_mean": list(map(float, item_mean)),
    "R_csr": save_csr(R),
}

# Write the artifact as UTF-8 JSON next to the input data; np_encoder handles
# any NumPy scalar that is still present.
ART_PATH = os.path.join(DATA_DIR, "anime_hybrid_recommender.json")
with open(ART_PATH, "w", encoding="utf-8") as out_file:
    json.dump(artifact, out_file, default=np_encoder)

print(f"✅ Model artifact saved successfully at: {ART_PATH}")



✅ Model artifact saved successfully at: C:\Users\Dewald\Documents\GitHub\2501PTDS-Unsupervised-Learning\Data\anime_hybrid_recommender.json


In [11]:
import json

# Location of the JSON artifact written by the previous cell.
ART_PATH = r"C:\Users\Dewald\Documents\GitHub\2501PTDS-Unsupervised-Learning\Data\anime_hybrid_recommender.json"

# Read the artifact back into a plain dict.
with open(ART_PATH, "r", encoding="utf-8") as in_file:
    artifact = json.load(in_file)

# Top-level keys stored in the artifact.
print(artifact.keys())

# Spot-check a few stored values.
print(f"Global mean: {artifact['global_mean']}")
print(f"Best alpha: {artifact['best_alpha']}")
print(f"Matrix shape: {artifact['R_csr']['shape']}")


dict_keys(['global_mean', 'rating_min', 'rating_max', 'best_alpha', 'user_ids', 'item_ids', 'user_mean', 'item_mean', 'R_csr'])
Global mean: 7.808690544756735
Best alpha: 0.7000000000000001
Matrix shape: [69481, 9838]
