**Load the artifact from disk**

In [13]:
import json
import numpy as np
from scipy import sparse

ART_PATH = r"C:\Users\Dewald\Documents\GitHub\2501PTDS-Unsupervised-Learning\Data\anime_hybrid_recommender.json"

# --- Load the JSON ---
with open(ART_PATH, "r", encoding="utf-8") as f:
    artifact = json.load(f)

# --- Restore matrix and variables ---
r = artifact["R_csr"]
R = sparse.csr_matrix((r["data"], r["indices"], r["indptr"]), shape=tuple(r["shape"]))

global_mean = artifact["global_mean"]
best_alpha = artifact["best_alpha"]
user_mean = np.array(artifact["user_mean"])
item_mean = np.array(artifact["item_mean"])

user_ids = np.array(artifact["user_ids"])
item_ids = np.array(artifact["item_ids"])

print("✅ Model restored")
print(f"Matrix shape: {R.shape}, best α = {best_alpha}")



✅ Model restored
Matrix shape: (69481, 9838), best α = 0.7000000000000001


**Load the anime metadata**

In [14]:
import pandas as pd

ANIME_PATH = r"C:\Users\Dewald\Documents\GitHub\2501PTDS-Unsupervised-Learning\Data\anime.csv"
anime_df = pd.read_csv(ANIME_PATH)

# Create a quick lookup dictionary for names
anime_lookup = dict(zip(anime_df["anime_id"], anime_df["name"]))


**Define helper to predict rating for a specific user/anime**

In [15]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Content-based neighbors ---
meta_cols = [c for c in ["genre", "type", "name", "episodes"] if c in anime_df.columns]
anime_df["__text__"] = anime_df[meta_cols].astype(str).agg(" ".join, axis=1)
anime_meta = anime_df[anime_df["anime_id"].isin(item_ids)][["anime_id", "__text__"]]

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=3)
tfidf_item = tfidf.fit_transform(anime_meta["__text__"])

cb_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=30)
cb_knn.fit(tfidf_item)
cb_dists, cb_inds = cb_knn.kneighbors(tfidf_item, n_neighbors=30, return_distance=True)
cb_sims = 1.0 - cb_dists

# --- Collaborative neighbors ---
cf_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=30)
cf_knn.fit(R.T)
cf_dists, cf_inds = cf_knn.kneighbors(R.T, n_neighbors=30, return_distance=True)
cf_sims = 1.0 - cf_dists


**Define rating predictor**

In [16]:
# --- 5. Predictors with safety checks ---

# Rebuild per-user rating dicts for quick lookups
user_rdict = []
for u in range(R.shape[0]):
    s, e = R.indptr[u], R.indptr[u + 1]
    user_rdict.append({int(i): float(r) for i, r in zip(R.indices[s:e], R.data[s:e])})


def predict_from_neighbors(uidx: int, iidx: int,
                           neigh_idx: np.ndarray, neigh_sim: np.ndarray) -> float:
    """
    Weighted average of user's ratings on similar items.
    Safe against out-of-range neighbor indices.
    """
    max_idx = R.shape[1] - 1
    # Filter invalid neighbor indices
    valid_pairs = [(int(nb), float(s))
                   for nb, s in zip(neigh_idx[iidx], neigh_sim[iidx])
                   if 0 <= nb <= max_idx]

    numer = denom = 0.0
    rdict = user_rdict[uidx]
    for nb, s in valid_pairs:
        r = rdict.get(nb)
        if r is not None:
            numer += s * r
            denom += abs(s)

    if denom > 0:
        return numer / denom
    # Fallback if user hasn't rated any neighbor
    return float(0.5 * user_mean[uidx] + 0.5 * item_mean[iidx])


def predict_hybrid(uidx: int, iidx: int, alpha: float) -> float:
    """Blend CF + CB predictions."""
    p_cf = predict_from_neighbors(uidx, iidx, cf_inds, cf_sims)
    p_cb = predict_from_neighbors(uidx, iidx, cb_inds, cb_sims)
    return float(alpha * p_cf + (1 - alpha) * p_cb)



**Recommend new anime for a user**

In [17]:
def recommend_for_user(user_id: int, top_n: int = 10) -> pd.DataFrame:
    """
    Recommend top N anime for a given user_id using the hybrid model.
    Automatically skips items already rated.
    """
    if user_id not in user_ids:
        print(f"⚠️ User ID {user_id} not found in training data.")
        return pd.DataFrame()

    uidx = np.where(user_ids == user_id)[0][0]
    rated_items = set(user_rdict[uidx].keys())

    preds = []
    for iidx in range(R.shape[1]):
        if iidx not in rated_items:
            try:
                p = predict_hybrid(uidx, iidx, best_alpha)
            except IndexError:
                # Extra safety: skip if any index mismatch occurs
                continue
            preds.append((iidx, p))

    # Sort and take top-N
    preds = sorted(preds, key=lambda x: x[1], reverse=True)[:top_n]

    recs = pd.DataFrame({
        "anime_id": [int(item_ids[i]) for i, _ in preds],
        "predicted_rating": [round(float(p), 2) for _, p in preds]
    })
    recs["name"] = recs["anime_id"].map(anime_lookup)
    return recs[["anime_id", "name", "predicted_rating"]]



**Try it!**

In [18]:
print("R items:", R.shape[1])
print("TF-IDF items:", tfidf_item.shape[0])


R items: 9838
TF-IDF items: 9837


In [20]:
# Example: Recommend top 10 anime for user_id = 12345 (replace with a valid ID from train_df)
user_id = 40763
recommendations = recommend_for_user(user_id, top_n=10)
print(recommendations)


   anime_id                                               name  \
0      5163   Code Geass: Hangyaku no Lelouch R2 Picture Drama   
1      7270              Mobile Suit Gundam 00 Special Edition   
2     12685                 Code Geass: Nunnally in Wonderland   
3     17949                       The Everlasting Guilty Crown   
4       578                                     Hotaru no Haka   
5      9023  Katekyo Hitman Reborn!: Mr. Rebokku no Ciao Ci...   
6       552                                  Digimon Adventure   
7      5468                                Yattokame Tanteidan   
8      5606                              Ryoujoku Joshi Gakuen   
9      8196  Kawasaki Frontale x Tentai Senshi Sunred 2nd S...   

   predicted_rating  
0              8.09  
1              8.00  
2              7.88  
3              7.82  
4              7.80  
5              7.80  
6              7.73  
7              7.71  
8              7.71  
9              7.71  
