In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity between vectors
from scipy.spatial.distance import pdist, squareform  # For pairwise distance computations and converting to a square matrix
import pickle
import math

# pd.set_option('display.max_columns', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [4]:
#---------- load ----------
with open("rating_df_final.pk", "rb") as f:
    rating_df = pickle.load(f)
with open("anime_df_final.pk", "rb") as f:
    anime_df = pickle.load(f)

In [4]:
anime_df.head()

Unnamed: 0,anime_name,anime_id,Genres,Score,Synopsis,Score_num
0,Fullmetal Alchemist: Brotherhood,5114,"Action, Military, Adventure, Comedy, Drama, Ma...",9.19,"""In order for something to be obtained, someth...",9.19
1,Shingeki no Kyojin: The Final Season,40028,"Action, Military, Mystery, Super Power, Drama,...",9.17,Gabi Braun and Falco Grice have been training ...,9.17
2,Steins;Gate,9253,"Thriller, Sci-Fi",9.11,The self-proclaimed mad scientist Rintarou Oka...,9.11
3,Hunter x Hunter (2011),11061,"Action, Adventure, Fantasy, Shounen, Super Power",9.1,Hunter x Hunter is set in a world where Hunter...,9.1
4,Shingeki no Kyojin Season 3 Part 2,38524,"Action, Drama, Fantasy, Military, Mystery, Sho...",9.1,Seeking to restore humanity's diminishing hope...,9.1


In [14]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
4511,36,6512,7,Nyan Koi!
4512,36,5958,7,Sora no Otoshimono
4513,36,6802,8,So Ra No Wo To
4514,36,17187,8,Koukaku Kidoutai Arise: Ghost in the Shell - B...
4515,36,16498,8,Shingeki no Kyojin


In [6]:
def recommend_by_jaccard(
    title,
    anime_df,
    top_n=10,
    precomputed=None  # None, DataFrame 
):
    """
    recommend_by_jaccard is a function that recommends similar anime using Jaccard similarity
    based on either Genres or Themes.

    :param title: str
        The anime title (anime_name) to base recommendations on.

    :param anime_df: pd.DataFrame
        DataFrame containing at least 'anime_name' and the selected feature column ('Genres' or 'Themes').

    :param top_n: int
        Number of top similar results to return.

    :param precomputed: np.ndarray or None
        Optional precomputed Jaccard distance array to avoid recalculating distances.

    :return: dict
        A dictionary with:
            { "top": pd.Series } — the top-N most similar anime and their similarity scores.
    """

    type = 'Genres'
    # Check if title exists in the dataset
    if title not in anime_df['anime_name'].values:
        raise ValueError(f"'{title}' not found in dataset.")

    def compute_jaccard(df, col):
        cross_tab = pd.crosstab(df['anime_name'], df[col])
        distances = pdist(cross_tab.values, metric='jaccard')
        similarity = 1 - squareform(distances)
        return pd.DataFrame(similarity, index=cross_tab.index, columns=cross_tab.index)
    
    def compute_jaccard_array(df, col, arr):
        cross_tab = pd.crosstab(df['anime_name'], df[col])
        distances = arr
        similarity = 1 - squareform(distances)
        return pd.DataFrame(similarity, index=cross_tab.index, columns=cross_tab.index)
        
    df_jaccard = anime_df[["anime_name", type]]


    sim = precomputed
    if sim is None:
        sim = compute_jaccard(df_jaccard, type)
        if title not in sim.index:
            raise ValueError(f"'{title}' not found in similarity data.")
        top = sim.loc[title].sort_values(ascending=False)[1:top_n+1]
        return {"top": top}
        
    else:
        sim_df = compute_jaccard_array(df_jaccard, type, sim)

        if title not in sim_df.index:
            raise ValueError(f"'{title}' not found in similarity data.")
        top = sim_df.loc[title].sort_values(ascending=False)[1:top_n+1]
        return {"top": top}


In [7]:
recommend_by_jaccard("One Punch Man", anime_df)

{'top': anime_name
 One Punch Man 2nd Season             1.0
 One Punch Man 2nd Season Specials    1.0
 One Punch Man Specials               1.0
 One Punch Man: Road to Hero          1.0
 Zoids Shinseiki/Zero                 0.0
 5-toubun no Hanayome                 0.0
 xxxHOLiC Shunmuki                    0.0
 3-gatsu no Lion                      0.0
 Zero no Tsukaima                     0.0
 Zero no Tsukaima F                   0.0
 Name: One Punch Man, dtype: float64}

In [8]:
def tf_id_rec(title, anime_df, top_n, precomputed=None):
    """
    tf_id_rec is a function that recommends the most similar anime 
    using TF-IDF cosine similarity based on the Synopsis field.

    :param title: str
        The anime title (anime_name) to base recommendations on.

    :param anime_df: pd.DataFrame
        DataFrame containing at least 'anime_name' and 'Synopsis' columns.

    :param top_n: int
        The number of top similar anime to return (excluding the anime itself).

    :param precomputed: np.ndarray or None
        Optional precomputed cosine similarity matrix. If provided, it will be used instead of recomputing.

    :return: dict
        A dictionary with:
            { "top": pd.Series } — the top-N most similar anime and their similarity scores.
    """
    
# 1) verify the given title actually exists in the dataset
    if title not in anime_df['anime_name'].values:
        raise ValueError("'{0}' not found in dataset.".format(title))

    # 2) select only the columns we need and drop any rows where Synopsis is missing
    df_content = anime_df[['anime_name', 'Synopsis']].dropna(subset=['Synopsis'])

    # 3) build the TF-IDF matrix over all synopses
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english')
    tfidf_mat = vectorizer.fit_transform(df_content['Synopsis'])
    tfidf_df  = pd.DataFrame(
        tfidf_mat.toarray(),
        index=df_content['anime_name'],
        columns=vectorizer.get_feature_names_out()
    )

    # 4) if a full cosine-similarity matrix was provided, reuse it
    if precomputed is not None:
        sim_df = pd.DataFrame(
            precomputed,
            index=tfidf_df.index,
            columns=tfidf_df.index
        )
        sims = sim_df.loc[title].sort_values(ascending=False)
        return {"top": sims}

    # 5) otherwise compute similarity between the target and every other anime
    target_vec   = tfidf_df.loc[title].values.reshape(1, -1)
    other_df     = tfidf_df.drop(title, axis=0)
    scores       = cosine_similarity(target_vec, other_df.values)[0]
    result_series = pd.Series(scores, index=other_df.index)

    # 6) pick the top_n highest-scoring titles
    top_similar = result_series.sort_values(ascending=False).iloc[:top_n]

    return {"top": top_similar}


In [9]:
tf_id_rec("Grand Blue", anime_df)

TypeError: tf_id_rec() missing 1 required positional argument: 'top_n'

<h1>USER BASED CF<h1>

In [28]:
user2movie = rating_df.groupby('user_id')['anime_id'].apply(list).to_dict()
movie2user = rating_df.groupby('anime_id')['user_id'].apply(list).to_dict()
user_movie = zip(rating_df['user_id'], rating_df['anime_id'])
user_movie_rating = zip(user_movie, rating_df['rating'])
user_movie2rating = dict(user_movie_rating)

In [29]:
def compute_similarity_matrix(user2movie, user_movie2rating, user_avg, min_common):
    """
    compute_similarity_matrix is a function that precomputes the similarity for each pair of users in 
    the training set and saves the similarity scores in a dictionary.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param min_common: the required minimum number of common movies between a pair of movies to be eligible
    for similarity calculation.
    :return: a nested dictionary where the key is a user and the value is a dictionary of the similarity 
    score between the key user and all the other users.
    """
    similarity_matrix = {}
    all_users = list(user2movie.keys())
    print(len(all_users))
    for i, user1 in enumerate(all_users):
        if user1 not in similarity_matrix:
            similarity_matrix[user1] = {}
    
        if (i + 1) % 100 == 0:
            print('{} users processed'.format(i + 1))

        for j in range(i + 1, len(all_users)):
            user2 = all_users[j]
            pearson_similarity = calculate_pearson_similarity(user1, user2, user2movie, user_movie2rating, user_avg, min_common)
            similarity_matrix[user1][user2] = pearson_similarity
            
            if user2 not in similarity_matrix:
                similarity_matrix[user2] = {}
            similarity_matrix[user2][user1] = pearson_similarity

    return similarity_matrix

In [None]:
# ================================================================
#  USER-BASED  &  ITEM-BASED  COLLABORATIVE FILTERING – EVALUATION
# ================================================================
import pandas as pd, numpy as np, math
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm.notebook import tqdm   # nice progress bars

# ----------------------------------------------------------------
# 0.  TRAIN / TEST SPLIT
# ----------------------------------------------------------------
train_df, test_df = train_test_split(
    rating_df,                   # -> must already be loaded
    test_size=0.2,
    stratify=rating_df["user_id"],
    random_state=42
)

# ----------------------------------------------------------------
# 1.  HELPER DICTIONARIES  (built from **training** only)
# ----------------------------------------------------------------
user2movie = train_df.groupby("user_id")["anime_id"].apply(list).to_dict()
movie2user = train_df.groupby("anime_id")["user_id"].apply(list).to_dict()
user_movie2rating = {
    (r.user_id, r.anime_id): r.rating for r in train_df.itertuples()
}
user_avg = train_df.groupby("user_id")["rating"].mean().to_dict()
global_mean = train_df["rating"].mean()

# ----------------------------------------------------------------
# 2.  USER-USER  PEARSON SIMILARITY  (min_common ≥ 5)
# ----------------------------------------------------------------
def build_user_similarity(min_common=5):
    sim = defaultdict(dict)
    for u, movies_u in tqdm(user2movie.items(), desc="user-sim"):
        set_u = set(movies_u)
        mu_u  = user_avg[u]
        for v, movies_v in user2movie.items():
            if v <= u:                  # symmetry & skip self
                continue
            common = set_u & set(movies_v)
            if len(common) < min_common:
                continue
            mu_v = user_avg[v]
            num = den_u = den_v = 0.0
            for m in common:
                d_u = user_movie2rating[(u, m)] - mu_u
                d_v = user_movie2rating[(v, m)] - mu_v
                num += d_u * d_v
                den_u += d_u**2
                den_v += d_v**2
            if den_u and den_v:
                s = num / math.sqrt(den_u * den_v)
                sim[u][v] = sim[v][u] = s
    return sim

user_similarity = build_user_similarity(min_common=5)

# ----------------------------------------------------------------
# 3.  ITEM-ITEM  PEARSON SIMILARITY  (min_common ≥ 5)
# ----------------------------------------------------------------
def build_item_similarity(min_common=5):
    sim = defaultdict(dict)
    for i, users_i in tqdm(movie2user.items(), desc="item-sim"):
        set_i = set(users_i)
        mu_i  = train_df.loc[train_df.anime_id == i, "rating"].mean()
        for j, users_j in movie2user.items():
            if j <= i:                  # symmetry & skip self
                continue
            common = set_i & set(users_j)
            if len(common) < min_common:
                continue
            mu_j = train_df.loc[train_df.anime_id == j, "rating"].mean()
            num = den_i = den_j = 0.0
            for u in common:
                d_i = user_movie2rating[(u, i)] - mu_i
                d_j = user_movie2rating[(u, j)] - mu_j
                num += d_i * d_j
                den_i += d_i**2
                den_j += d_j**2
            if den_i and den_j:
                s = num / math.sqrt(den_i * den_j)
                sim[i][j] = sim[j][i] = s
    return sim

item_similarity = build_item_similarity(min_common=5)

# ----------------------------------------------------------------
# 4-A.  USER-CF  PREDICT  (neighbourhood size k)
# ----------------------------------------------------------------
def predict_user_cf(user, movie, k=25, sim_cutoff=0.05):
    if (user in user_avg) and (movie in movie2user):
        num = den = 0.0
        # loop over users who rated the target movie
        for v in movie2user[movie]:
            if v == user:
                continue
            s = user_similarity[user].get(v, 0.0)
            if abs(s) < sim_cutoff:
                continue
            num += s * (user_movie2rating[(v, movie)] - user_avg[v])
            den += abs(s)
        if den:
            return user_avg[user] + num / den
        return user_avg[user]
    # cold-start fallback
    return global_mean

# ----------------------------------------------------------------
# 4-B.  ITEM-CF  PREDICT  (neighbourhood size k)
# ----------------------------------------------------------------
def predict_item_cf(user, movie, k=25, sim_cutoff=0.05):
    if (user in user2movie) and (movie in item_similarity):
        num = den = 0.0
        for j in user2movie[user]:
            if j == movie:
                continue
            s = item_similarity[movie].get(j, 0.0)
            if abs(s) < sim_cutoff:
                continue
            r = user_movie2rating[(user, j)]
            mu_j = train_df.loc[train_df.anime_id == j, "rating"].mean()
            num += s * (r - mu_j)
            den += abs(s)
        mu_i = train_df.loc[train_df.anime_id == movie, "rating"].mean()
        if den:
            return mu_i + num / den
        return mu_i
    # cold-start fallback
    return global_mean

# ----------------------------------------------------------------
# 5.  RMSE EVALUATION LOOP
# ----------------------------------------------------------------
def rmse(predict_fn):
    se = []
    for row in test_df.itertuples():
        se.append((row.rating - predict_fn(row.user_id, row.anime_id))**2)
    return np.sqrt(np.mean(se))

rmse_user = rmse(lambda u, m: predict_user_cf(u, m, k=25))
rmse_item = rmse(lambda u, m: predict_item_cf(u, m, k=25))

print("User-CF  RMSE:", round(rmse_user, 4))
print("Item-CF  RMSE:", round(rmse_item, 4))


user-sim:   0%|          | 0/10000 [00:00<?, ?it/s]

item-sim:   0%|          | 0/9672 [00:00<?, ?it/s]

In [None]:
# ==============================================================
#  EXTRA EVALUATION –  Precision, Recall, F1 and PR-Curve
#  --------------------------------------------------------------
#  We treat each user’s rows in *test_df* as “ground-truth likes”.
#  For every user we ask the CF model for its top-k unseen items,
#  then measure how many of those were actually in that user’s
#  withheld ratings.
#
#  Metrics:
#     · Precision@k  (how many recommended were relevant)
#     · Recall@k     (how many relevant were retrieved)
#     · F1@k         (harmonic mean of the two)
#     · Average-Precision  (area under PR curve per user)
#     · Global PR-Curve    (micro average)
#
#  Prerequisites (already in memory):
#     · predict_user_cf(user, movie)   – rating predictor
#     · predict_item_cf(user, movie)   – rating predictor
#     · train_df , test_df             – split tables
# --------------------------------------------------------------

import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.notebook import tqdm

# ----------- choose which CF flavour to evaluate ---------------
predict_fn   = lambda u, m: predict_user_cf(u, m, k=25)   # or item version
model_name   = "User-CF"
K            = 10     # recommend top-10

# ----------- build ground-truth lookup -------------------------
truth = defaultdict(set)          # user_id  →  set(relevant anime_id)
for row in test_df.itertuples():
    truth[row.user_id].add(row.anime_id)

# ----------- evaluation loop -----------------------------------
precisions = []
recalls    = []
f1s        = []
all_P      = []      # collect precision values for micro PR-curve
all_R      = []      # collect recall values

for u, true_items in tqdm(truth.items(), desc="Eval users"):
    # 1) candidate pool = every item *not* yet rated by user in TRAIN
    seen_items = set(user2movie.get(u, []))
    candidates = [m for m in movie2user if m not in seen_items]

    # 2) score each candidate (small optimisation: random 5k sample
    #    first, then re-score top-500 exactly – helps big datasets)
    scores = {m: predict_fn(u, m) for m in candidates}
    top_k  = sorted(scores, key=scores.get, reverse=True)[:K]

    # 3) per-user metrics
    hits   = len(set(top_k) & true_items)
    P_u    = hits / K
    R_u    = hits / len(true_items) if true_items else 0
    F1_u   = 2*P_u*R_u/(P_u+R_u) if (P_u+R_u) else 0

    precisions.append(P_u)
    recalls.append(R_u)
    f1s.append(F1_u)

    # ----- gather points for micro PR-curve -----
    retrieved = 0
    relevant  = 0
    for rank, m in enumerate(top_k, 1):
        if m in true_items:
            relevant += 1
        retrieved += 1
        all_P.append(relevant/retrieved)
        all_R.append(relevant/len(true_items))

# ----------- macro averages ------------------------------------
P_at_k = np.mean(precisions)
R_at_k = np.mean(recalls)
F1_at_k = np.mean(f1s)

print(f"{model_name}  @{K}")
print("Precision :", round(P_at_k, 3))
print("Recall    :", round(R_at_k, 3))
print("F1-score  :", round(F1_at_k, 3))

# ----------- plot Precision-Recall curve -----------------------
plt.figure(figsize=(6,4))
plt.plot(all_R, all_P, alpha=0.2, label="users")
# micro average line
idx = np.argsort(all_R)
plt.plot(np.array(all_R)[idx], np.array(all_P)[idx].cummax(),
         color="red", linewidth=2, label="micro-average")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title(f"{model_name} PR-curve")
plt.legend(); plt.show()


In [30]:
# Compute user avg
def compute_user_average(user2movie, user_movie2rating):
    """
    compute_user_average is a function that calculates the average rating for each user in the dataset.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :return: a dictionary containing the average rating for each user where the key is the user and the
    value is the average rating.
    """
    user_avg = {}
    for user, movies in user2movie.items():
        ratings = [user_movie2rating[(user, movie)] for movie in movies]
        user_avg[user] = np.mean(ratings)
    return user_avg

In [31]:
user_avg = compute_user_average(user2movie, user_movie2rating)

In [32]:
similarity_matrix = compute_similarity_matrix(user2movie, user_movie2rating, user_avg, min_common=10)

10000


NameError: name 'calculate_pearson_similarity' is not defined

In [None]:
# fucking picle here

with open('user-user-similarity.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

In [8]:
import math
import numpy as np

def uCF(target_user,
        user_movie2rating,
        movie2user,
        user2movie,
        top_n=10,
        k=25,
        min_common=5,
        sim_cutoff=0.05):
    """
    User-based Collaborative Filtering (Pearson).

    Returns the top-N unseen movies for `target_user`,
    each paired with its predicted rating (rounded to 1 dp).

    Parameters
    ----------
    target_user : int
    user_movie2rating : dict[(user, movie) → rating]
    movie2user        : dict[movie → list[user]]
    user2movie        : dict[user → list[movie]]
    top_n   : int   · how many recommendations to return           (default 10)
    k       : int   · neighbourhood size when scoring each movie   (default 25)
    min_common : int · min overlapping movies to compute similarity (default 5)
    sim_cutoff : float · ignore |sim| < sim_cutoff when scoring     (default .05)

    Returns
    -------
    list[(movie_id, predicted_rating)]
    """ 
    
    if target_user not in user2movie:
        raise ValueError("user_id {0} not found".format(target_user))

    # --- 1. per-user mean ---------------------------------------------------
    global_means = {}
    for u, movies in user2movie.items():
        rs = [user_movie2rating[(u, m)]
              for m in movies if (u, m) in user_movie2rating]
        global_means[u] = np.mean(rs) if rs else 0.0

    mu_u = global_means[target_user]
    movies_u = set(user2movie[target_user])

    # --- 2. similarities ----------------------------------------------------
    sims = {}
    for v, movies_v in user2movie.items():
        if v == target_user:
            continue
        common = movies_u & set(movies_v)
        if len(common) < min_common:
            sims[v] = 0.0
            continue

        mu_v = global_means[v]
        num = den_u = den_v = 0.0
        for m in common:
            d_u = user_movie2rating[(target_user, m)] - mu_u
            d_v = user_movie2rating[(v, m)]          - mu_v
            num   += d_u * d_v
            den_u += d_u**2
            den_v += d_v**2
        sims[v] = num / math.sqrt(den_u * den_v) if den_u and den_v else 0.0

    # --- 3. unseen candidate list ------------------------------------------
    candidates = [m for m in movie2user if m not in movies_u]

    # --- 4. score each candidate -------------------------------------------
    preds = {}
    for m in candidates:
        neigh = []
        for v in movie2user[m]:
            if v == target_user:
                continue
            s = sims.get(v, 0.0)
            if abs(s) >= sim_cutoff:
                neigh.append((s, user_movie2rating[(v, m)], v))

        # keep k strongest neighbours (by |similarity|)
        neigh.sort(key=lambda t: abs(t[0]), reverse=True)
        top = neigh[:k]

        num = den = 0.0
        for s, r, v in top:
            num += s * (r - global_means[v])
            den += abs(s)

        if den < 1e-6:
            pred = mu_u                       # back-off to user mean
        else:
            pred = mu_u + num / den

        # clamp to rating scale
        preds[m] = np.round(max(1.0, min(10.0, pred)), 1)

    # --- 5. return top-N ----------------------------------------------------
    recs = sorted(preds.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return recs


In [9]:
rec = uCF(5, user_movie2rating, movie2user, user2movie,min_common=10)

NameError: name 'user_movie2rating' is not defined

In [6]:
# how many ratings has each user made?
ratings_per_user = rating_df.groupby("user_id").size()

# keep only those in the 40–50 range (inclusive)
target_users = ratings_per_user[ratings_per_user.between(40, 70)]

print("Found {0} users with 40–50 ratings:".format(len(target_users)))
for uid, cnt in target_users.items():
    print("  user_id {0:<7d} → {1} ratings".format(uid, cnt))

# if you need the list for further processing:
users_40_50 = target_users.index.tolist()

Found 38401 users with 40–50 ratings:
  user_id 2       → 51 ratings
  user_id 5       → 43 ratings
  user_id 12      → 60 ratings
  user_id 18      → 41 ratings
  user_id 22      → 63 ratings
  user_id 23      → 46 ratings
  user_id 29      → 58 ratings
  user_id 30      → 57 ratings
  user_id 32      → 44 ratings
  user_id 45      → 52 ratings
  user_id 58      → 55 ratings
  user_id 62      → 68 ratings
  user_id 103     → 48 ratings
  user_id 113     → 43 ratings
  user_id 125     → 65 ratings
  user_id 133     → 64 ratings
  user_id 134     → 61 ratings
  user_id 137     → 40 ratings
  user_id 165     → 47 ratings
  user_id 166     → 70 ratings
  user_id 169     → 68 ratings
  user_id 177     → 50 ratings
  user_id 186     → 54 ratings
  user_id 201     → 70 ratings
  user_id 215     → 67 ratings
  user_id 217     → 69 ratings
  user_id 218     → 43 ratings
  user_id 223     → 45 ratings
  user_id 226     → 64 ratings
  user_id 238     → 47 ratings
  user_id 241     → 60 ratings
 

In [None]:
mapping(rec)

Recommended:  Ashita no Joe 2  with predicted rating: 10.0
Recommended:  Tensai? Dr. Hamax  with predicted rating: 10.0
Recommended:  Tantei Shounen Kageman  with predicted rating: 10.0
Recommended:  Kunimatsu-sama no Otoridai  with predicted rating: 10.0
Recommended:  Pic-lele  with predicted rating: 10.0
Recommended:  Sensou ga Owatta Natsu ni 1945 Karafuto  with predicted rating: 10.0
Recommended:  Maya no Isshou  with predicted rating: 10.0
Recommended:  Koe wo Kikasete  with predicted rating: 10.0
Recommended:  Tanbai Haru, Tookaraji  with predicted rating: 10.0
Recommended:  Okaachan Gomen ne  with predicted rating: 10.0


In [20]:
def mapping(rec):
    # ensure anime_id is int for dictionary lookup
    anime_df['anime_id'] = anime_df['anime_id'].astype(int)

    # build ID → name map
    id2name = {int(r['anime_id']): r['anime_name'] for _, r in anime_df.iterrows()}

    # print them out cleanly with numbering
    for idx, (anime_id, pred) in enumerate(rec, start=1):
        aid  = int(anime_id)
        name = id2name.get(aid, "[Missing name for ID {0}]".format(aid))
        print( "Recommended: " , name , " with predicted rating:" , (np.round(pred,1)) )


##Item-Based CF##

In [4]:
def itemCF(target_user,
           user_movie2rating,
           movie2user,
           user2movie,
           top_n=10,
           k=25,
           min_common=5):
    """
    Item-based CF without pivot tables.

    :param target_user:       user_id to recommend for
    :param user_movie2rating: dict[(user, movie) -> rating]
    :param movie2user:        dict[movie -> list of users who rated it]
    :param user2movie:        dict[user -> list of movies they rated]
    :param top_n:             how many recommendations to return
    :param k:                 neighborhood size (# similar items)
    :param min_common:        min # of common raters to compute sim(i,j)
    :return:                  list of (movie_id, predicted_rating)
    """
    if target_user not in rating_df['user_id'].values:
        raise ValueError("'{0}' not found in dataset.".format(target_user))
  
    # 1) compute each user's mean rating for centering sims
    user_means = {}
    for u, movies in user2movie.items():
        vals = [user_movie2rating[(u, m)] for m in movies
                if (u, m) in user_movie2rating]
        user_means[u] = np.mean(vals) if vals else 0.0

    # 2) compute each item's mean rating (for final bias term)
    item_means = {}
    # build a reverse-user-centric view to accumulate
    sums = {}; counts = {}
    for (u, m), r in user_movie2rating.items():
        sums[m]   = sums.get(m, 0.0) + r
        counts[m] = counts.get(m, 0)   + 1
    for m in sums:
        item_means[m] = sums[m] / counts[m]

    # 3) gather the items the target user has seen
    seen = set(user2movie.get(target_user, []))
    mu_u = user_means.get(target_user, 0.0)

    # 4) build candidate list: any movie they haven't rated
    all_movies = set(movie2user.keys())
    candidates = all_movies - seen

    # 5) predict each candidate by looking at its similarity
    preds = {}
    for i in candidates:
        # find overlap users who rated both i and each j ∈ seen
        sims = []
        for j in seen:
            users_i = set(movie2user.get(i, []))
            users_j = set(movie2user.get(j, []))
            common  = users_i & users_j
            if len(common) < min_common:
                continue

            # compute Pearson sim(i,j)
            num = 0.0; den_i = 0.0; den_j = 0.0
            for u in common:
                r_ui = user_movie2rating[(u, i)] - user_means[u]
                r_uj = user_movie2rating[(u, j)] - user_means[u]
                num   += r_ui * r_uj
                den_i += r_ui**2
                den_j += r_uj**2

            if den_i and den_j:
                sim = num / math.sqrt(den_i * den_j)
                sims.append((j, sim))

        if not sims:
            continue

        # pick top-k neighbors by |sim|
        sims.sort(key=lambda x: abs(x[1]), reverse=True)
        topk = sims[:k]

        # weighted sum of the target user's deviations on those neighbors
        num = 0.0; den = 0.0
        for j, sim in topk:
            r_uj = user_movie2rating[(target_user, j)]
            num += sim * (r_uj - item_means[j])
            den += abs(sim)

        # final prediction = item_i mean + weighted deviation
        if den:
            preds[i] = item_means[i] + num/den
        else:
            preds[i] = item_means[i]

    # 6) return the top-n
    recommendations = sorted(preds.items(),
                             key=lambda x: x[1],
                             reverse=True)[:top_n]
    return recommendations

In [None]:
rec = itemCF(0, user_movie2rating, movie2user, user2movie)
rec1 = itemCF(1004, user_movie2rating, movie2user, user2movie)

In [5]:
rating_df.loc[rating_df['user_id']==0].size

140

In [None]:
mapping(rec)

In [None]:
mapping(rec1)

In [None]:
# ensure anime_id is int for dictionary lookup
anime_df['anime_id'] = anime_df['anime_id'].astype(int)

# build ID → name map
id2name = {int(r['anime_id']): r['anime_name'] for _, r in anime_df.iterrows()}

# print them out cleanly with numbering
for idx, (anime_id, pred) in enumerate(rec, start=1):
    aid  = int(anime_id)
    name = id2name.get(aid, "[Missing name for ID {0}]".format(aid))
    print( "Recommended: " , name , " with predicted rating:" , (np.round(pred,1)) )


In [33]:
# creating test and train datasets
training_percentage = 0.8
training_num_datapoints = int(training_percentage * rating_df.shape[0])
rating_training_df = rating_df.iloc[:training_num_datapoints]
rating_test_df = rating_df.iloc[training_num_datapoints:]

In [34]:
def create_data_dictionaries(ratings_data_df, dataset='training'):
    """
    create_data_dictionaries is a function that creates dictionaries for user-movie interactions and ratings.

    :param ratings_data_df: is a dataFrame containing 'user_id', 'movie_id', and 'rating' columns.
    :param dataset: is the type of dataset ('training' or other). For 'training', user-movie mappings are created; 
    otherwise, they are set as empty dictionaries.
    :return: a tuple containing user2movie (dict) which maps each user_id to a list of movie_ids, movie2user (dict) which maps 
    each movie_id to a list of user_ids, and user_movie2rating (dict) which maps (user_id, movie_id) pairs to their corresponding rating.
    """
    if dataset.lower() == 'training':
        user2movie = ratings_data_df.groupby('user_id')['anime_id'].apply(list).to_dict()
        movie2user = ratings_data_df.groupby('anime_id')['user_id'].apply(list).to_dict()
    else:
        user2movie = {}
        movie2user = {}
    
    user_movie = zip(ratings_data_df['user_id'], ratings_data_df['anime_id'])
    user_movie_rating = zip(user_movie, ratings_data_df['rating'])
    user_movie2rating = dict(user_movie_rating)

    return user2movie, movie2user, user_movie2rating

In [35]:
# creating training and testing dicts
user2movie, movie2user, user_movie2rating = create_data_dictionaries(
    rating_training_df
)

_, _, user_movie2rating_test = create_data_dictionaries(
    rating_test_df, dataset='Test'
)

In [36]:
# Compute user avg
def compute_user_average(user2movie, user_movie2rating):
    """
    compute_user_average is a function that calculates the average rating for each user in the dataset.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :return: a dictionary containing the average rating for each user where the key is the user and the
    value is the average rating.
    """
    user_avg = {}
    for user, movies in user2movie.items():
        ratings = [user_movie2rating[(user, movie)] for movie in movies]
        user_avg[user] = np.mean(ratings)
    return user_avg

In [37]:
def calculate_pearson_similarity(user1, user2, user2movie, user_movie2rating, user_avg, min_common):
    """
    calculate_pearson_similarity is a function that calculates the Pearson similarity between user1 and 
    user2 based on their common rated movies. If the number of common movies is less than min_common, 
    similarity is set to 0.
    :param user1: the id of user1
    :param user2: the id of user2
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param min_common: the required minimum number of common movies between user1 and user2.
    :return: the pearson correlation similarity between user1 and user2.
    """
    user1_movies = set(user2movie[user1])
    user2_movies = set(user2movie[user2])
    common_movies = user1_movies.intersection(user2_movies)
    
    if len(common_movies) < min_common:
        return 0  # not enough common items

    numerator = 0
    denominator_user1 = 0
    denominator_user2 = 0
    for movie in common_movies:
        rating_user1 = user_movie2rating[(user1, movie)]
        rating_user2 = user_movie2rating[(user2, movie)]
        
        user1_deviation = rating_user1 - user_avg[user1]
        user2_deviation = rating_user2 - user_avg[user2]
        
        numerator += user1_deviation * user2_deviation
        denominator_user1 += user1_deviation ** 2
        denominator_user2 += user2_deviation ** 2

    if denominator_user1 == 0 or denominator_user2 == 0:
        return 0
    
    return numerator / (np.sqrt(denominator_user1) * np.sqrt(denominator_user2))

In [38]:
# precomputed
def compute_similarity_matrix(user2movie, user_movie2rating, user_avg, min_common):
    """
    compute_similarity_matrix is a function that precomputes the similarity for each pair of users in 
    the training set and saves the similarity scores in a dictionary.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param min_common: the required minimum number of common movies between a pair of movies to be eligible
    for similarity calculation.
    :return: a nested dictionary where the key is a user and the value is a dictionary of the similarity 
    score between the key user and all the other users.
    """
    similarity_matrix = {}
    all_users = list(user2movie.keys())
    print(len(all_users))
    for i, user1 in enumerate(all_users):
        if user1 not in similarity_matrix:
            similarity_matrix[user1] = {}
    
        if (i + 1) % 100 == 0:
            print('{} users processed'.format(i + 1))

        for j in range(i + 1, len(all_users)):
            user2 = all_users[j]
            pearson_similarity = calculate_pearson_similarity(user1, user2, user2movie, user_movie2rating, user_avg, min_common)
            similarity_matrix[user1][user2] = pearson_similarity
            
            if user2 not in similarity_matrix:
                similarity_matrix[user2] = {}
            similarity_matrix[user2][user1] = pearson_similarity

    return similarity_matrix

In [39]:
user_avg = compute_user_average(user2movie, user_movie2rating)

In [None]:
# with open('user-user-similarity.json', 'rb') as f:
#     similarity_matrix = pickle.load(f)

In [40]:
min_common_value = 5

In [41]:
similarity_matrix = compute_similarity_matrix(user2movie, user_movie2rating, user_avg, min_common_value)

8001
100 users processed
200 users processed
300 users processed
400 users processed
500 users processed
600 users processed
700 users processed
800 users processed
900 users processed
1000 users processed
1100 users processed
1200 users processed
1300 users processed
1400 users processed
1500 users processed
1600 users processed
1700 users processed
1800 users processed
1900 users processed
2000 users processed
2100 users processed
2200 users processed
2300 users processed
2400 users processed
2500 users processed
2600 users processed
2700 users processed
2800 users processed
2900 users processed
3000 users processed
3100 users processed
3200 users processed
3300 users processed
3400 users processed
3500 users processed
3600 users processed
3700 users processed
3800 users processed
3900 users processed
4000 users processed
4100 users processed
4200 users processed
4300 users processed
4400 users processed
4500 users processed
4600 users processed
4700 users processed
4800 users proces

In [None]:
# Pickle
with open('user-user-similarity.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

In [43]:
def predict_rating(user, movie, user2movie, user_movie2rating, user_avg, min_common, k_value):
    """
    predict_rating is a function that predicts the rating user "user" would give to movie "movie".
    :param user: user_id
    :pram movie: movie_id
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param min_common: the required minimum number of common movies between a pair of movies to be eligible
    for similarity calculation.
    :param k_value: the number of nearest neighbors to be considered when predicting the user's rating.
    :return: the predicted rating.
    """
    ## find candidate neighbors who rated the movie
    candidates = [other_user for other_user in movie2user[movie] if other_user != user] 
    
    similarities = []
    for other_user in candidates:
        pearson_similarity = similarity_matrix[user][other_user]
        if pearson_similarity != 0:
            similarities.append((other_user, pearson_similarity))
    
    if not similarities:
        return user_avg[user]
    
    ## sort neighbors by the absolute similarity in descending order and select top k neighbors.
    similarities = sorted(similarities, key=lambda x: abs(x[1]), reverse=True)
    top_neighbors = similarities[:k_value]
    
    numerator = 0
    denominator = 0
    for neighbor, similarity in top_neighbors:
        rating_neighbor = user_movie2rating[(neighbor, movie)]
        numerator += similarity * (rating_neighbor - user_avg[neighbor])
        denominator += abs(similarity)
    
    if denominator == 0:
        return user_avg[user]
    
    predicted_rating = user_avg[user] + numerator / denominator
    return predicted_rating

In [44]:
def evaluate_model_rmse(user2movie, user_movie2rating, user_avg, dataset_rating, similarity_matrix, k_value):
    """
    evaluate_model_rmse is a function that uses RMSE evaluation metric to evaluate the performance of the 
    recommendation system.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param dataset_rating: a dictionary containing a mapping from (user, movie) pair to a rating.
    :similarity_matrix: the precomputed matrix of similarity scores between all pairs of users.
    :param k_value: the number of nearest neighbors to be considered when predicting the user's rating.
    :return: the RMSE score.
    """
    
    squared_errors = []
    for ind, ((user, movie), actual_rating) in enumerate(dataset_rating.items()):
        predicted_rating = predict_rating(user, movie, user2movie, user_movie2rating, user_avg, similarity_matrix, k_value)
        squared_errors.append((actual_rating - predicted_rating) ** 2)
        if (ind + 1) % 100 == 0:
            print('{} ratings processed'.format(ind + 1))
    mse = np.mean(squared_errors)
    rmse = np.sqrt(mse)
    return rmse

In [45]:
k_value = 10

In [46]:
train_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating, min_common_value, k_value)

100 ratings processed
200 ratings processed
300 ratings processed
400 ratings processed
500 ratings processed
600 ratings processed
700 ratings processed
800 ratings processed
900 ratings processed
1000 ratings processed
1100 ratings processed
1200 ratings processed
1300 ratings processed
1400 ratings processed
1500 ratings processed
1600 ratings processed
1700 ratings processed
1800 ratings processed
1900 ratings processed
2000 ratings processed
2100 ratings processed
2200 ratings processed
2300 ratings processed
2400 ratings processed
2500 ratings processed
2600 ratings processed
2700 ratings processed
2800 ratings processed
2900 ratings processed
3000 ratings processed
3100 ratings processed
3200 ratings processed
3300 ratings processed
3400 ratings processed
3500 ratings processed
3600 ratings processed
3700 ratings processed
3800 ratings processed
3900 ratings processed
4000 ratings processed
4100 ratings processed
4200 ratings processed
4300 ratings processed
4400 ratings process

In [57]:
test_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating_test, min_common_value, k_value)

100 ratings processed


KeyError: 282345

In [None]:
print('k: {}, min_common: {}, Train RMSE: {}, Test RMSE: {}'.format(k_value, min_common_value, np.round(train_rmse, 3), np.round(test_rmse, 3)))

In [48]:
print(train_rmse)

0.8145694884348088


In [None]:
# k_values = [5, 10, 20, 30, 40, 50]

# results = {}
# for k_value in k_values:
#     train_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating, min_common_value, k_value)
#     test_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating_test, min_common_value, k_value)
#     results[(k_value, min_common_value)] = [train_rmse, test_rmse]




User with most ratings: 189037 (15455 ratings)
Average ratings per user: 185.88


<h1>ITEM eval<h1>

In [58]:
# Compute item avg
def compute_item_average(movie2user, usermovie2rating):
    """
    compute_user_average is a function that calculates the average rating for each user in the dataset.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :return: a dictionary containing the average rating for each item where the key is the item and the
    value is the average rating.
    """
    # Calculate the mean rating for each movie, which serves as a baseline for predictions
    movie_mean = {}
    for movie, users in movie2user.items():
        ratings = []
        for user in users:
            # Retrieve the rating for the (user, movie) pair if it exists
            if (user, movie) in usermovie2rating:
                ratings.append(usermovie2rating[(user, movie)])
        # Compute the average rating for the movie; default to 0 if no ratings exist
        if ratings:
            movie_mean[movie] = sum(ratings) / len(ratings)
        else:
            movie_mean[movie] = 0
    return movie_mean





In [59]:
def adjusted_cosine_similarity(movie_i, movie_j, min_common=5):
    '''
    Calculate the similarity between two movies using an adjusted cosine similarity
    that accounts for the user's average rating.
    :param movie_i: ID of the first movie.
    :param movie_j: ID of the second movie.
    :param min_common: Minimum number of common ratings required to consider similarity.
    :return: The similarity score between movie_i and movie_j.
    '''
    # Get the set of users who rated each movie
    users_i = set(movie2user[movie_i])
    users_j = set(movie2user[movie_j])
    # Identify users who have rated both movies
    common_users = users_i.intersection(users_j)

    # If there are not enough common users, return 0 similarity
    if len(common_users) < min_common:
        return 0

    numerator = 0
    denominator_i = 0
    denominator_j = 0
    # Calculate the adjusted rating differences for common users
    for user in common_users:
        rating_i = user_movie2rating[(user, movie_i)]
        rating_j = user_movie2rating[(user, movie_j)]
        # Subtract the user's mean rating to adjust for individual bias
        difference_i = rating_i - user_avg[user]
        difference_j = rating_j - user_avg[user]
        numerator += difference_i * difference_j
        denominator_i += difference_i ** 2
        denominator_j += difference_j ** 2

    # Avoid division by zero in case of no variation
    if denominator_i == 0 or denominator_j == 0:
        return 0

    # Return the cosine similarity of the adjusted ratings
    return numerator / (math.sqrt(denominator_i) * math.sqrt(denominator_j))

In [60]:
def compute_similarity_matrix_item(min_common=5):
    '''
    Compute a full similarity matrix between all pairs of movies using the adjusted cosine similarity.
    :param min_common: Minimum number of common ratings for similarity computation.
    :return: A dictionary where each key is a movie and the value is another dictionary
             mapping other movies to their similarity scores.
    '''
    # List all movie IDs from the dataset
    movies = list(movie2user.keys())

    similarity_matrix = {}
    # Iterate through each movie to compute similarities
    for ind, movie_i in enumerate(movies):
        similarity_matrix[movie_i] = {}

        # Compare with all movies starting from the current movie to avoid redundancy
        for movie_j in movies[ind:]:
            if movie_i == movie_j:
                similarity_score = 1  # A movie is perfectly similar to itself
            else:
                similarity_score = adjusted_cosine_similarity(movie_i, movie_j, min_common)
            # Store the similarity score in the matrix
            similarity_matrix[movie_i][movie_j] = similarity_score

            # Ensure symmetry by storing the score in both directions
            if movie_i != movie_j:
                if movie_j not in similarity_matrix:
                    similarity_matrix[movie_j] = {}
                similarity_matrix[movie_j][movie_i] = similarity_score

        # Print progress every 100 movies to track computation time
        if ind % 100 == 0 and ind != 0:
            print('{} movies processed'.format(ind + 1))
    return similarity_matrix


In [61]:
movie_mean = compute_item_average(movie2user, user_movie2rating)

In [62]:
min_common_value=5

In [63]:
similarity_matrix = compute_similarity_matrix_item(min_common_value)

101 movies processed
201 movies processed
301 movies processed
401 movies processed
501 movies processed
601 movies processed
701 movies processed
801 movies processed
901 movies processed
1001 movies processed
1101 movies processed
1201 movies processed
1301 movies processed
1401 movies processed
1501 movies processed
1601 movies processed
1701 movies processed
1801 movies processed
1901 movies processed
2001 movies processed
2101 movies processed
2201 movies processed
2301 movies processed
2401 movies processed
2501 movies processed
2601 movies processed
2701 movies processed
2801 movies processed
2901 movies processed
3001 movies processed
3101 movies processed
3201 movies processed
3301 movies processed
3401 movies processed
3501 movies processed
3601 movies processed
3701 movies processed
3801 movies processed
3901 movies processed
4001 movies processed
4101 movies processed
4201 movies processed
4301 movies processed
4401 movies processed
4501 movies processed
4601 movies process

In [64]:
with open('item_similarity_matrix_.pkl', 'wb') as saved_matrix:
    pickle.dump(similarity_matrix, saved_matrix)

In [65]:
def predict_rating_item(user, target_movie, k=25):
    '''
    Predict the rating a user would give to a target movie using a neighborhood-based approach.
    :param user: The user for whom the rating is to be predicted.
    :param target_movie: The movie for which the rating is predicted.
    :param k: The number of top similar movies (neighbors) to use in the prediction.
    :return: The predicted rating.
    '''
    # Retrieve the list of movies that the user has already rated
    rated_movies = user2movie[user]

    similarities = []
    # For each movie the user has rated, find its similarity with the target movie
    for movie in rated_movies:
        # Look up the precomputed similarity in both possible dictionary directions
        if (target_movie in similarity_matrix) and (movie in similarity_matrix[target_movie]):
            similarity = similarity_matrix[target_movie][movie]
        elif (movie in similarity_matrix) and (target_movie in similarity_matrix[movie]):
            similarity = similarity_matrix[movie][target_movie]
        else:
            similarity = 0
        similarities.append((movie, similarity))

    # Sort the movies by the absolute similarity score in descending order
    similarities.sort(key=lambda x: abs(x[1]), reverse=True)
    # Select the top k neighbors
    neighbors = similarities[:k]

    numerator = 0
    denominator = 0
    # Calculate the weighted deviation for each neighbor movie
    for movie, similarity in neighbors:
        rating = user_movie2rating[(user, movie)]
        base = movie_mean[movie]
        deviation = rating - base
        numerator += similarity * deviation
        denominator += abs(similarity)

    # If no similar movies are found, default to the target movie's mean rating
    if denominator == 0:
        return movie_mean[target_movie]

    # Predict the rating by adjusting the target movie's mean by the weighted deviation
    prediction = movie_mean[target_movie] + numerator / denominator
    return prediction

In [66]:
def evaluate_model_rmse_item(dataset_rating, k_value):
    """
    evaluate_model_rmse is a function that uses RMSE evaluation metric to evaluate the performance of the 
    recommendation system.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param dataset_rating: a dictionary containing a mapping from (user, movie) pair to a rating.
    :similarity_matrix: the precomputed matrix of similarity scores between all pairs of users.
    :param k_value: the number of nearest neighbors to be considered when predicting the user's rating.
    :return: the RMSE score.
    """
    
    squared_errors = []
    for ind, ((user, movie), actual_rating) in enumerate(dataset_rating.items()):
        predicted_rating = predict_rating_item(user, movie, k_value)
        squared_errors.append((actual_rating - predicted_rating) ** 2)
        if (ind + 1) % 100 == 0:
            print('{} ratings processed'.format(ind + 1))
    mse = np.mean(squared_errors)
    rmse = np.sqrt(mse)
    return rmse

In [69]:
k_value = 10
train_rmse = evaluate_model_rmse_item(user_movie2rating, k_value)

100 ratings processed
200 ratings processed
300 ratings processed
400 ratings processed
500 ratings processed
600 ratings processed
700 ratings processed
800 ratings processed
900 ratings processed
1000 ratings processed
1100 ratings processed
1200 ratings processed
1300 ratings processed
1400 ratings processed
1500 ratings processed
1600 ratings processed
1700 ratings processed
1800 ratings processed
1900 ratings processed
2000 ratings processed
2100 ratings processed
2200 ratings processed
2300 ratings processed
2400 ratings processed
2500 ratings processed
2600 ratings processed
2700 ratings processed
2800 ratings processed
2900 ratings processed
3000 ratings processed
3100 ratings processed
3200 ratings processed
3300 ratings processed
3400 ratings processed
3500 ratings processed
3600 ratings processed
3700 ratings processed
3800 ratings processed
3900 ratings processed
4000 ratings processed
4100 ratings processed
4200 ratings processed
4300 ratings processed
4400 ratings process

In [68]:
test_rmse = evaluate_model_rmse_item(user_movie2rating_test, k_value)


100 ratings processed


KeyError: 282345

In [None]:
print('k: {}, min_common: {}, Train RMSE: {}, Test RMSE: {}'.format(k_value, min_common_value, np.round(train_rmse, 3), np.round(test_rmse, 3)))

In [70]:
train_rmse

1.0752301163574687

In [None]:
def HybridContent(title, anime_df, top_n=10, alpha=0.7):
    """
    Blend TF-IDF and Jaccard similarities for title.

    Both recommend_by_jaccard() and tf_id_rec() return:
        {"top": pd.Series}
    where Series.index = anime_name and Series.values = similarity.
    
    Steps
    -----
    1.   fetch full similarity vectors from both models
    2.   min-max scale Jaccard scores into [0, 1]
    3.   add TF score + scaled-Jaccard score
    4.   sort and return the top top_n items, with a transparency note
    """
    # -------- 1. get raw similarity Series --------
    tf_ser  = tf_id_rec(title, anime_df, top_n=len(anime_df))["top"]
    jac_ser = recommend_by_jaccard(title, anime_df, top_n=len(anime_df))["top"]

    # -------- 2. scale Jaccard to [0,1] -----------
    if jac_ser.max() != jac_ser.min():                      
        jac_scaled = (jac_ser - jac_ser.min()) / (jac_ser.max() - jac_ser.min())
    else:
        jac_scaled = jac_ser.copy()

    # -------- 3. blend ----------------------------
    union_idxs = (set(tf_ser.index) | set(jac_scaled.index)) - {title}
    hybrid     = {}
    for a in union_idxs:
        hybrid[a] = alpha * tf_ser.get(a, 0.0) + (1-alpha) * jac_scaled.get(a, 0.0)

    # -------- 4. sort & return --------------------
    top = (pd.Series(hybrid)
           .sort_values(ascending=False)
           .head(top_n))

    # transparency: annotate each recommendation
    top.index = [
        "{}  (because you watched “{}”)".format(rec, title)
        for rec in top.index
    ]

    return {"top": top}

In [10]:
def HybridCF(target_user,
             user_movie2rating,
             movie2user,
             user2movie,
             rating_df,
             item_similarity,
             top_n=10,
             k_u=25,
             k_i=25,
             alpha=0.6,
             min_common=5):
    """
    HybridCF = α·(User-based CF) + (1-alpha)·(Item-based CF)

    Both base recommenders already exist:

        uCF()    → list[(movie_id, pred_rating)]
        itemCF() → list[(movie_id, pred_rating)], item_similarity

    Parameters
    ----------
    target_user   : int
    user_movie2rating, movie2user, user2movie : dicts (for uCF)
    rating_df     : pd.DataFrame  (for itemCF)
    item_similarity : pd.DataFrame  pre-computed item-item sims
    top_n         : int  number of movies to return
    k_u, k_i      : int  neighbourhood sizes for uCF / itemCF
    alpha         : float  weight on the *user* component   (0-1)
    min_common    : int  min overlap for Pearson in uCF

    Returns
    -------
    list[(movie_id, hybrid_pred_rating)]  length = top_n
    """

    # -------- 1. raw predictions from each model -----------------
    # ask for a large list so we get scores for many candidates
    u_preds = uCF(target_user,
                  user_movie2rating, movie2user, user2movie,
                  top_n=len(movie2user), k=k_u, min_common=min_common)

    i_preds, _ = itemCF(target_user,
                        rating_df,
                        top_n=len(movie2user),
                        k=k_i,
                        precomputed=item_similarity)

    # convert to dict for fast look-ups
    u_dict = dict(u_preds)     # movie → score
    i_dict = dict(i_preds)

    # -------- 2. weighted combination ----------------------------
    union_movies = set(u_dict) | set(i_dict)

    hybrid_scores = {}
    for m in union_movies:
        score_u = u_dict.get(m)
        score_i = i_dict.get(m)

        if score_u is not None and score_i is not None:
            hybrid = alpha * score_u + (1 - alpha) * score_i
        elif score_u is not None:           # only user-CF predicted
            hybrid = score_u
        else:                               # only item-CF predicted
            hybrid = score_i

        hybrid_scores[m] = round(hybrid, 2)

    # -------- 3. sort and return top-N ---------------------------
    recs = sorted(hybrid_scores.items(),
                  key=lambda x: x[1],
                  reverse=True)[:top_n]

    return recs

In [None]:
HybridCF(36,user_movie2rating,movie2user,user2movie,rating_df,item_similarity)