In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity between vectors
from scipy.spatial.distance import pdist, squareform  # For pairwise distance computations and converting to a square matrix
import pickle
import math

# Disable pandas' display truncation: show every column and every row of a DataFrame.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [None]:
# Load the cleaned DataFrames from pickle files (presumably written by EDA.ipynb - confirm there).
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
# NOTE(review): the next cell rebinds anime_df and rating_df from CSV files, so only one
# of the two loading cells is needed for a clean Restart & Run All.

# Load anime_df_clean
with open('anime_df_clean.pkl', 'rb') as f:
    anime_df = pickle.load(f)

# Load rating_df_clean
with open('rating_df_clean.pkl', 'rb') as f:
    rating_df = pickle.load(f)

In [3]:
# Read the anime metadata and the user ratings from CSV files in the working directory.
anime_df = pd.read_csv('anime_meta.csv')
rating_df = pd.read_csv('ratings.csv')

In [4]:
# Preview the first rows of the anime metadata.
anime_df.head()

Unnamed: 0,anime_name,anime_id,Genres,Score,Synopsis
0,Cowboy Bebop,1,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",8.78,"In the year 2071, humanity has colonized sever..."
1,Cowboy Bebop: Tengoku no Tobira,5,"Action, Drama, Mystery, Sci-Fi, Space",8.39,"other day, another bounty—such is the life of ..."
2,Trigun,6,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",8.24,"Vash the Stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,7,"Action, Mystery, Police, Supernatural, Drama, ...",7.27,ches are individuals with special powers like ...
4,Bouken Ou Beet,8,"Adventure, Fantasy, Shounen, Supernatural",6.98,It is the dark century and the people are suff...


In [5]:
# Preview the first rows of the ratings table.
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
0,0,430,9,Fullmetal Alchemist: The Conqueror of Shamballa
1,0,1004,5,Kanojo to Kanojo no Neko
2,0,3010,7,Kaiketsu Zorro
3,0,570,7,Jin-Rou
4,0,2762,9,Igano Kabamaru


In [10]:
def recommend_by_jaccard(
    title,
    anime_df,
    top_n=10,
    precomputed=None  # None, condensed distance array, or square similarity DataFrame
):
    """
    Recommend the anime whose genre sets are most similar to that of `title`,
    using Jaccard similarity |A & B| / |A | B| over the individual genres.

    The 'Genres' column holds comma-separated genre names; each value is split
    into its individual genres before comparison, so two titles that share some
    (but not all) genres get a similarity strictly between 0 and 1.

    :param title: str
        The anime title (anime_name) to base recommendations on.

    :param anime_df: pd.DataFrame
        DataFrame containing at least the 'anime_name' and 'Genres' columns.

    :param top_n: int
        Number of most similar titles to return (the query title is excluded).

    :param precomputed: None, array-like or pd.DataFrame
        Optional precomputed data to avoid recalculating similarities.
        - array-like: condensed Jaccard *distance* vector (as returned by
          scipy's pdist) whose rows follow the index of
          pd.crosstab(anime_df['anime_name'], anime_df['Genres']).
        - pd.DataFrame: square *similarity* matrix whose index contains
          `title` and whose columns are the labels to rank.

    :return: dict
        { "top": pd.Series } - similarity scores of the top-N most similar
        anime, sorted from most to least similar.

    :raises ValueError: if `title` is not in the dataset or not in the
        similarity data.
    """
    feature_col = 'Genres'

    # Check if title exists in the dataset
    if title not in anime_df['anime_name'].values:
        raise ValueError(f"'{title}' not found in dataset.")

    def build_indicator_table(df, col):
        # One boolean column per individual genre, one row per anime_name.
        pairs = df[['anime_name', col]].dropna()
        pairs = (
            pairs.assign(**{col: pairs[col].astype(str).str.split(',')})
            .explode(col)
            .reset_index(drop=True)
        )
        pairs[col] = pairs[col].str.strip()
        pairs = pairs[pairs[col] != '']
        return pd.crosstab(pairs['anime_name'], pairs[col]) > 0

    if isinstance(precomputed, pd.DataFrame):
        # Square similarity matrix supplied by the caller.
        if title not in precomputed.index:
            raise ValueError(f"'{title}' not found in similarity data.")
        scores = precomputed.loc[title]
    elif precomputed is not None:
        # Condensed distance vector: label it exactly as the original
        # crosstab-based layout did so existing arrays stay aligned.
        labels = pd.crosstab(anime_df['anime_name'], anime_df[feature_col]).index
        similarity = 1 - squareform(np.asarray(precomputed))
        sim_df = pd.DataFrame(similarity, index=labels, columns=labels)
        if title not in sim_df.index:
            raise ValueError(f"'{title}' not found in similarity data.")
        scores = sim_df.loc[title]
    else:
        table = build_indicator_table(anime_df, feature_col)
        if title not in table.index:
            raise ValueError(f"'{title}' not found in similarity data.")
        member = table.to_numpy(dtype=bool)
        target = member[table.index.get_loc(title)]
        # Only the query row is needed, so compare it against every row
        # instead of building the full pairwise matrix.
        shared = (member & target).sum(axis=1)
        combined = (member | target).sum(axis=1)  # >= 1: target has a genre
        scores = pd.Series(shared / combined, index=table.index, name=title)

    # Drop the query title explicitly: other titles can tie with it at 1.0,
    # so it is not guaranteed to be the first entry after sorting.
    top = (
        scores.drop(title, errors='ignore')
        .sort_values(ascending=False, kind='stable')
        .head(top_n)
    )
    return {"top": top}


In [None]:
#If precomputing ALL Similarities
def parse_genres(s):
    """Split a comma-separated genre string into a set of stripped, non-empty names.

    Non-string input (e.g. NaN for a missing Genres value) yields an empty set
    instead of raising AttributeError.
    """
    if not isinstance(s, str):
        return set()
    return {g.strip() for g in s.split(',') if g.strip()}

anime_df['genre_set'] = anime_df['Genres'].apply(parse_genres)

# 1) Prepare lists of IDs (or names) and their genre sets
#    Here we key on anime_id, but anime_name could be used the same way
ids = anime_df['anime_id'].tolist()
genre_sets = anime_df['genre_set'].tolist()

# 2) Encode the genre sets as a 0/1 membership matrix (rows: anime, columns: genres)
genre_vocab = sorted(set().union(*genre_sets))
genre_col = {g: col for col, g in enumerate(genre_vocab)}
membership = np.zeros((len(ids), len(genre_vocab)), dtype=np.float64)
for row, genres in enumerate(genre_sets):
    membership[row, [genre_col[g] for g in genres]] = 1.0

# 3) Compute pairwise Jaccard similarity without a Python double loop:
#    |A & B| is a matrix product, and |A | B| = |A| + |B| - |A & B|.
intersection = membership @ membership.T
set_sizes = membership.sum(axis=1)
union = set_sizes[:, None] + set_sizes[None, :] - intersection
# Divide in place; pairs whose union is empty keep 0.0 (their intersection is 0).
np.divide(intersection, union, out=intersection, where=union > 0)
jaccard_df = pd.DataFrame(intersection, index=ids, columns=ids, dtype=float)

# 4) Save the matrix to a pickle file
output_path = 'jaccard_by_genre.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(jaccard_df, f)

print("Saved Jaccard similarity matrix by genre to {}".format(output_path))

In [None]:
# Reload the genre Jaccard similarity matrix from 'jaccard_by_genre.pkl'.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('jaccard_by_genre.pkl', 'rb') as f:
    jaccard_df_df = pickle.load(f)

In [None]:
# Example: genre-based recommendations for "Grand Blue" (default top_n, nothing precomputed).
recommend_by_jaccard("Grand Blue", anime_df)

In [5]:
def tf_id_rec(title, anime_df, top_n, precomputed=None):
    """
    Recommend the anime whose synopses are most similar to that of `title`,
    using cosine similarity between TF-IDF vectors of the Synopsis field.

    :param title: str
        The anime title (anime_name) to base recommendations on.

    :param anime_df: pd.DataFrame
        DataFrame containing at least 'anime_name' and 'Synopsis' columns.
        Rows with a missing Synopsis are ignored.

    :param top_n: int
        The number of top similar anime to return (excluding the anime itself).

    :param precomputed: array-like or None
        Optional precomputed square cosine-similarity matrix. Its rows and
        columns must follow the rows of `anime_df` that have a non-missing
        Synopsis, in order. If provided, no TF-IDF model is fitted.

    :return: dict
        { "top": pd.Series } - similarity scores of the top-N most similar
        anime, sorted from most to least similar.

    :raises ValueError: if `title` is not in the dataset, has no synopsis,
        or `precomputed` does not match the number of usable rows.
    """
    # 1) verify the given title actually exists in the dataset
    if title not in anime_df['anime_name'].values:
        raise ValueError("'{0}' not found in dataset.".format(title))

    # 2) select only the columns we need and drop any rows where Synopsis is missing
    df_content = anime_df[['anime_name', 'Synopsis']].dropna(subset=['Synopsis'])
    names = pd.Index(df_content['anime_name'], name='anime_name')

    # The title may exist in anime_df but have been dropped for lacking a synopsis.
    is_target = np.asarray(names == title)
    if not is_target.any():
        raise ValueError("'{0}' has no synopsis in dataset.".format(title))
    # If the title occurs more than once, the first occurrence is the query.
    target_pos = int(np.flatnonzero(is_target)[0])

    if precomputed is not None:
        # 3a) reuse the supplied similarity matrix; only the query row is needed
        sim_matrix = np.asarray(precomputed)
        if sim_matrix.ndim != 2 or sim_matrix.shape[0] != len(names) \
                or sim_matrix.shape[1] != len(names):
            raise ValueError(
                "precomputed must be a square matrix with one row per anime "
                "that has a synopsis."
            )
        scores = sim_matrix[target_pos]
    else:
        # 3b) build the TF-IDF matrix over all synopses and compare the query
        #     row against every row, keeping the matrix sparse throughout
        vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english')
        tfidf_mat = vectorizer.fit_transform(df_content['Synopsis'])
        scores = cosine_similarity(tfidf_mat[target_pos], tfidf_mat)[0]

    # 4) exclude every row carrying the query title, then rank the rest
    result_series = pd.Series(np.asarray(scores)[~is_target], index=names[~is_target])

    # 5) pick the top_n highest-scoring titles
    top_similar = result_series.sort_values(ascending=False).iloc[:top_n]

    return {"top": top_similar}


In [7]:
# Example: the 10 titles whose synopses are most similar to "Grand Blue" (TF-IDF fitted on the fly).
tf_id_rec("Grand Blue", anime_df, 10, precomputed=None)

{'top': anime_name
 I''s Pure                                             0.294371
 Ojisan to Marshmallow                                 0.280287
 Cowboy Bebop: Ein no Natsuyasumi                      0.163125
 Charlotte: Tsuyoimono-tachi                           0.153475
 Ojisan to Marshmallow: Hige-san to Yume Mashmallow    0.126031
 I''s                                                  0.115305
 Akubi wo Suru ni wa Wake ga Aru                       0.114072
 Dive!!                                                0.107146
 Amanchu!                                              0.105728
 B.B. Fish                                             0.105271
 dtype: float64}

<h1>User-Based CF</h1>

In [4]:
# Lookup tables used by the collaborative-filtering functions below.
# ("movie" in these names refers to an anime_id.)
# user2movie: user_id -> list of anime_ids that user rated
user2movie = rating_df.groupby('user_id')['anime_id'].apply(list).to_dict()
# movie2user: anime_id -> list of user_ids who rated it
movie2user = rating_df.groupby('anime_id')['user_id'].apply(list).to_dict()
# user_movie2rating: (user_id, anime_id) -> rating; if a pair occurs more than once, the last row wins
user_movie = zip(rating_df['user_id'], rating_df['anime_id'])
user_movie_rating = zip(user_movie, rating_df['rating'])
# NOTE: the two zip iterators above are consumed by dict() and are empty afterwards.
user_movie2rating = dict(user_movie_rating)

In [None]:
import math
import numpy as np

def uCF(target_user, user_movie2rating, movie2user, user2movie,
        top_n=10, k=25, min_common=5):
    """
    uCF (User-based Collaborative Filtering) generates top-N movie recommendations
    for `target_user`, returning each recommended movie alongside its predicted rating.

    Similarity between two users is the cosine of their mean-centred ratings over
    the movies both have rated (each user's mean is taken over all of their
    ratings). A candidate's prediction is the target user's mean plus the
    similarity-weighted average deviation of its k most similar raters.
    Predictions are not clipped to the rating scale.

    :param target_user: int
        The ID of the user to recommend for.
    :param user_movie2rating: dict[(user, movie) -> rating]
    :param movie2user: dict[movie -> list of users who rated it]
    :param user2movie: dict[user -> list of movies they've rated]
    :param top_n: int
        How many items to return (default 10).
    :param k: int
        How many similar users to consider when scoring each candidate (default 25).
    :param min_common: int
        Minimum number of co-rated movies to compute a valid similarity (default 5).

    :return: list of (movie_id, predicted_rating), best first
    :raises ValueError: if `target_user` is not a key of `user2movie`.
    """
    # Validate against the data actually passed in, not a notebook global.
    if target_user not in user2movie:
        raise ValueError("'{0}' not found in dataset.".format(target_user))

    # 1) per-user mean rating, computed on demand and cached
    mean_cache = {}

    def mean_rating(user):
        if user not in mean_cache:
            ratings = [
                user_movie2rating[(user, m)]
                for m in user2movie.get(user, [])
                if (user, m) in user_movie2rating
            ]
            mean_cache[user] = np.mean(ratings) if ratings else 0.0
        return mean_cache[user]

    # 2) build similarity scores between target_user and every other user
    mu_t = mean_rating(target_user)
    movies_t = set(user2movie[target_user])
    sims = {}

    for other, other_movies in user2movie.items():
        if other == target_user:
            continue
        common = movies_t.intersection(other_movies)
        if len(common) < min_common:
            # too little overlap: treated as similarity 0 (not stored)
            continue

        mu_o = mean_rating(other)
        num = 0.0
        dt = 0.0
        do = 0.0
        for m in common:
            d_t = user_movie2rating[(target_user, m)] - mu_t
            d_o = user_movie2rating[(other, m)] - mu_o
            num += d_t * d_o
            dt += d_t ** 2
            do += d_o ** 2

        # a user with no variance on the common movies has undefined similarity
        if dt and do:
            sims[other] = num / math.sqrt(dt * do)

    # 3) collect candidate movies (those target_user hasn't rated)
    candidates = [m for m in movie2user if m not in movies_t]

    # 4) predict rating for each candidate
    preds = {}
    for m in candidates:
        neighbors = []
        for u in movie2user[m]:
            if u == target_user:
                continue
            sim = sims.get(u, 0.0)
            if sim != 0:
                neighbors.append((sim, user_movie2rating[(u, m)], u))

        # pick top-k similar users
        neighbors.sort(key=lambda x: x[0], reverse=True)
        top_neighbors = neighbors[:k]

        num = 0.0
        den = 0.0
        for sim, r, u in top_neighbors:
            num += sim * (r - mean_rating(u))
            den += abs(sim)

        # with no usable neighbours, fall back to the target user's mean
        preds[m] = (mu_t + num / den) if den else mu_t

    # 5) sort and return top-N with predicted ratings
    recommendations = sorted(
        preds.items(),
        key=lambda x: x[1],
        reverse=True
    )[:top_n]

    return recommendations


In [9]:
# Example: user-based CF recommendations for user 0 with the default top_n, k and min_common.
rec = uCF(0, user_movie2rating, movie2user, user2movie)

In [None]:
# Raw result: list of (anime_id, predicted_rating) tuples.
print(rec)

In [None]:
# Map anime_id -> anime_name so recommendations can be shown by title.
# The ids are cast to int so they compare equal to the ids returned by uCF.
anime_df['anime_id'] = anime_df['anime_id'].astype(int)
id2name = dict(zip(anime_df['anime_id'], anime_df['anime_name']))

# Print each recommended title together with its predicted rating
# (the recorded output below shows "name: score", so print both).
for anime_id, score in rec:
    name = id2name.get(anime_id, f"[Missing name for ID {anime_id}]")
    print(f"{name}: {score:.1f}")


Oz no Mahoutsukai no Koutsuu Anzen no Tabi: 10.2
Fullmetal Alchemist: Brotherhood: 9.9
Niji no Kakehashi: 9.8
Neko wa Ikite Iru: 9.7
Nana Moon: 9.7
Nyanpara no Nakama-tachi: 9.7
Nijiiro no Fushigina Ishi: 9.7
Ginga Eiyuu Densetsu: 9.6
Gintama: 9.6
Uchuu Kyoudai: 9.6


## Item-Based CF

In [None]:
def itemCF(target_user,
           user_movie2rating,
           movie2user,
           user2movie,
           top_n=10,
           k=25,
           min_common=5):
    """
    Item-based CF without pivot tables.

    Similarity between two items is the cosine of their user-mean-centred
    ratings over the users who rated both (adjusted cosine). A candidate's
    prediction is its own mean rating plus the similarity-weighted average of
    the target user's deviations (rating minus item mean) on the k most
    similar items they have rated. Predictions are not clipped to the rating
    scale; candidates with no usable similar item are left out.

    :param target_user:       user_id to recommend for
    :param user_movie2rating: dict[(user, movie) -> rating]
    :param movie2user:        dict[movie -> list of users who rated it]
    :param user2movie:        dict[user -> list of movies they rated]
    :param top_n:             how many recommendations to return
    :param k:                 neighborhood size (# similar items)
    :param min_common:        min # of common raters to compute sim(i,j)
    :return:                  list of (movie_id, predicted_rating), best first
    :raises ValueError:       if `target_user` is not a key of `user2movie`
    """
    # Validate against the data actually passed in, not a notebook global.
    if target_user not in user2movie:
        raise ValueError("'{0}' not found in dataset.".format(target_user))

    # 1) per-user mean rating (for centering sims), computed on demand and cached
    mean_cache = {}

    def user_mean(u):
        if u not in mean_cache:
            vals = [user_movie2rating[(u, m)] for m in user2movie.get(u, [])
                    if (u, m) in user_movie2rating]
            mean_cache[u] = np.mean(vals) if vals else 0.0
        return mean_cache[u]

    # 2) compute each item's mean rating (for final bias term)
    sums = {}
    counts = {}
    for (u, m), r in user_movie2rating.items():
        sums[m] = sums.get(m, 0.0) + r
        counts[m] = counts.get(m, 0) + 1
    item_means = {m: sums[m] / counts[m] for m in sums}

    # 3) gather the items the target user has seen, with their rater sets
    #    built once instead of once per candidate
    seen = set(user2movie[target_user])
    raters_of_seen = {j: set(movie2user.get(j, [])) for j in seen}

    # 4) build candidate list: any movie they haven't rated
    candidates = set(movie2user.keys()) - seen

    # 5) predict each candidate by looking at its similarity to the seen items
    preds = {}
    for i in candidates:
        users_i = set(movie2user.get(i, []))
        sims = []
        for j in seen:
            common = users_i & raters_of_seen[j]
            if len(common) < min_common:
                continue

            # adjusted-cosine sim(i, j) over the common raters
            num = 0.0
            den_i = 0.0
            den_j = 0.0
            for u in common:
                mu = user_mean(u)
                r_ui = user_movie2rating[(u, i)] - mu
                r_uj = user_movie2rating[(u, j)] - mu
                num += r_ui * r_uj
                den_i += r_ui ** 2
                den_j += r_uj ** 2

            if den_i and den_j:
                sims.append((j, num / math.sqrt(den_i * den_j)))

        if not sims:
            continue

        # pick top-k neighbors by |sim|
        sims.sort(key=lambda x: abs(x[1]), reverse=True)
        topk = sims[:k]

        # weighted sum of the target user's deviations on those neighbors
        num = 0.0
        den = 0.0
        for j, sim in topk:
            r_uj = user_movie2rating[(target_user, j)]
            num += sim * (r_uj - item_means[j])
            den += abs(sim)

        # final prediction = item_i mean + weighted deviation
        if den:
            preds[i] = item_means[i] + num / den
        else:
            preds[i] = item_means[i]

    # 6) return the top-n
    recommendations = sorted(preds.items(),
                             key=lambda x: x[1],
                             reverse=True)[:top_n]
    return recommendations

In [None]:
# Example: item-based CF recommendations for user 0 with the default top_n, k and min_common.
itemCF(0, user_movie2rating, movie2user, user2movie)

In [None]:
<h2>Evaluation</h2>