In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity between vectors
from scipy.spatial.distance import pdist, squareform  # For pairwise distance computations and converting to a square matrix
import pickle
import math

# Lift pandas' display truncation so every column and every row is rendered.
# NOTE(review): with max_rows=None a bare `df` expression renders the entire frame;
# prefer .head()/.sample() when previewing the large rating tables.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [3]:
# See EDA.ipynb (presumably the notebook that writes these cleaned pickles - confirm).
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.

# Anime metadata
with open('anime_df_clean.pkl', 'rb') as anime_file:
    anime_df = pickle.load(anime_file)

# User ratings
with open('rating_df_clean.pkl', 'rb') as rating_file:
    rating_df = pickle.load(rating_file)

In [4]:
# Preview the first rows of the anime table loaded from anime_df_clean.pkl.
anime_df.head()

Unnamed: 0,anime_name,anime_id,Genres,Score,Synopsis
0,Cowboy Bebop,1,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",8.78,"In the year 2071, humanity has colonized sever..."
1,Cowboy Bebop: Tengoku no Tobira,5,"Action, Drama, Mystery, Sci-Fi, Space",8.39,"other day, another bounty—such is the life of ..."
2,Trigun,6,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",8.24,"Vash the Stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,7,"Action, Mystery, Police, Supernatural, Drama, ...",7.27,ches are individuals with special powers like ...
4,Bouken Ou Beet,8,"Adventure, Fantasy, Shounen, Supernatural",6.98,It is the dark century and the people are suff...


In [5]:
# Preview the first rows of the rating table loaded from rating_df_clean.pkl.
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
0,0,430,9,Fullmetal Alchemist: The Conqueror of Shamballa
1,0,1004,5,Kanojo to Kanojo no Neko
2,0,3010,7,Kaiketsu Zorro
3,0,570,7,Jin-Rou
4,0,2762,9,Igano Kabamaru


In [8]:
def recommend_by_jaccard(
    title,
    anime_df,
    top_n=5,
    precomputed=None  # None, condensed distance array, or similarity DataFrame
):
    """
    recommend_by_jaccard is a function that recommends similar anime using Jaccard similarity
    computed over the individual genres listed in the 'Genres' column.

    Each comma-separated genre becomes one binary feature, so two anime that share
    some (but not all) genres receive a similarity strictly between 0 and 1.

    :param title: str
        The anime title (anime_name) to base recommendations on.

    :param anime_df: pd.DataFrame
        DataFrame containing at least 'anime_name' and 'Genres'. Rows whose
        'Genres' value is missing are left out of the comparison.

    :param top_n: int
        Number of top similar results to return.

    :param precomputed: np.ndarray, pd.DataFrame or None
        Optional precomputed data to avoid recalculating distances:
          * 1-D array: condensed Jaccard *distance* vector (as returned by
            scipy's pdist) whose rows follow the sorted unique anime names of
            the rows that have a 'Genres' value;
          * DataFrame: square *similarity* matrix whose index and columns are
            anime names; it is used as-is.

    :return: dict
        A dictionary with:
            { "top": pd.Series } — the top-N most similar anime and their similarity scores,
            never including the query title itself.

    :raises ValueError: if the title is not in anime_df, is absent from the
        similarity data, or a precomputed array is not 1-D.
    """

    feature_col = 'Genres'

    # Check if title exists in the dataset
    if title not in anime_df['anime_name'].values:
        raise ValueError(f"'{title}' not found in dataset.")

    # Anime without genre information cannot be compared.
    df_jaccard = anime_df[["anime_name", feature_col]].dropna()

    def genre_indicator_matrix(df, col):
        # One boolean column per individual genre, one row per (sorted, unique) anime name.
        normalised = (
            df[col]
            .astype(str)
            .str.replace(r'\s*,\s*', ',', regex=True)  # "A, B" -> "A,B" so tokens carry no padding
            .str.strip()
        )
        flags = normalised.str.get_dummies(sep=',')
        flags.index = pd.Index(df['anime_name'].values, name='anime_name')
        # Duplicate titles are merged by taking the union of their genres.
        return flags.groupby(level=0).max().astype(bool)

    if precomputed is None:
        flags = genre_indicator_matrix(df_jaccard, feature_col)
        similarity = 1 - squareform(pdist(flags.values, metric='jaccard'))
        sim_df = pd.DataFrame(similarity, index=flags.index, columns=flags.index)
    elif isinstance(precomputed, pd.DataFrame):
        sim_df = precomputed
    else:
        condensed = np.asarray(precomputed)
        if condensed.ndim != 1:
            raise ValueError(
                "precomputed array must be a condensed 1-D distance vector; "
                "pass a DataFrame for a square similarity matrix."
            )
        labels = pd.Index(df_jaccard['anime_name'].unique(), name='anime_name').sort_values()
        similarity = 1 - squareform(condensed)
        sim_df = pd.DataFrame(similarity, index=labels, columns=labels)

    if title not in sim_df.index:
        raise ValueError(f"'{title}' not found in similarity data.")

    scores = sim_df.loc[title]
    if isinstance(scores, pd.DataFrame):
        # A precomputed frame may carry duplicate labels; use the first match.
        scores = scores.iloc[0]

    # Remove the query by label instead of assuming it sorts first: other anime
    # can tie with it at similarity 1.0. mergesort keeps tied entries in a
    # deterministic (original) order.
    top = (
        scores
        .drop(labels=title, errors='ignore')
        .sort_values(ascending=False, kind='mergesort')
        .iloc[:top_n]
    )
    return {"top": top}


In [None]:
def parse_genres(s):
    """
    Turn a comma-separated genre string into a set of genre names.

    :param s: the raw 'Genres' value, e.g. "Action, Comedy". Anything that is
        not a string (NaN / None for a missing value) is treated as "no genres".
    :return: set of genre names with surrounding whitespace removed; empty
        tokens are ignored.
    """
    # Missing values arrive from pandas as float NaN or None, which have no .split().
    if not isinstance(s, str):
        return set()
    # split on comma, strip whitespace, ignore empty
    return {g.strip() for g in s.split(',') if g.strip()}

anime_df['genre_set'] = anime_df['Genres'].apply(parse_genres)

# 1) Prepare lists of IDs (or names) and their genre sets
#    Here we key on anime_id, but anime_name could be used similarly
ids = anime_df['anime_id'].tolist()
genre_sets = anime_df['genre_set'].tolist()

# 2) Encode every genre set as a 0/1 row so set sizes become vector operations
all_genres = sorted(set().union(*genre_sets))
genre_position = {genre: col for col, genre in enumerate(all_genres)}
indicator = np.zeros((len(ids), len(all_genres)), dtype=np.int32)
for row, genres in enumerate(genre_sets):
    for genre in genres:
        indicator[row, genre_position[genre]] = 1
set_sizes = indicator.sum(axis=1)

# 3) Compute pairwise Jaccard similarity one full row at a time.
#    |A ∩ B| is a dot product of indicator rows and |A ∪ B| = |A| + |B| - |A ∩ B|.
#    This replaces n*n scalar DataFrame writes with n vectorised row updates while
#    holding only the single n x n float result in memory.
jaccard_values = np.zeros((len(ids), len(ids)), dtype=float)
for row in range(len(ids)):
    intersection = indicator @ indicator[row]
    union = set_sizes + set_sizes[row] - intersection
    # Where both sets are empty the union is 0; those cells keep their 0.0.
    np.divide(intersection, union, out=jaccard_values[row], where=union > 0)

jaccard_df = pd.DataFrame(jaccard_values, index=ids, columns=ids)

# 4) Save the matrix to a pickle file
output_path = 'jaccard_by_genre.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(jaccard_df, f)

print("Saved Jaccard similarity matrix by genre to {}".format(output_path))

In [None]:
# Genre-based recommendations for one title; the ranked Series is under res["top"].
res = recommend_by_jaccard("Grand Blue", anime_df)

In [1]:
def tf_id_rec(title, anime_df, top_n, precomputed=None):
    """
    tf_id_rec is a function that recommends the most similar anime
    using TF-IDF cosine similarity based on the Synopsis field.

    :param title: str
        The anime title (anime_name) to base recommendations on.

    :param anime_df: pd.DataFrame
        DataFrame containing at least 'anime_name' and 'Synopsis' columns.
        Rows with a missing Synopsis are ignored.

    :param top_n: int
        The number of top similar anime to return (excluding the anime itself).

    :param precomputed: np.ndarray, pd.DataFrame or None
        Optional precomputed cosine similarity matrix. Its rows/columns must
        follow the rows of anime_df that have a Synopsis, in their original
        order. If provided, no TF-IDF model is fitted.

    :return: dict
        A dictionary with:
            { "top": pd.Series } — the top-N most similar anime and their similarity scores.

    :raises ValueError: if the title is not in anime_df or has no Synopsis.
    """

    # 1) verify the given title actually exists in the dataset
    if title not in anime_df['anime_name'].values:
        raise ValueError("'{0}' not found in dataset.".format(title))

    # 2) select only the columns we need and drop any rows where Synopsis is missing
    df_content = anime_df[['anime_name', 'Synopsis']].dropna(subset=['Synopsis'])
    names = pd.Index(df_content['anime_name'].values, name='anime_name')

    # 3) locate the query row by position so duplicate titles cannot turn the
    #    lookup into a multi-row frame
    matches = np.flatnonzero(np.asarray(names == title))
    if matches.size == 0:
        raise ValueError("'{0}' has no Synopsis to compare.".format(title))
    target_pos = int(matches[0])
    is_other_title = np.asarray(names != title)

    if precomputed is not None:
        # 4a) reuse the supplied similarity matrix; nothing needs to be fitted
        sim_df = pd.DataFrame(precomputed, index=names, columns=names)
        sims = sim_df.iloc[target_pos]
    else:
        # 4b) build the TF-IDF matrix over all synopses and compare the target
        #     row against every row. The matrix stays sparse: densifying it
        #     would need (n_anime x vocabulary) floats.
        vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english')
        tfidf_mat = vectorizer.fit_transform(df_content['Synopsis'])
        scores = cosine_similarity(tfidf_mat[target_pos], tfidf_mat)[0]
        sims = pd.Series(scores, index=names)

    # 5) drop the query itself, then pick the top_n highest-scoring titles
    top_similar = sims[is_other_title].sort_values(ascending=False).iloc[:top_n]

    return {"top": top_similar}


In [None]:
# Synopsis-based recommendations: ten nearest titles, computed from scratch (no precomputed matrix).
tf_id_rec("One Punch Man", anime_df, 10, precomputed=None)

{'top': anime_name
 One Punch Man: Road to Hero                                                                   0.232880
 Urawa no Usagi-chan                                                                           0.209826
 One Punch Man 2nd Season                                                                      0.181862
 The Four Seasons                                                                              0.170357
 Kankou Taisen Saitama: Sakuya no Tatakai                                                      0.154468
 Love Live! School Idol Project: μ's →NEXT LoveLive! 2014 - Endless Parade Makuai Drama        0.138506
 Love Live! School Idol Project: μ's →NEXT LoveLive! 2014 - Endless Parade Encore Animation    0.137069
 One Punch Man Specials                                                                        0.134100
 One Punch Man 2nd Season Specials                                                             0.127306
 Kumo no Gakkou                              

<h1>USER BASED CF</h1>

In [None]:
# Per-user and per-anime rating lists, keyed by id.
user2movie = rating_df.groupby('user_id')['anime_id'].apply(list).to_dict()
movie2user = rating_df.groupby('anime_id')['user_id'].apply(list).to_dict()

# Flat lookup from a (user_id, anime_id) pair to the rating that user gave.
user_movie2rating = {
    (user_id, anime_id): rating
    for user_id, anime_id, rating in zip(
        rating_df['user_id'], rating_df['anime_id'], rating_df['rating']
    )
}

In [39]:


def uCF(target_user, user_movie2rating, movie2user, user2movie, top_n=10, k=25, min_common=5):
    """
    uCF (User-based Collaborative Filtering) generates top-N movie recommendations for a given target user
    using the Pearson correlation similarity between users.

    Similarities are centred on each user's overall mean rating (rounded to two
    decimals) and summed over the movies both users rated. A candidate's score is
    the target user's mean plus the similarity-weighted mean deviation of up to
    k neighbours who rated it.

    :param target_user: The ID of the user for whom recommendations are to be generated.
    :param user_movie2rating: Dictionary mapping (user, movie) tuples to their corresponding rating values.
    :param movie2user: Dictionary mapping each movie to the list of users who rated it.
    :param user2movie: Dictionary mapping each user to the list of movies they have rated.
    :param top_n: Number of recommendations to return (default is 10).
    :param k: Number of most similar users (neighbors) to consider for prediction (default is 25).
    :param min_common: Minimum number of common movies required to compute similarity (default is 5).

    :return: A list of (movie_id, predicted_rating) tuples representing the top-N recommended movies.
        Movies with no usable neighbour get a predicted rating of 0.
    :raises KeyError: if target_user has no entry in user2movie.

    NOTE(review): predictions are not clipped to the rating scale, so values
    above the maximum rating can be returned - confirm whether that is intended.
    """
    if target_user not in user2movie:
        raise KeyError("target_user {!r} has no ratings in user2movie".format(target_user))

    # Mean rating of every user (0.0 for users without any known rating)
    global_means = {}
    for user, movies in user2movie.items():
        ratings = [user_movie2rating[(user, movie)] for movie in movies if (user, movie) in user_movie2rating]
        global_means[user] = np.round(np.mean(ratings), 2) if ratings else 0.0

    movies_target = set(user2movie[target_user])
    mu_target = global_means[target_user]

    # Pearson similarity between the target and every other user
    similarity_scores = {}
    for other_user, movies_other in user2movie.items():
        if other_user == target_user:
            continue
        common_movies = movies_target.intersection(movies_other)
        similarity = 0.0
        if len(common_movies) >= min_common:
            mu_other = global_means[other_user]
            numerator = 0.0
            denom_target = 0.0
            denom_other = 0.0
            for movie in common_movies:
                diff_t = user_movie2rating[(target_user, movie)] - mu_target
                diff_o = user_movie2rating[(other_user, movie)] - mu_other
                numerator += diff_t * diff_o
                denom_target += diff_t ** 2
                denom_other += diff_o ** 2
            # A user with constant ratings on the shared movies has zero variance:
            # the correlation is undefined, so treat it as "no similarity".
            if denom_target != 0 and denom_other != 0:
                similarity = numerator / (math.sqrt(denom_target) * math.sqrt(denom_other))
        similarity_scores[other_user] = similarity

    predicted_ratings = {}
    for movie, raters in movie2user.items():
        # Only movies the target user has not rated yet are candidates
        if movie in movies_target:
            continue

        # (similarity, rating, user) for every neighbour with a non-zero similarity
        sim_rating_pairs = []
        for user in raters:
            if user == target_user:
                continue
            # A rater missing from user2movie has no similarity score; skip it
            # instead of failing with a KeyError.
            sim = similarity_scores.get(user, 0.0)
            if sim != 0:
                sim_rating_pairs.append((sim, user_movie2rating[(user, movie)], user))

        # Keep the k neighbours with the highest similarity
        sim_rating_pairs.sort(key=lambda x: x[0], reverse=True)
        top_pairs = sim_rating_pairs[:k]

        # Weighted deviation from each neighbour's own mean
        numerator = 0.0
        denominator = 0.0
        for sim, rating, user in top_pairs:
            numerator += sim * (rating - global_means[user])
            denominator += abs(sim)

        # If the denominator is nonzero, compute the predicted rating; otherwise, set it to 0
        if denominator != 0:
            predicted_ratings[movie] = np.round(mu_target + (numerator / denominator), 1)
        else:
            predicted_ratings[movie] = 0

    recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return recommendations


In [None]:
# Recommendations for user 0 with uCF's default top_n, k and min_common.
rec = uCF(0, user_movie2rating, movie2user, user2movie)


In [44]:
# Translate recommended anime ids into titles and print them with their predicted scores.
# NOTE(review): `dff` is not defined in any visible cell of this notebook, so this cell
# fails after Restart & Run All. An earlier variant of this cell built the same mapping
# from `anime_df`; confirm which frame is intended and define it explicitly.

dff['anime_id'] = dff['anime_id'].astype(int)
id2name = dict(zip(dff['anime_id'], dff['anime_name']))

# Ids without a known title are printed with a placeholder instead of raising
for anime_id, score in rec:
    name = id2name.get(anime_id, f"[Missing name for ID {anime_id}]")
    print(f"{name}: {score}")


Oz no Mahoutsukai no Koutsuu Anzen no Tabi: 10.2
Fullmetal Alchemist: Brotherhood: 9.9
Niji no Kakehashi: 9.8
Neko wa Ikite Iru: 9.7
Nana Moon: 9.7
Nyanpara no Nakama-tachi: 9.7
Nijiiro no Fushigina Ishi: 9.7
Ginga Eiyuu Densetsu: 9.6
Gintama: 9.6
Uchuu Kyoudai: 9.6


In [45]:
# Raw (anime_id, predicted_rating) pairs returned by uCF.
rec



[(30932, 10.2),
 (5114, 9.9),
 (31004, 9.8),
 (31017, 9.7),
 (33583, 9.7),
 (36312, 9.7),
 (36324, 9.7),
 (820, 9.6),
 (918, 9.6),
 (12431, 9.6)]

<h1>Evaluation</h1>

In [49]:
# Positional hold-out split: the first 80% of the rows train the model, the rest test it.
# NOTE(review): `dff` is not defined in any visible cell, and the rows are not shuffled -
# if they are ordered by user, test users may be missing from the training part. Confirm.
training_percentage = 0.8
training_num_datapoints = int(dff.shape[0] * training_percentage)

rating_training_df, rating_test_df = (
    dff.iloc[:training_num_datapoints],
    dff.iloc[training_num_datapoints:],
)



In [51]:
def create_data_dictionaries(ratings_data_df, dataset='training'):
    """
    create_data_dictionaries is a function that builds the lookup dictionaries used by the
    collaborative-filtering code from a ratings table.

    :param ratings_data_df: a DataFrame containing 'user_id', 'anime_id' and 'rating' columns.
    :param dataset: the kind of dataset. For 'training' (case-insensitive) the user/anime
    mappings are built; for any other value they are returned as empty dictionaries.
    :return: a tuple (user2movie, movie2user, user_movie2rating) where user2movie maps each
    user_id to the list of anime_ids they rated, movie2user maps each anime_id to the list of
    user_ids who rated it, and user_movie2rating maps (user_id, anime_id) pairs to the rating.
    """
    build_mappings = dataset.lower() == 'training'

    user2movie, movie2user = {}, {}
    if build_mappings:
        user2movie = ratings_data_df.groupby('user_id')['anime_id'].apply(list).to_dict()
        movie2user = ratings_data_df.groupby('anime_id')['user_id'].apply(list).to_dict()

    # The pair -> rating lookup is needed for every dataset kind
    rating_lookup = dict(
        zip(
            zip(ratings_data_df['user_id'], ratings_data_df['anime_id']),
            ratings_data_df['rating'],
        )
    )

    return user2movie, movie2user, rating_lookup

def compute_user_average(user2movie, user_movie2rating):
    """
    compute_user_average is a function that returns every user's mean rating.
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :return: a dictionary keyed by user whose value is the mean of that user's ratings
    over the movies listed for them in user2movie.
    """
    return {
        user: np.mean([user_movie2rating[(user, movie)] for movie in rated_movies])
        for user, rated_movies in user2movie.items()
    }

def calculate_pearson_similarity(user1, user2, user2movie, user_movie2rating, user_avg, min_common):
    """
    calculate_pearson_similarity is a function that returns the Pearson similarity of two users,
    computed over the movies both of them rated and centred on each user's average rating.
    :param user1: the id of user1
    :param user2: the id of user2
    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param min_common: the required minimum number of common movies between user1 and user2.
    :return: the pearson correlation similarity between user1 and user2; 0 when they share
    fewer than min_common movies or when either user's deviations are all zero.
    """
    shared = set(user2movie[user1]) & set(user2movie[user2])

    # not enough common items
    if len(shared) < min_common:
        return 0

    rating_pairs = [
        (user_movie2rating[(user1, movie)], user_movie2rating[(user2, movie)])
        for movie in shared
    ]
    deviations = [
        (first - user_avg[user1], second - user_avg[user2])
        for first, second in rating_pairs
    ]

    covariance = sum(dev1 * dev2 for dev1, dev2 in deviations)
    spread_user1 = sum(dev1 ** 2 for dev1, _ in deviations)
    spread_user2 = sum(dev2 ** 2 for _, dev2 in deviations)

    # Zero spread means the correlation is undefined
    if spread_user1 == 0 or spread_user2 == 0:
        return 0

    return covariance / (np.sqrt(spread_user1) * np.sqrt(spread_user2))

def compute_similarity_matrix(user2movie, user_movie2rating, user_avg, min_common):
    """
    compute_similarity_matrix is a function that computes the Pearson similarity of every pair of
    users found in user2movie and stores it symmetrically in a nested dictionary.

    The user count is printed first, followed by a progress line after every 100 users.
    Each pair is scored once and written under both users, so the result holds an entry
    for every ordered pair of distinct users.

    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param min_common: the required minimum number of common movies between a pair of users to be
    eligible for similarity calculation.
    :return: a nested dictionary where the key is a user and the value is a dictionary of the
    similarity score between the key user and all the other users.
    """
    users = list(user2movie.keys())
    user_count = len(users)
    print(user_count)

    similarity_matrix = {}
    for position, first_user in enumerate(users, start=1):
        first_row = similarity_matrix.setdefault(first_user, {})

        if position % 100 == 0:
            print('{} users processed'.format(position))

        # Only users after this one still need a score against it
        for later_index in range(position, user_count):
            second_user = users[later_index]
            score = calculate_pearson_similarity(
                first_user, second_user, user2movie, user_movie2rating, user_avg, min_common
            )
            first_row[second_user] = score
            similarity_matrix.setdefault(second_user, {})[first_user] = score

    return similarity_matrix

def predict_rating(user, movie, user2movie, user_movie2rating, user_avg, min_common, k_value,
                   movie2user=None, similarity_matrix=None):
    """
    predict_rating is a function that predicts the rating user "user" would give to movie "movie".

    The prediction is the user's average rating plus the similarity-weighted mean deviation
    (from their own averages) of the k_value neighbours with the largest absolute similarity
    among the users who rated the movie.

    :param user: user_id
    :param movie: movie_id
    :param user2movie: a dictionary that maps each user to a list of movie_ids (not read here).
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param min_common: not read by this function; kept so existing positional calls keep working.
    :param k_value: the number of nearest neighbors to be considered when predicting the user's rating.
    :param movie2user: optional dictionary mapping each movie to the users who rated it. When
    omitted, the module-level variable named movie2user is used.
    :param similarity_matrix: optional nested dictionary of user-user similarities. When omitted,
    the module-level variable named similarity_matrix is used.
    :return: the predicted rating; the user's average rating when the movie is unknown or no
    neighbour has a non-zero similarity.
    :raises KeyError: if the user has no entry in user_avg.
    :raises NameError: if a lookup is neither passed in nor defined at module level.
    """
    # Explicit arguments are preferred; the module-level fallback keeps older call sites working.
    if movie2user is None:
        try:
            movie2user = globals()['movie2user']
        except KeyError:
            raise NameError("name 'movie2user' is not defined") from None
    if similarity_matrix is None:
        try:
            similarity_matrix = globals()['similarity_matrix']
        except KeyError:
            raise NameError("name 'similarity_matrix' is not defined") from None

    ## find candidate neighbors who rated the movie (none if the movie was never seen)
    raters = movie2user.get(movie, ())
    user_similarities = similarity_matrix.get(user, {})

    similarities = []
    for other_user in raters:
        if other_user == user:
            continue
        # A pair missing from the matrix is treated as "no similarity"
        pearson_similarity = user_similarities.get(other_user, 0)
        if pearson_similarity != 0:
            similarities.append((other_user, pearson_similarity))

    if not similarities:
        return user_avg[user]

    ## sort neighbors by the absolute similarity in descending order and select top k neighbors.
    similarities.sort(key=lambda x: abs(x[1]), reverse=True)
    top_neighbors = similarities[:k_value]

    numerator = 0
    denominator = 0
    for neighbor, similarity in top_neighbors:
        rating_neighbor = user_movie2rating[(neighbor, movie)]
        numerator += similarity * (rating_neighbor - user_avg[neighbor])
        denominator += abs(similarity)

    if denominator == 0:
        return user_avg[user]

    predicted_rating = user_avg[user] + numerator / denominator
    return predicted_rating

def evaluate_model_rmse(user2movie, user_movie2rating, user_avg, dataset_rating, similarity_matrix, k_value):
    """
    evaluate_model_rmse is a function that scores the recommender with the root-mean-square error
    between the known ratings in dataset_rating and the ratings predict_rating produces for them.
    A progress line is printed after every 100 ratings.

    :param user2movie: a dictionary that maps each user to a list of movie_ids.
    :param user_movie2rating: a dictionary that maps each (user, movie) pair to a rating.
    :param user_avg: a dictionary of the average ratings for each user.
    :param dataset_rating: a dictionary containing a mapping from (user, movie) pair to a rating.
    :param similarity_matrix: forwarded to predict_rating as its sixth positional argument
    (the slot predict_rating's signature names min_common).
    :param k_value: the number of nearest neighbors to be considered when predicting the user's rating.
    :return: the RMSE score.
    """
    squared_errors = []
    for processed, ((user, movie), actual_rating) in enumerate(dataset_rating.items(), start=1):
        estimate = predict_rating(
            user, movie, user2movie, user_movie2rating, user_avg, similarity_matrix, k_value
        )
        squared_errors.append((actual_rating - estimate) ** 2)
        if processed % 100 == 0:
            print('{} ratings processed'.format(processed))

    return np.sqrt(np.mean(squared_errors))

In [53]:
# Lookup structures are built from the training split only; the test split
# contributes just its (user, anime) -> rating pairs.
user2movie, movie2user, user_movie2rating = create_data_dictionaries(
    rating_training_df
)

_, _, user_movie2rating_test = create_data_dictionaries(
    rating_test_df, dataset='Test'
)

user_avg = compute_user_average(user2movie, user_movie2rating)

min_common_value = 5

# Stores one similarity per ordered pair of users, so memory grows with the
# square of the number of training users.
similarity_matrix = compute_similarity_matrix(user2movie, user_movie2rating, user_avg, min_common_value)

# Baseline run with a single k.
# evaluate_model_rmse's fifth parameter is the similarity matrix, so pass it
# (not min_common_value) in that position.
k_value = 10

train_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating, similarity_matrix, k_value)

test_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating_test, similarity_matrix, k_value)

print('k: {}, min_common: {}, Train RMSE: {}, Test RMSE: {}'.format(k_value, min_common_value, np.round(train_rmse, 3), np.round(test_rmse, 3)))

# Sweep over the neighbourhood size k
k_values = [5, 10, 20, 30, 40, 50]

results = {}
for k_value in k_values:
    train_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating, similarity_matrix, k_value)
    test_rmse = evaluate_model_rmse(user2movie, user_movie2rating, user_avg, user_movie2rating_test, similarity_matrix, k_value)
    results[(k_value, min_common_value)] = [train_rmse, test_rmse]

    # Report every k, not only the last one
    print(
        'k: {}, min_common: {}, Train RMSE: {}, Test RMSE: {}'.format(
            k_value, min_common_value, np.round(train_rmse, 3), np.round(test_rmse, 3)
        )
    )

# The best setting is the one with the lowest test RMSE
best_params = min(results.items(), key=lambda x: x[1][1])
best_k, best_min_common = best_params[0]
best_train_rmse, best_test_rmse = best_params[1]

print("Best hyperparameters:")
print("k: {}, min_common: {} with Train RMSE: {}, Test RMSE: {}".format(best_k, best_min_common, np.round(best_train_rmse, 3), np.round(best_test_rmse, 3)))


MemoryError: 