# Hybrid Model - Content Based + Collaborative Filtering

* Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

* Importing Datasets

In [2]:
anime = pd.read_csv('data/cleaned/anime.csv')
rating = pd.read_csv('data/raw/rating.csv')

## Content Based Filtering Recommendation System

* Importing the Preprocessed Model Data for Content Based Filtering

In [3]:
# load the cosine similarity matrix
cosine_sim = np.load('models/cosine_sim.npy')

In [4]:
indices = pd.Series(anime.index, index=anime['name'])

def get_recommendations_content(title, n_recommendations=10, highest_rating=False, similarity=True):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n_recommendations+1]

    anime_indices = [i[0] for i in sim_scores]

    if similarity == False:
        result_df = pd.DataFrame({'Anime': anime['name'].iloc[anime_indices].values,
                                  'Type': anime['type'].iloc[anime_indices].values,
                                  'Rating': anime['rating'].iloc[anime_indices].values})
    elif similarity == True:
        similarity_ = [(i[1] + 1) / 2 for i in sim_scores]
        result_df = pd.DataFrame({'Anime': anime['name'].iloc[anime_indices].values,
                                  'Similarity': similarity_,
                                  'Type': anime['type'].iloc[anime_indices].values,
                                  'Rating': anime['rating'].iloc[anime_indices].values})
    if highest_rating == True:
        return result_df.sort_values('Rating', ascending=False)
    else:
        return result_df

* Let's test it

In [5]:
# test the function for a random anime
get_recommendations_content('One Piece', highest_rating=False, similarity=True)

Unnamed: 0,Anime,Similarity,Type,Rating
0,Guilty Crown: Lost Christmas,1.0,OVA,7.1
1,The Animatrix,0.841428,OVA,7.34
2,Shangri-La,0.841428,TV,7.16
3,Buki yo Saraba,0.841428,Movie,7.15
4,Kite Liberator,0.841428,OVA,6.52
5,Eightman After,0.841428,OVA,6.38
6,Compiler Festa,0.841428,OVA,5.77
7,s.CRY.ed,0.840462,TV,7.45
8,R.O.D the TV,0.810142,TV,7.64
9,Terra e...,0.787547,Movie,6.63


## Collaborative Filtering Recommendation System

* Importing the Preprocessed Model Data for Collaborative Filtering

In [6]:
# Load anime_pivot table from a CSV file
anime_pivot = pd.read_csv('models/anime_pivot.csv', index_col=0)

* Fit the NearestNeighbors Model for Collaborative Filtering

In [7]:
anime_matrix = csr_matrix(anime_pivot.values)

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(anime_matrix)

In [8]:
def get_recommendations_collaborative(anime_title, n_recommendations):
    anime_list = anime_pivot.index.tolist()
    anime_id = anime_list.index(anime_title)

    distances, indices = model_knn.kneighbors(anime_pivot.iloc[anime_id, :].values.reshape(1, -1), n_neighbors=n_recommendations + 1)

    recommendations = []
    for i in range(0, len(distances.flatten())):
        if i != 0:
            similarity = 1 - distances.flatten()[i]  # Convert distance to similarity
            anime_title = anime_pivot.index[indices.flatten()[i]]
            recommendations.append({
                'Anime': anime_title,
                'Similarity': similarity,
                'Rating': anime.loc[anime["name"] == anime_title, "rating"].values[0],
                'Type': anime.loc[anime["name"] == anime_title, "type"].values[0]
            })

    recommendations_df = pd.DataFrame(recommendations).sort_values('Similarity', ascending=False)

    return recommendations_df

* Let's test it

In [9]:
get_recommendations_collaborative('Ao Haru Ride',10)

Unnamed: 0,Anime,Similarity,Rating,Type
0,Ookami Shoujo to Kuro Ouji,0.584047,7.47,TV
1,Tonari no Kaibutsu-kun,0.533387,7.77,TV
2,Gekkan Shoujo Nozaki-kun,0.527068,8.24,TV
3,Sukitte Ii na yo.,0.52446,7.71,TV
4,Shigatsu wa Kimi no Uso,0.508573,8.92,TV
5,Noragami,0.503133,8.17,TV
6,Ao Haru Ride OVA,0.501029,7.76,OVA
7,Tokyo Ghoul,0.488985,8.07,TV
8,Golden Time,0.488717,7.92,TV
9,Nisekoi,0.477469,7.91,TV


## Hybrid Model Recommendation System

In [13]:
from sklearn.preprocessing import MinMaxScaler

def get_recommendations_hybrid(anime_title, n_recommendations=10):
    # Content-based filtering
    content_recommendations = get_recommendations_content(anime_title, n_recommendations).reset_index(drop=True)

    # Collaborative filtering
    collaborative_recommendations = get_recommendations_collaborative(anime_title, n_recommendations).reset_index(drop=True)

    # Scale the similarity values from both approaches to a range of 0 to 1 and assign it
    # to a new column called 'Scaled Similarity'
    scaler = MinMaxScaler()
    scaled_similarity = scaler.fit_transform(collaborative_recommendations[['Similarity']])
    collaborative_recommendations['Scaled Similarity'] = scaled_similarity
    scaled_similarity = scaler.fit_transform(content_recommendations[['Similarity']])
    content_recommendations['Scaled Similarity'] = scaled_similarity
    #add a new column called 'Recommendation Type' to indicate whether the recommendation is from content-based
    #or collaborative filtering
    content_recommendations['Recommendation Type'] = 'Content-based'
    collaborative_recommendations['Recommendation Type'] = 'Collaborative'

    # Create the pool by concatenating the recommendations from both approaches and their combined similarity
    # drop duplicates(with the same anime name) from the pool
    pool = pd.concat([content_recommendations, collaborative_recommendations], ignore_index=True).drop_duplicates("Anime")
    # Sort the pool based on combined similarity
    pool = pool.sort_values('Scaled Similarity', ascending=False)

    # Select the top N recommendations
    top_recommendations = pool.head(n_recommendations)

    # Return the top recommendations
    return top_recommendations[['Anime', 'Similarity', 'Type', 'Rating','Recommendation Type']]

* Let's test it

In [14]:
get_recommendations_hybrid('Death Note', 10)

Unnamed: 0,Anime,Similarity,Type,Rating,Recommendation Type
0,Mousou Dairinin,0.939736,TV,7.74,Content-based
10,Code Geass: Hangyaku no Lelouch,0.752515,TV,8.83,Collaborative
11,Code Geass: Hangyaku no Lelouch R2,0.730489,TV,8.98,Collaborative
12,Elfen Lied,0.704465,TV,7.85,Collaborative
13,Fullmetal Alchemist: Brotherhood,0.698455,TV,9.26,Collaborative
14,Shingeki no Kyojin,0.697792,TV,8.54,Collaborative
1,Higurashi no Naku Koro ni Kai,0.734143,TV,8.41,Content-based
15,Angel Beats!,0.677316,TV,8.39,Collaborative
2,Higurashi no Naku Koro ni,0.697524,TV,8.17,Content-based
16,Sword Art Online,0.672457,TV,7.83,Collaborative


In [15]:
get_recommendations_hybrid('Ao Haru Ride', 10)

Unnamed: 0,Anime,Similarity,Type,Rating,Recommendation Type
0,Kareshi Kanojo no Jijou,1.0,TV,7.66,Content-based
10,Ookami Shoujo to Kuro Ouji,0.584047,TV,7.47,Collaborative
2,Kimi ni Todoke 2nd Season,0.90635,TV,8.17,Content-based
3,Ao Haru Ride OVA,0.90635,OVA,7.76,Content-based
1,Kimi ni Todoke,0.90635,TV,8.19,Content-based
12,Gekkan Shoujo Nozaki-kun,0.527068,TV,8.24,Collaborative
13,Sukitte Ii na yo.,0.52446,TV,7.71,Collaborative
4,Tonari no Kaibutsu-kun,0.863007,TV,7.77,Content-based
5,Nijiiro Days,0.863007,TV,7.52,Content-based
6,Nijiiro Days OVA,0.863007,OVA,6.73,Content-based


In [16]:
get_recommendations_hybrid('Shingeki no Kyojin', 10)

Unnamed: 0,Anime,Similarity,Type,Rating,Recommendation Type
0,Shingeki no Kyojin OVA,1.0,OVA,7.88,Content-based
1,Shingeki no Kyojin Movie 2: Jiyuu no Tsubasa,1.0,Movie,7.75,Content-based
10,Sword Art Online,0.774741,TV,7.83,Collaborative
2,Shingeki no Kyojin Movie 1: Guren no Yumiya,1.0,Movie,7.7,Content-based
11,Tokyo Ghoul,0.71097,TV,8.07,Collaborative
12,No Game No Life,0.710895,TV,8.47,Collaborative
3,One Piece,0.83227,TV,8.58,Content-based
13,Mirai Nikki (TV),0.701564,TV,8.07,Collaborative
14,Death Note,0.697792,TV,8.71,Collaborative
15,Angel Beats!,0.688817,TV,8.39,Collaborative


In [17]:
get_recommendations_hybrid('Sword Art Online', 10)

Unnamed: 0,Anime,Similarity,Type,Rating,Recommendation Type
0,Sword Art Online II,1.0,TV,7.35,Content-based
10,Shingeki no Kyojin,0.774741,TV,8.54,Collaborative
11,No Game No Life,0.74223,TV,8.47,Collaborative
13,Angel Beats!,0.735735,TV,8.39,Collaborative
14,Mirai Nikki (TV),0.72418,TV,8.07,Collaborative
2,Bakugan Battle Brawlers: Mechtanium Surge,0.780398,TV,6.29,Content-based
1,Bakugan Battle Brawlers: Gundalian Invaders,0.780398,TV,6.32,Content-based
15,Highschool of the Dead,0.700171,TV,7.46,Collaborative
16,Ao no Exorcist,0.693399,TV,7.92,Collaborative
17,Guilty Crown,0.688495,TV,7.81,Collaborative


* Implementing similarity_measure and diversity_measure functions:

In [23]:
def similarity_measure(anime_list,seed=0):
    np.random.seed(seed)
    similarity_list = []
    anime_samples = np.random.choice(anime_list, 100)
    for sample in anime_samples:
        recommended_animes = get_recommendations_hybrid(sample)
        similarity_list.extend(recommended_animes['Similarity'].values)
    return np.mean(similarity_list)

In [20]:
#  Calculating the diversity of the recommendations as proportion of genres in the recommendations that are not
#  in the input anime to the total number of genres in the recommendations
def calculate_diversity(input_anime, recommended_animes):
    input_genres = anime[anime['name'] == input_anime]['genre'].iloc[0].split(',')
    recommended_genres = []
    # get all the genres of the recommended animes dataframe by splitting the string of genres
    for i in range(len(recommended_animes)):
        recommended_genres.extend(recommended_animes['genre'].iloc[i].split(','))
    recommended_genres = list(set(recommended_genres))
    # get the genres that are not in the input anime
    genres_not_in_input = [genre for genre in recommended_genres if genre not in input_genres]
    # calculate the diversity
    diversity = len(genres_not_in_input) / len(recommended_genres)
    return diversity

In [21]:
# Get recommendation for random 100 anime from anime_list and calculate mean diversity for them
def diversity_measure(anime_list):
    diversity_list = []
    anime_samples = np.random.choice(anime_list, 100)
    for sample in anime_samples:
        recommended_animes = get_recommendations_hybrid(sample)
        recommended_animes = anime[anime["name"].isin(recommended_animes["Anime"])]
        diversity = calculate_diversity(sample, recommended_animes)
        diversity_list.append(diversity)
    return np.mean(diversity_list)

In [22]:
# get anime list from anime_pivot table
anime_list = anime_pivot.index.tolist()
diversity_measure(anime_list)

0.642040090327629

In [24]:
similarity_measure(anime_list)

0.6161096394638744

* It seems like the hybrid model is able to inherit strong points of both content based and collaborative filtering models. It is able to recommend animes that has similar themes, and it is also able to recommend animes that are popular among other users with similar taste. This helps to recommend a diverse set of animes to the user.