# Training Content Based Filtering Model

* Importing the necessary libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

* Importing the dataset:

In [2]:
anime = pd.read_csv('data/cleaned/anime.csv')
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64.0,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572
4,9969,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.16,151266


* TF-IDF Vectorizer: TF-IDF stands for Term Frequency-Inverse Document Frequency. It is a technique to quantify a word in documents, we generally compute a weight to each word which signifies the importance of the word in the document and corpus. We will use this method to get numeric vectors for each anime, which we can use to find similar contents.

In [3]:
genres_str = anime['genre'].str.split(',').astype(str)

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), min_df=0)
tfidf_matrix = tfidf.fit_transform(genres_str)

tfidf_matrix.shape

(9211, 5155)

We are using the combinations of genre up to 4 elements to get the similarity between the contents. Here are the examples of the combinations:

In [4]:
tfidf.get_feature_names_out()[:10]

array(['action', 'action adventure', 'action adventure cars',
       'action adventure cars comedy', 'action adventure cars mecha',
       'action adventure cars sci', 'action adventure comedy',
       'action adventure comedy demons', 'action adventure comedy drama',
       'action adventure comedy ecchi'], dtype=object)

* Calculating the cosine similarity between each anime pair:

In [5]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [6]:
# Save cosine similarity matrix for later use
np.save('models/cosine_sim.npy', cosine_sim)

* Creating a function to get recommendations: Following function will take the name of the anime and will return the top 10 similar anime recommendations. If highest rating flag is set to True, it will return the top 10 highest rated anime from the similar anime list. If similarity flag is set to True, it will show the similarity score of each anime with the given anime.

In [7]:
indices = pd.Series(anime.index, index=anime['name'])

def get_recommendations(title, n_recommendations=10, highest_rating=False, similarity=False):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n_recommendations+1]

    anime_indices = [i[0] for i in sim_scores]

    if similarity == False:
        result_df = pd.DataFrame({'Anime': anime['name'].iloc[anime_indices].values,
                                  'Type': anime['type'].iloc[anime_indices].values,
                                  'Rating': anime['rating'].iloc[anime_indices].values})
    elif similarity == True:
        similarity_ = [(i[1] + 1) / 2 for i in sim_scores]
        result_df = pd.DataFrame({'Anime': anime['name'].iloc[anime_indices].values,
                                  'Similarity': similarity_,
                                  'Type': anime['type'].iloc[anime_indices].values,
                                  'Rating': anime['rating'].iloc[anime_indices].values})
    if highest_rating == True:
        return result_df.sort_values('Rating', ascending=False)
    else:
        return result_df

* Here are some examples of the recommendations:

In [24]:
get_recommendations('Death Note', highest_rating=False, similarity=True)

Unnamed: 0,Anime,Similarity,Type,Rating
0,Mousou Dairinin,0.951487,TV,7.74
1,Higurashi no Naku Koro ni Kai,0.798029,TV,8.41
2,Higurashi no Naku Koro ni,0.759862,TV,8.17
3,Higurashi no Naku Koro ni Rei,0.754682,OVA,7.56
4,Shigofumi,0.734848,TV,7.62
5,Himitsu: The Revelation,0.725537,TV,7.42
6,Hikari to Mizu no Daphne,0.706674,TV,6.87
7,Monster,0.702523,TV,8.72
8,AD Police,0.675006,OVA,6.47
9,Jigoku Shoujo Mitsuganae,0.65981,TV,7.81


In [9]:
get_recommendations('Ao Haru Ride', highest_rating=True, similarity=True)

Unnamed: 0,Anime,Similarity,Type,Rating
1,Kimi ni Todoke,0.90635,TV,8.19
2,Kimi ni Todoke 2nd Season,0.90635,TV,8.17
9,Hana yori Dango,0.804333,TV,7.9
4,Tonari no Kaibutsu-kun,0.863007,TV,7.77
3,Ao Haru Ride OVA,0.90635,OVA,7.76
0,Kareshi Kanojo no Jijou,1.0,TV,7.66
5,Nijiiro Days,0.863007,TV,7.52
6,Nijiiro Days OVA,0.863007,OVA,6.73
7,Chou Kuse ni Narisou,0.863007,TV,6.59
8,Good Morning Call,0.860583,OVA,6.26


In [10]:
#  Calculating the diversity of the recommendations as proportion of genres in the recommendations that are not
#  in the input anime to the total number of genres in the recommendations
def calculate_diversity(input_anime, recommended_animes):
    input_genres = anime[anime['name'] == input_anime]['genre'].iloc[0].split(',')
    recommended_genres = []
    # get all the genres of the recommended animes dataframe by splitting the string of genres
    for i in range(len(recommended_animes)):
        recommended_genres.extend(recommended_animes['genre'].iloc[i].split(','))
    recommended_genres = list(set(recommended_genres))
    # get the genres that are not in the input anime
    genres_not_in_input = [genre for genre in recommended_genres if genre not in input_genres]
    # calculate the diversity
    diversity = len(genres_not_in_input) / len(recommended_genres)
    return diversity

In [11]:
# Get recommendation for random 100 anime from anime_list and calculate mean diversity for them
def diversity_measure(anime_list):
    diversity_list = []
    anime_samples = np.random.choice(anime_list, 100)
    for sample in anime_samples:
        recommended_animes = get_recommendations(sample, highest_rating=True)
        recommended_animes = anime[anime["name"].isin(recommended_animes["Anime"])]
        diversity = calculate_diversity(sample, recommended_animes)
        diversity_list.append(diversity)
    return np.mean(diversity_list)

In [12]:
anime_list = anime['name'].unique()
diversity_measure(anime_list)

0.26190617715617714

* According to our random selected anime recommendations on average %26 of the genres in recommendations are different than the given anime's genres.

In [13]:
def similarity_measure(anime_list,seed=0):
    np.random.seed(seed)
    similarity_list = []
    anime_samples = np.random.choice(anime_list, 100)
    for sample in anime_samples:
        recommended_animes = get_recommendations(sample, similarity=True)
        similarity_list.extend(recommended_animes['Similarity'].values)
    return np.mean(similarity_list)

In [14]:
similarity_measure(anime_list)

0.9103313818076428

Our model is able to recommend similar animes on average with 0.92 similarity score.

* Experimenting with different number of  genres in combinations: We will try to find the best number of genres to use in combinations to get the best recommendations. We will use the same function to get recommendations with different number of genres in combinations.

* Using 5 genres in combinations:

In [15]:
genres_str = anime['genre'].str.split(',').astype(str)

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0)
tfidf_matrix = tfidf.fit_transform(genres_str)

tfidf_matrix.shape

(9211, 7641)

In [16]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

* As we have manipulated the similarity matrix recommendations will be different than the previous ones. Here are some examples of the recommendations:

In [25]:
get_recommendations('Death Note', highest_rating=False, similarity=True)

Unnamed: 0,Anime,Similarity,Type,Rating
0,Mousou Dairinin,0.951487,TV,7.74
1,Higurashi no Naku Koro ni Kai,0.798029,TV,8.41
2,Higurashi no Naku Koro ni,0.759862,TV,8.17
3,Higurashi no Naku Koro ni Rei,0.754682,OVA,7.56
4,Shigofumi,0.734848,TV,7.62
5,Himitsu: The Revelation,0.725537,TV,7.42
6,Hikari to Mizu no Daphne,0.706674,TV,6.87
7,Monster,0.702523,TV,8.72
8,AD Police,0.675006,OVA,6.47
9,Jigoku Shoujo Mitsuganae,0.65981,TV,7.81


We should notice that similarity values dropped because of the increase in the number of genres in combinations.

In [18]:
diversity_measure(anime_list)

0.21508044733044734

In [19]:
similarity_measure(anime_list)

0.9035552025380116

* Using 3 genres in combinations:

In [20]:
genres_str = anime['genre'].str.split(',').astype(str)

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0)
tfidf_matrix = tfidf.fit_transform(genres_str)

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

tfidf_matrix.shape

(9211, 2506)

In [26]:
get_recommendations('Death Note', highest_rating=False, similarity=True)

Unnamed: 0,Anime,Similarity,Type,Rating
0,Mousou Dairinin,0.951487,TV,7.74
1,Higurashi no Naku Koro ni Kai,0.798029,TV,8.41
2,Higurashi no Naku Koro ni,0.759862,TV,8.17
3,Higurashi no Naku Koro ni Rei,0.754682,OVA,7.56
4,Shigofumi,0.734848,TV,7.62
5,Himitsu: The Revelation,0.725537,TV,7.42
6,Hikari to Mizu no Daphne,0.706674,TV,6.87
7,Monster,0.702523,TV,8.72
8,AD Police,0.675006,OVA,6.47
9,Jigoku Shoujo Mitsuganae,0.65981,TV,7.81


In [22]:
similarity_measure(anime_list)

0.9227381733148357

In [23]:
diversity_measure(anime_list)

0.23369766344766343

* Similarity score and diversity of the recommendations did not change much, therefore we can continue to use 4 genres for ngrams.