# Training Collaborative Filtering Model

* Importing the necessary libraries:

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

* Importing the dataset:

In [2]:
anime = pd.read_csv('data/cleaned/anime.csv')
rating = pd.read_csv('data/raw/rating.csv')

* Data Preprocessing:

In [3]:
# Join anime and rating data by anime_id
anime_fulldata=pd.merge(anime,rating,on='anime_id',suffixes= ['', '_user'])
anime_fulldata = anime_fulldata.rename(columns={'name': 'anime_title', 'rating_user': 'user_rating'})
anime_fulldata.head()

Unnamed: 0,anime_id,anime_title,genre,type,episodes,rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,244,10
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,271,10
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,278,-1


In [4]:
# Replace -1 user_rating with NaN
anime_feature = anime_fulldata.copy()
anime_feature['user_rating'].replace(-1, np.nan, inplace=True)
anime_feature.head()

Unnamed: 0,anime_id,anime_title,genre,type,episodes,rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,99,5.0
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,152,10.0
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,244,10.0
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,271,10.0
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,278,


There are users with small number of ratings, so we will remove them from the dataset in order to make the model more accurate because users with a few ratings will be biased. We will take 100 ratings as the threshold.

In [5]:
counts = anime_feature['user_id'].value_counts()
anime_feature = anime_feature[anime_feature['user_id'].isin(counts[counts >= 100].index)]

* Pivot Table: We will create a pivot table with the users as the columns and the animes as the rows. The values will be the ratings. This table will help us calculate the similarity between the animes.

In [6]:
anime_pivot=anime_feature.pivot_table(index='anime_title',columns='user_id',values='user_rating').fillna(0)
anime_pivot.head()

user_id,1,5,7,11,14,17,21,29,38,39,...,73491,73494,73495,73499,73500,73502,73503,73507,73510,73515
anime_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
009 Re:Cyborg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00:08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
1+2=Paradise,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* Save the pivot table to a csv file for later use:

In [7]:
# Save anime_pivot table to a CSV file
anime_pivot.to_csv('models/anime_pivot.csv', index=True)

* Using cosine similarity and NearestNeighbors to find the similar animes:

In [9]:
anime_matrix = csr_matrix(anime_pivot.values)

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(anime_matrix)

In [31]:
def get_recommendations(anime_title, n_recommendations=10):
    anime_list = anime_pivot.index.tolist()
    anime_id = anime_list.index(anime_title)

    distances, indices = model_knn.kneighbors(anime_pivot.iloc[anime_id, :].values.reshape(1, -1), n_neighbors=n_recommendations + 1)

    recommendations = []
    for i in range(0, len(distances.flatten())):
        if i != 0:
            similarity = 1 - distances.flatten()[i]  # Convert distance to similarity
            anime_title = anime_pivot.index[indices.flatten()[i]]
            recommendations.append({
                'Anime': anime_title,
                'Similarity': similarity,
                'Rating': anime.loc[anime["name"] == anime_title, "rating"].values[0],
                'Type': anime.loc[anime["name"] == anime_title, "type"].values[0]
            })

    recommendations_df = pd.DataFrame(recommendations).sort_values('Similarity', ascending=False)

    return recommendations_df

* Examples of Recommendations:

In [32]:
get_recommendations('Ao Haru Ride',10)

Unnamed: 0,Anime,Similarity,Rating,Type
0,Ookami Shoujo to Kuro Ouji,0.584047,7.47,TV
1,Tonari no Kaibutsu-kun,0.533387,7.77,TV
2,Gekkan Shoujo Nozaki-kun,0.527068,8.24,TV
3,Sukitte Ii na yo.,0.52446,7.71,TV
4,Shigatsu wa Kimi no Uso,0.508573,8.92,TV
5,Noragami,0.503133,8.17,TV
6,Ao Haru Ride OVA,0.501029,7.76,OVA
7,Tokyo Ghoul,0.488985,8.07,TV
8,Golden Time,0.488717,7.92,TV
9,Nisekoi,0.477469,7.91,TV


In [33]:
get_recommendations('Guilty Crown',10)

Unnamed: 0,Anime,Similarity,Rating,Type
0,Sword Art Online,0.688495,7.83,TV
1,Mirai Nikki (TV),0.651136,8.07,TV
2,Angel Beats!,0.635036,8.39,TV
3,Ao no Exorcist,0.622729,7.92,TV
4,Shingeki no Kyojin,0.619213,8.54,TV
5,No Game No Life,0.610391,8.47,TV
6,Highschool of the Dead,0.608418,7.46,TV
7,High School DxD,0.602075,7.7,TV
8,Btooom!,0.599621,7.68,TV
9,Another,0.591445,7.88,TV


Based on our experience with animes on the examples above, the recommendations are mostly accurate. The model recommends animes that has similar themes to the ones we like. For example, if we like Ao Haru Ride, the model recommends other animes that are similar to Ao Haru Ride like Ookami Shoujo to Kuro Ouji, Tonari no Kaibutsu-kun, Gekkan Shoujo Nozaki-kun etc. If we like Guilty Crown, the model recommends other animes that are similar to Guilty Crown such as Sword Art Online, Mirai Nikki, Shingeki no Kyojin etc.

* Implementing similarity_measure and diversity_measure functions:

In [39]:
def similarity_measure(anime_list,seed=0):
    np.random.seed(seed)
    similarity_list = []
    anime_samples = np.random.choice(anime_list, 100)
    for sample in anime_samples:
        recommended_animes = get_recommendations(sample)
        similarity_list.extend(recommended_animes['Similarity'].values)
    return np.mean(similarity_list)

In [40]:
#  Calculating the diversity of the recommendations as proportion of genres in the recommendations that are not
#  in the input anime to the total number of genres in the recommendations
def calculate_diversity(input_anime, recommended_animes):
    input_genres = anime[anime['name'] == input_anime]['genre'].iloc[0].split(',')
    recommended_genres = []
    # get all the genres of the recommended animes dataframe by splitting the string of genres
    for i in range(len(recommended_animes)):
        recommended_genres.extend(recommended_animes['genre'].iloc[i].split(','))
    recommended_genres = list(set(recommended_genres))
    # get the genres that are not in the input anime
    genres_not_in_input = [genre for genre in recommended_genres if genre not in input_genres]
    # calculate the diversity
    diversity = len(genres_not_in_input) / len(recommended_genres)
    return diversity

In [41]:
# Get recommendation for random 100 anime from anime_list and calculate mean diversity for them
def diversity_measure(anime_list):
    diversity_list = []
    anime_samples = np.random.choice(anime_list, 100)
    for sample in anime_samples:
        recommended_animes = get_recommendations(sample)
        recommended_animes = anime[anime["name"].isin(recommended_animes["Anime"])]
        diversity = calculate_diversity(sample, recommended_animes)
        diversity_list.append(diversity)
    return np.mean(diversity_list)

In [43]:
# get anime list from anime_pivot table
anime_list = anime_pivot.index.tolist()
diversity_measure(anime_list)

0.732149664896283

In [44]:
similarity_measure(anime_list)

0.3920636960772098