# Training Collaborative Filtering Model

* Importing the necessary libraries:

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

* Importing the dataset:

In [2]:
# Input files: the cleaned anime catalogue and the raw user ratings.
ANIME_CSV = 'data/cleaned/anime.csv'
RATING_CSV = 'data/raw/rating.csv'

anime = pd.read_csv(ANIME_CSV)
rating = pd.read_csv(RATING_CSV)

* Data Preprocessing:

In [3]:
# Inner-join the catalogue with the ratings on anime_id. Both frames carry a
# `rating` column, so the one coming from the ratings file gets the `_user`
# suffix and is then renamed to `user_rating`; `name` becomes `anime_title`.
column_names = {'name': 'anime_title', 'rating_user': 'user_rating'}
anime_fulldata = (
    anime
    .merge(rating, on='anime_id', suffixes=('', '_user'))
    .rename(columns=column_names)
)
anime_fulldata.head()

Unnamed: 0,anime_id,anime_title,genre,type,episodes,rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,244,10
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,271,10
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,278,-1


In [4]:
# Replace -1 user_rating with NaN.
# The replacement is assigned back explicitly: calling
# `frame[col].replace(..., inplace=True)` operates on a column selection and,
# with pandas Copy-on-Write, leaves the frame itself unchanged.
# `assign` returns a new frame, so `anime_fulldata` is not modified.
anime_feature = anime_fulldata.assign(
    user_rating=anime_fulldata['user_rating'].replace(-1, np.nan)
)
anime_feature.head()

Unnamed: 0,anime_id,anime_title,genre,type,episodes,rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,99,5.0
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,152,10.0
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,244,10.0
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,271,10.0
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,278,


Some users have rated only a small number of anime. With so few ratings, their preferences give a biased picture and add noise to the similarity computation, so we remove them from the dataset to make the model more accurate. We use 100 entries per user as the threshold.

In [5]:
# Minimum number of rows a user must have in the dataset to be kept.
MIN_RATINGS_PER_USER = 100

# Rows per user. This counts every row for a user, including rows whose
# user_rating is NaN.
counts = anime_feature['user_id'].value_counts()
active_users = counts.loc[counts.ge(MIN_RATINGS_PER_USER)].index

is_active = anime_feature['user_id'].isin(active_users)
anime_feature = anime_feature[is_active]

* Pivot Table: We will create a pivot table with the anime titles as the rows and the users as the columns. The values will be the user ratings, with missing ratings filled with 0. This table will help us calculate the similarity between anime.

In [6]:
# Item-by-user rating table: rows are anime titles, columns are user ids.
# Cells with no rating come out of the pivot as NaN and are then set to 0.
rating_table = pd.pivot_table(
    anime_feature,
    values='user_rating',
    index='anime_title',
    columns='user_id',
)
anime_pivot = rating_table.fillna(0)
anime_pivot.head()

user_id,1,5,7,11,14,17,21,29,38,39,...,73491,73494,73495,73499,73500,73502,73503,73507,73510,73515
anime_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
009 Re:Cyborg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00:08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
1+2=Paradise,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* Using cosine similarity and NearestNeighbors to find the similar animes:

In [None]:
# `csr_matrix` and `NearestNeighbors` are already imported in the first cell,
# so they are not re-imported here.

# Sparse copy of the pivot table (rows = anime titles, columns = users).
anime_matrix = csr_matrix(anime_pivot.values)

# Nearest-neighbour index over the anime rows, compared by cosine distance
# with an exhaustive (brute-force) search.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(anime_matrix)

In [None]:
# get the top N nearest neighbors for the anime, sorted by distance
def get_recommendations(anime_title , n_recommendations):
    """Print the anime closest to ``anime_title`` according to ``model_knn``.

    Looks up the row of ``anime_title`` in ``anime_pivot``, queries the fitted
    ``model_knn`` for its nearest neighbours and prints them in the order the
    model returns them (closest first), one per line with their distance.

    Parameters
    ----------
    anime_title : str
        Title to look up; must be a label of ``anime_pivot.index``.
    n_recommendations : int
        Maximum number of recommendations to print (0 prints only the header).

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    ValueError
        If ``anime_title`` is not in ``anime_pivot.index`` or
        ``n_recommendations`` is negative.
    """
    if anime_title not in anime_pivot.index:
        raise ValueError('{0!r} is not in the anime pivot table'.format(anime_title))
    if n_recommendations < 0:
        raise ValueError('n_recommendations must be non-negative')

    anime_pos = anime_pivot.index.get_loc(anime_title)
    query = anime_pivot.iloc[[anime_pos]].values  # shape (1, number of users)

    # Ask for one extra neighbour because the query row is part of the fitted
    # data, but never for more neighbours than there are rows to choose from.
    n_neighbors = min(n_recommendations + 1, len(anime_pivot.index))
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)

    # Drop the query by its position rather than assuming it is the first
    # hit: when other rows tie with it on distance, it may not come first.
    neighbours = [
        (idx, dist)
        for idx, dist in zip(indices.flatten(), distances.flatten())
        if idx != anime_pos
    ][:n_recommendations]

    print('Recommendations for {0}:\n'.format(anime_pivot.index[anime_pos]))
    for rank, (idx, dist) in enumerate(neighbours, start=1):
        print('{0}: {1} with distance: {2}'.format(rank, anime_pivot.index[idx], dist))


* Examples of Recommendations:

In [None]:
# Ten nearest neighbours of a romance title.
get_recommendations('Ao Haru Ride', n_recommendations=10)

Recommendations for Ao Haru Ride:

1: Ookami Shoujo to Kuro Ouji with distance: 0.4159529572268087
2: Tonari no Kaibutsu-kun with distance: 0.46661280550561446
3: Gekkan Shoujo Nozaki-kun with distance: 0.47293159026954346
4: Sukitte Ii na yo. with distance: 0.47554033076461366
5: Shigatsu wa Kimi no Uso with distance: 0.4914274503381648
6: Noragami with distance: 0.49686670764179086
7: Ao Haru Ride OVA with distance: 0.49897095875647823
8: Tokyo Ghoul with distance: 0.5110150240341418
9: Golden Time with distance: 0.5112831280478941
10: Nisekoi with distance: 0.5225311120570841


In [None]:
# Ten nearest neighbours of an action title.
get_recommendations('Guilty Crown', n_recommendations=10)

Recommendations for Guilty Crown:

1: Sword Art Online with distance: 0.31150499888273586
2: Mirai Nikki (TV) with distance: 0.34886389725880684
3: Angel Beats! with distance: 0.3649639468872513
4: Ao no Exorcist with distance: 0.37727072676869766
5: Shingeki no Kyojin with distance: 0.3807865967575217
6: No Game No Life with distance: 0.3896094511977438
7: Highschool of the Dead with distance: 0.3915822838389993
8: High School DxD with distance: 0.39792463160778524
9: Btooom! with distance: 0.400379038634754
10: Another with distance: 0.40855517657289386


Based on our own familiarity with the anime in the examples above, the recommendations are mostly accurate: the model recommends anime that have themes similar to the ones we like. For example, if we like Ao Haru Ride, the model recommends similar anime such as Ookami Shoujo to Kuro Ouji, Tonari no Kaibutsu-kun and Gekkan Shoujo Nozaki-kun. If we like Guilty Crown, the model recommends similar anime such as Sword Art Online, Mirai Nikki and Shingeki no Kyojin.