In [30]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
# Load the four raw CSV inputs in one pass; the order of the paths below
# matches the order of the names being assigned.
anime_recommendations, anime_information, anime_list, anime_ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/anime_recommendations.csv',
        'data/anime.csv',
        'data/animelist.csv',
        'data/rating_complete.csv',
    )
)

In [4]:
# Rename the 'Anime' key column to 'anime_id', the column name the
# rating and watch-list tables use for the same key.
anime_recommendations = anime_recommendations.rename({'Anime': 'anime_id'}, axis='columns')
anime_recommendations.head(5)

Unnamed: 0,anime_id,Recommendation,Agree Votes
0,10,172,9
1,10,4576,7
2,10,261,6
3,10,966,6
4,10,249,5


In [5]:
# Keep only the metadata columns used later and expose the key as 'anime_id'.
# The rename runs *before* the column selection on purpose: `rename` silently
# ignores a missing source column, so re-running this cell (when the column has
# already been renamed) is a no-op instead of raising KeyError on
# 'Anime-PlanetID'.
anime_information = (
    anime_information
    .rename(columns={'Anime-PlanetID': 'anime_id'})
    .loc[:, ['anime_id', 'Name', 'Rating Score', 'Number Votes', 'Tags', 'Episodes', 'Synopsis']]
)
anime_information.head(5)

Unnamed: 0,anime_id,Name,Rating Score,Number Votes,Tags,Episodes,Synopsis
0,10,The Prince of Tennis,4.037,10889,"Comedy, Drama, Shounen, Sports, Tennis, Based ...",178,"Meet Ryoma Echizen, the cocky prince of tennis..."
1,100,Neon Genesis Evangelion,4.248,54463,"Drama, Mecha, Sci Fi, Conspiracy, Kaijuu, Lone...",26,"In the future, a devastating event known as Se..."
2,1000,Full Metal Panic! The Second Raid,4.35,23948,"Action, Comedy, Mecha, Sci Fi, Shounen, Milita...",13,Half a year has passed since Sousuke Sagara to...
3,10000,22/7: Shampoo no Nioi ga Shita,2.8,131,"Idols, School Life, CG Animation",1,The idol group 22/7 perform the song Shampoo n...
4,10001,Oshiri Tantei: Puputto Fumutto Kaiketsu Dance,1.271,21,Family Friendly,1,No synopsis yet - check back soon!


In [6]:
# Narrow the watch list down to the four columns needed for the merge below.
anime_list = anime_list.loc[:, ['user_id', 'anime_id', 'rating', 'watching_status']]
anime_list.head(5)

Unnamed: 0,user_id,anime_id,rating,watching_status
0,0,7173,0.0,4
1,0,5323,0.0,2
2,0,5028,0.0,2
3,0,1048,0.0,4
4,0,12221,0.0,2


In [7]:
# Preview the first five rows of the explicit ratings table.
anime_ratings.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,0,147,5.0
1,1,1512,4.5
2,1,599,4.0
3,1,2292,4.5
4,1,1078,3.0


In [8]:
# Keep only ratings made by users who have strictly more than
# MIN_RATINGS_PER_USER rating rows. Re-running the cell is safe: the users that
# survive the filter keep all of their rows, so their counts do not change.
MIN_RATINGS_PER_USER = 200

ratings_per_user = anime_ratings['user_id'].value_counts()
active_user_ids = ratings_per_user[ratings_per_user > MIN_RATINGS_PER_USER].index
anime_ratings = anime_ratings[anime_ratings['user_id'].isin(active_user_ids)]

In [9]:
# Attach each rating's watching_status by inner-joining the ratings with the
# watch list on the (user, anime, rating) triple.
user_watch_info_with_ratings = anime_ratings.merge(
    anime_list,
    how='inner',
    on=['user_id', 'anime_id', 'rating'],
)
user_watch_info_with_ratings.head(10)

Unnamed: 0,user_id,anime_id,rating,watching_status
0,6,5760,4.5,1
1,6,3717,4.0,1
2,6,9391,3.5,1
3,6,2123,4.0,1
4,6,286,4.0,1
5,6,8690,5.0,1
6,6,122,3.5,1
7,6,236,4.5,1
8,6,5964,3.0,1
9,6,8608,4.5,1


In [10]:
# Minimum number of site votes a title needs to be kept.
MIN_NUMBER_VOTES = 200

# Join every (user, anime, rating) row with the anime's metadata.
combined_anime_information = pd.merge(user_watch_info_with_ratings, anime_information, on='anime_id')

# Parse 'Number Votes' into numbers. The column is handled as text because it
# can carry thousands separators ("1,234") and the placeholder "Unknown".
# `errors='coerce'` turns every unparseable value -- "Unknown", but also the
# "nan" text that `astype(str)` produces for missing values -- into NaN, so
# those rows are dropped below instead of crashing the integer cast.
number_votes = pd.to_numeric(
    combined_anime_information['Number Votes']
    .astype(str)
    .str.replace(',', '', regex=False),
    errors='coerce',
)

# Build the final frame as one chain of new objects (no in-place mutation of a
# filtered frame): drop rows without a vote count, cast to int, keep
# sufficiently voted titles, and keep one row per (user, title) so the later
# pivot on 'Name' has a single rating per cell.
combined_anime_information = (
    combined_anime_information
    .assign(**{'Number Votes': number_votes})
    .dropna(subset=['Number Votes'])
    .astype({'Number Votes': int})
    .loc[lambda frame: frame['Number Votes'] >= MIN_NUMBER_VOTES]
    .drop_duplicates(['user_id', 'Name'])
)

# Fixed seed so the preview is identical on every run.
combined_anime_information.sample(10, random_state=0)

Unnamed: 0,user_id,anime_id,rating,watching_status,Name,Rating Score,Number Votes,Tags,Episodes,Synopsis
2245266,33437,15781,4.0,1,Horimiya,4.535,12648,"Comedy, Romance, Shounen, Romantic Comedy, Sch...",13,Kyouko and Izumi are two classmates who each l...
2761806,41201,70,3.0,1,Hellsing,4.098,46703,"Action, Horror, Seinen, Conspiracy, England, E...",13,"In present day England, a war is being fought...."
2915933,43467,7458,4.0,1,Magical Girl Raising Project,3.577,3393,"Action, Fantasy, Magical Girl, Battle Royale, ...",12,Magical Girl Raising Project is a popular soci...
1438225,20976,6149,4.5,1,Monthly Girls' Nozaki-kun,4.299,25438,"Comedy, Romance, Shounen, Gag, Manga Industry,...",12,"Chiyo Sakura, a high school girl who fell in l..."
3115465,46262,3059,1.5,1,Spice and Wolf II Specials,3.538,4176,"Fantasy, Animal Characteristics, Economics, Ed...",2,Holo has decided to take it upon herself to ed...
4434063,64937,3031,1.0,1,"The Familiar of Zero: ""Rondo"" of Princesses - ...",3.78,7426,"Comedy, Ecchi, Master-Servant Relationship",1,Since the trip to the great cathedral in Romal...
3068522,45567,7931,3.5,1,Cardcaptor Sakura: Clear Card-hen,4.126,2295,"Fantasy, Magical Girl, Shoujo, Contemporary Fa...",22,Sakura had managed to capture all of the Clow ...
387245,5718,14141,3.5,1,7 Seeds 2nd Season,3.958,1698,"Action, Adventure, Drama, Josei, Person in a S...",12,Second season of 7 Seeds .
3065191,45532,6832,3.5,1,Wolf Girl and Black Prince OVA,3.883,2538,"Shoujo, Based on a Manga",1,No synopsis yet - check back soon!
1608207,23722,3229,4.5,1,Big Windup! The Basics of Basics,3.763,835,"Comedy, Seinen, Sports, Baseball, Based on a M...",1,"Abe wasn’t a fan of Haruna, but there’s a lot ..."


In [15]:
# Build the title x user rating matrix: one row per anime 'Name', one column
# per 'user_id'. Cells with no rating are filled with 0.
anime_pivot_table = (
    combined_anime_information
    .pivot_table(index="Name", columns="user_id", values="rating")
    .fillna(0)
)
anime_pivot_table

user_id,6,8,10,20,25,26,27,28,37,45,...,79231,79232,79253,79258,79259,79262,79264,79266,79272,79300
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Bungaku Shoujo"" Memoire",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Bungaku Shoujo"" Movie",3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(Making of) Evangelion: Another Impact,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+A-Channel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC Movie: A Midsummer Night's Dream,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xxxHOLiC Rou,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xxxHOLiC Shunmuki,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
your name.,5.0,5.0,5.0,4.5,0.0,4.5,4.5,0.0,0.0,5.0,...,0.0,0.0,0.0,5.0,3.5,4.0,0.0,3.5,5.0,5.0


In [21]:
# Store the (mostly zero) rating matrix in compressed sparse row format.
sparse_pivot_table = csr_matrix(anime_pivot_table.to_numpy())
sparse_pivot_table

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5244236 stored elements and shape (6663, 12447)>

In [25]:
# Fit a brute-force nearest-neighbours index over the sparse rating matrix
# (one sample per row of the matrix). `fit` returns the estimator itself.
model = NearestNeighbors(algorithm="brute").fit(sparse_pivot_table)
model

In [29]:
# Persist the fitted model and the lookup artifacts.
anime_names = anime_pivot_table.index

# Output path -> object to pickle.
artifacts_to_save = {
    'models/model.pkl': model,
    'artifacts/anime_name.pkl': anime_names,
    'artifacts/final_information.pkl': combined_anime_information,
    'artifacts/pivot_table.pkl': anime_pivot_table,
}

for artifact_path, artifact in artifacts_to_save.items():
    # Create the target directory if it is missing so a fresh checkout can run
    # this cell without manual setup.
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
    # The context manager guarantees the handle is flushed and closed, even if
    # pickling fails part-way through.
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)