Importing library

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

Reading csv anime

In [2]:
anime_df = pd.read_csv('data/anime.csv', index_col=False)
anime_1 = anime_df[['MAL_ID', 'Name', 'Score']]
anime_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   MAL_ID  17562 non-null  int64 
 1   Name    17562 non-null  object
 2   Score   17562 non-null  object
dtypes: int64(1), object(2)
memory usage: 411.7+ KB


Dropping unknown labels

In [3]:
anime = pd.DataFrame({'MAL_ID':[1], 'Name': ['Cowboy Bebop'], 'Score': ['8.78'] })

i = 0

for x in range(1, len(anime_1)):
    if anime_1.iloc[x].Score != 'Unknown':
        i+=1
        anime.loc[i] = anime_1.iloc[x].to_list()

anime['Score'] = anime['Score'].astype('float64')
anime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12421 entries, 0 to 12420
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   MAL_ID  12421 non-null  int64  
 1   Name    12421 non-null  object 
 2   Score   12421 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 388.2+ KB


In [4]:
anime.head(1)

Unnamed: 0,MAL_ID,Name,Score
0,1,Cowboy Bebop,8.78


Taking anime only with 7 + Score

In [5]:
temp_anime = pd.DataFrame({'MAL_ID':[1], 'Name': ['Cowboy Bebop'], 'Score': ['8.78']})
i = 0

for x in range(1, len(anime)):
    if anime.iloc[x].Score >= 7.00:
        i +=1
        temp_anime.loc[i] = anime.iloc[x]


temp_anime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3780 entries, 0 to 3779
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   MAL_ID  3780 non-null   int64 
 1   Name    3780 non-null   object
 2   Score   3780 non-null   object
dtypes: int64(1), object(2)
memory usage: 118.1+ KB


As we picked up 7 + score anime, we don't need Score column and temp dataframe, also we need to change MAL_ID to anime_id as we have to connect it to another bigger dataframe with this column

In [6]:
del anime
anime = temp_anime[['MAL_ID', 'Name']]
anime.reset_index(inplace=True, drop=True)
anime = anime.rename(columns={'MAL_ID' : 'anime_id'})
anime.head(100)

Unnamed: 0,anime_id,Name
0,1,Cowboy Bebop
1,5,Cowboy Bebop: Tengoku no Tobira
2,6,Trigun
3,7,Witch Hunter Robin
4,15,Eyeshield 21
...,...,...
95,147,Kimi ga Nozomu Eien
96,150,Blood+
97,151,Re: Cutey Honey
98,152,Solty Rei


In [7]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3780 entries, 0 to 3779
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  3780 non-null   int64 
 1   Name      3780 non-null   object
dtypes: int64(1), object(1)
memory usage: 59.2+ KB


Reading the Bigger column with rates by watchers

In [8]:
anime_watchers_df = pd.read_csv('data/rating_complete.csv')


In [9]:
anime_watchers_df = anime_watchers_df.head(400000)
anime_watchers_df.reset_index(inplace=True,drop=True)

anime_watchers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   400000 non-null  int64
 1   anime_id  400000 non-null  int64
 2   rating    400000 non-null  int64
dtypes: int64(3)
memory usage: 9.2 MB


In [10]:
anime_watchers_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


We need list of anime ids to take only 7 + of them

In [11]:
anime_watchers = pd.DataFrame()

In [12]:

anime_watchers = pd.merge(anime_watchers_df, anime, on='anime_id')

anime_watchers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311893 entries, 0 to 311892
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   311893 non-null  int64 
 1   anime_id  311893 non-null  int64 
 2   rating    311893 non-null  int64 
 3   Name      311893 non-null  object
dtypes: int64(3), object(1)
memory usage: 11.9+ MB


Checking for null values

In [13]:
anime_watchers.isnull().sum()

user_id     0
anime_id    0
rating      0
Name        0
dtype: int64

Creating pivot table but still we don't need Names, we just save it somewhere for a while . . .

In [14]:
table = pd.pivot_table(anime_watchers, values='rating', columns='Name', index='user_id')

table.head()

Name,"""Bungaku Shoujo"" Memoire","""Bungaku Shoujo"" Movie",.hack//G.U. Trilogy,.hack//Quantum,.hack//The Movie: Sekai no Mukou ni,07-Ghost,11-nin Iru!,3-gatsu no Lion,3-gatsu no Lion 2nd Season,3-gatsu no Lion meets Bump of Chicken,...,ef: A Tale of Melodies. - Prologue,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,ef: A Tale of Memories. - Recollections,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,9.0,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,8.0,9.0,,,8.0


We need now means for all watchers and subtract it from the rate in order to make 0 neutral, not negative rate

Creating function for finding similar anime

In [23]:
def get_similar_anime(anime_name, anime_score):
    similar_score = item_similarity_df[anime_name]*(anime_score)
    similar_score = similar_score.sort_values(ascending=False)

    return similar_score

Creating function for standardizing

In [16]:
def standardize(row):
    new_row = (row - row.mean())/(row.max() - row.min())
    return new_row

table_std = table.apply(standardize)

Filling NaN

In [17]:
table_std.fillna(0, inplace=True)

Implementing cosine similarity

In [24]:
item_similarity = cosine_similarity(table_std.T)
item_similarity_df = pd.DataFrame(item_similarity, index=table.columns, columns=table.columns)
item_similarity_df.head(200)

Name,"""Bungaku Shoujo"" Memoire","""Bungaku Shoujo"" Movie",.hack//G.U. Trilogy,.hack//Quantum,.hack//The Movie: Sekai no Mukou ni,07-Ghost,11-nin Iru!,3-gatsu no Lion,3-gatsu no Lion 2nd Season,3-gatsu no Lion meets Bump of Chicken,...,ef: A Tale of Melodies. - Prologue,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,ef: A Tale of Memories. - Recollections,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Bungaku Shoujo"" Memoire",1.000000,0.449992,0.077268,0.019007,0.022523,0.031480,0.063929,0.022810,0.000267,0.081941,...,0.084786,0.044391,0.005443,0.027430,0.000000,0.058842,0.065309,0.101408,0.076395,0.066788
"""Bungaku Shoujo"" Movie",0.449992,1.000000,0.036838,0.042507,-0.000399,0.015066,0.040884,-0.004411,0.005326,-0.015798,...,0.042501,0.076004,0.053636,0.007858,0.042883,0.080809,0.095826,0.083229,0.085788,0.102067
.hack//G.U. Trilogy,0.077268,0.036838,1.000000,0.244860,0.223819,0.112612,0.082477,0.193434,0.257176,-0.000809,...,0.005632,0.009283,-0.019646,0.274228,0.019675,0.091245,0.112053,0.096234,0.170714,0.128953
.hack//Quantum,0.019007,0.042507,0.244860,1.000000,0.298833,-0.000531,-0.137178,-0.026704,-0.004033,-0.007493,...,0.015412,0.022118,0.045003,0.018249,0.012796,0.106794,0.109664,0.109954,0.160206,0.124533
.hack//The Movie: Sekai no Mukou ni,0.022523,-0.000399,0.223819,0.298833,1.000000,0.015223,0.004819,-0.023694,-0.034214,-0.002623,...,0.018262,0.059345,0.093210,0.023114,0.001112,0.049246,0.074493,-0.043373,0.101011,0.053701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Asobi Asobase Specials,0.000000,0.000000,-0.026701,0.012692,0.000000,0.000000,-0.076260,0.015914,-0.018037,0.131412,...,0.000000,-0.043072,0.000000,0.000000,-0.043732,0.023212,-0.000848,0.006608,-0.030670,-0.017108
Astro Boy: Tetsuwan Atom,0.021590,-0.004776,0.027847,0.027310,0.032361,-0.017194,0.000000,0.068999,0.123735,-0.002286,...,0.015914,0.046074,0.077355,0.141542,0.221437,0.034872,0.102274,0.074283,0.025381,0.138660
Asura,0.020510,-0.009906,0.244212,0.102109,0.067012,0.006496,0.034948,0.123545,0.181973,-0.010782,...,0.049702,0.051413,0.033484,-0.014348,0.046032,-0.020545,-0.002181,0.002755,0.053452,0.105830
Asura Cryin' 2,0.014943,0.022397,0.010022,0.017021,0.027235,0.023463,0.000000,0.011956,-0.017033,-0.001528,...,0.012512,0.043885,0.039457,0.081194,0.163791,0.078924,0.092359,0.118781,0.121077,0.154306


Making list of new users preferences

In [19]:
anime_list = [("Naruto", 10), ('Akame ga Kill!', 8), ('Baki', 9), ('Black Clover', 7) , ('Charlotte', 10), ('Grand Blue', 10), ('Bleach', 6), ('Kimi no Na wa.', 9), ('Death Note', 10), ('Kimetsu no Yaiba', 8), ('Hataraku Maou-sama!', 7), ('Tate no Yuusha no Nariagari', 10), ('One Punch Man', 9)]

Turning to testing

In [25]:
similar_anime = pd.DataFrame()

for anime, rating in anime_list:
    similar_anime = similar_anime.append(get_similar_anime(anime, rating), ignore_index=True)


similar_anime = similar_anime.sum().sort_values(ascending=False)

  similar_anime = similar_anime.append(get_similar_anime(anime, rating), ignore_index=True)
  similar_anime = similar_anime.append(get_similar_anime(anime, rating), ignore_index=True)
  similar_anime = similar_anime.append(get_similar_anime(anime, rating), ignore_index=True)


KeyError: 'Black Clover'

In [26]:
similar_anime.sum().sort_values(ascending=False)

Name
Naruto                                                             12.176066
Baki                                                               10.155953
Akame ga Kill!                                                     10.097779
Naruto: Shippuuden                                                  5.797281
Tokyo Ghoul                                                         5.582982
                                                                     ...    
Aikatsu Stars!                                                     -1.168202
Master Keaton                                                      -1.367042
Tsunpri                                                            -1.578159
Chikan Shita Joshi*sei to Sonogo, Musabori Au Youna Doero Junai    -1.907490
Shirobako Movie                                                    -2.078488
Length: 3674, dtype: float64

Vuala saving result top 100 of them to docx