In [66]:
import pandas as pd
from math import sqrt
import numpy as np

In [67]:
animes_df = pd.read_csv('anime.csv')
ratings_df = pd.read_csv('rating.csv')
print(animes_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [68]:
animes_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [69]:
userInput = [{'name':'Kimi no Na wa.', 'rating':9.37},
             {'name':'Fullmetal Alchemist: Brotherhood', 'rating':9.26},
             {'name':'Steins;Gate', 'rating':9.17},
             {'name':'Under World', 'rating':4.28},
             {'name':'Violence Gekiga David no Hoshi', 'rating':4.98}]
inputAnimes = pd.DataFrame(userInput)
print(inputAnimes)

                               name  rating
0                    Kimi no Na wa.    9.37
1  Fullmetal Alchemist: Brotherhood    9.26
2                       Steins;Gate    9.17
3                       Under World    4.28
4    Violence Gekiga David no Hoshi    4.98


In [70]:
inputId = animes_df[animes_df['name'].isin(inputAnimes['name'].tolist())]
inputAnimes = pd.merge(inputId, inputAnimes)
inputAnimes = inputAnimes[['anime_id','name','rating']]
print(inputAnimes)

   anime_id                              name  rating
0     32281                    Kimi no Na wa.    9.37
1      5114  Fullmetal Alchemist: Brotherhood    9.26
2      9253                       Steins;Gate    9.17
3      5543                       Under World    4.28


In [71]:
userSubset = ratings_df[ratings_df['anime_id'].isin(inputAnimes['anime_id'].tolist())]
print(userSubset.groupby('anime_id').count())

          user_id  rating
anime_id                 
5114        24574   24574
5543            4       4
9253        19283   19283
32281        2199    2199


In [72]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['anime_id'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((5114,),          user_id  anime_id  rating
183            3      5114      10
1165          10      5114      10
1250          11      5114       8
1295          12      5114       9
1718          17      5114      10
...          ...       ...     ...
7811580    73499      5114      10
7812050    73500      5114       9
7813047    73504      5114      10
7813249    73507      5114       9
7813665    73515      5114      10

[24574 rows x 3 columns]), ((9253,),          user_id  anime_id  rating
516            5      9253       9
1166          10      9253      -1
1259          11      9253       7
1301          12      9253      10
1400          13      9253      -1
...          ...       ...     ...
7810408    73485      9253       6
7811706    73499      9253      10
7812523    73502      9253      10
7812832    73503      9253       9
7813697    73515      9253       9

[19283 rows x 3 columns]), ((32281,),          user_id  anime_id  rating
8271          99     32281       5
11

In [73]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='anime_id')
    inputAnimes = inputAnimes.sort_values(by='anime_id')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputAnimes[inputAnimes['anime_id'].isin(group['anime_id'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [74]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex   user_id
0         0.003564   (5114,)
1         0.001874   (9253,)
2        -0.021030  (32281,)
3        -0.493742   (5543,)


In [75]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

   similarityIndex   user_id
0         0.003564   (5114,)
1         0.001874   (9253,)
2        -0.021030  (32281,)
3        -0.493742   (5543,)


In [76]:
# Datatypes were not compatable for doing ".merge", so i had to change the type
topUsers['user_id'] = topUsers['user_id'].astype(str).str.extract('(\d+)').astype('int64')

topUsersRating = pd.merge(topUsers, ratings_df, on='user_id', how='inner')
print(topUsersRating.head(100))

    similarityIndex  user_id  anime_id  rating
0          0.003564     5114        20       9
1          0.003564     5114       121       9
2          0.003564     5114       174       6
3          0.003564     5114       205       9
4          0.003564     5114       223       4
..              ...      ...       ...     ...
95        -0.493742     5543       356      10
96        -0.493742     5543       357       7
97        -0.493742     5543       376       8
98        -0.493742     5543       403       9
99        -0.493742     5543       430       9

[100 rows x 4 columns]


In [77]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  user_id  anime_id  rating  weightedRating
0         0.003564     5114        20       9        0.032074
1         0.003564     5114       121       9        0.032074
2         0.003564     5114       174       6        0.021383
3         0.003564     5114       205       9        0.032074
4         0.003564     5114       223       4        0.014255


In [78]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('anime_id').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

          sum_similarityIndex  sum_weightedRating
anime_id                                         
1                   -0.493742           -4.937419
5                   -0.493742           -4.443677
6                   -0.493742           -4.937419
20                  -0.490178           -4.905345
27                  -0.493742           -4.937419


In [79]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['anime_id'] = tempTopUsersRating.index
print(recommendation_df.head(10))

          weighted average recommendation score  anime_id
anime_id                                                 
1                                      10.00000         1
5                                       9.00000         5
6                                      10.00000         6
20                                     10.00727        20
27                                     10.00000        27
30                                      9.00000        30
31                                     10.00000        31
32                                     10.00000        32
57                                      9.00000        57
59                                      9.00000        59


In [80]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

          weighted average recommendation score  anime_id
anime_id                                                 
1535                                  10.040463      1535
205                                   10.007270       205
20                                    10.007270        20
121                                   10.007270       121
4548                                  10.000000      4548
...                                         ...       ...
5530                                  -1.000000      5530
18153                                 -1.000000     18153
11979                                 -1.000000     11979
10620                                 -1.000000     10620
19815                                 -1.000000     19815

[224 rows x 2 columns]


In [81]:
recommended_anime=animes_df.loc[animes_df['anime_id'].isin(recommendation_df['anime_id'])]

#we don't want to recommend the same anime
recommended_anime=recommended_anime.loc[~recommended_anime.anime_id.isin(userSubset['anime_id'])]

print(recommended_anime)

      anime_id                                               name  \
13        2904                 Code Geass: Hangyaku no Lelouch R2   
15         199                      Sen to Chihiro no Kamikakushi   
19        1575                    Code Geass: Hangyaku no Lelouch   
22           1                                       Cowboy Bebop   
24         164                                      Mononoke Hime   
...        ...                                                ...   
7208       964                                      Hit wo Nerae!   
7216      1741                             Hit wo Nerae! Specials   
7694       765                   Chou Henshin Cosprayers Specials   
7714       112                            Chou Henshin Cosprayers   
7844       705  Chou Henshin Cosprayers vs. Ankoku Uchuu Shoug...   

                                                  genre     type episodes  \
13    Action, Drama, Mecha, Military, Sci-Fi, Super ...       TV       25   
15               