In [103]:
import pandas as pd
from math import sqrt
import numpy as np

In [104]:
anime_df = pd.read_csv('anime.csv')
rating_df = pd.read_csv('rating.csv')
print(anime_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [105]:
userInput = [{'name':'Mobile Suit Gundam: The Origin', 'rating':8.42},
             {'name':'Mobile Suit Gundam Unicorn', 'rating':8.4},
             {'name':'Mobile Suit Gundam Thunderbolt', 'rating':8},
             {'name':'Mobile Suit Gundam: Iron-Blooded Orphans', 'rating':7.96},
             {'name':'Turn A Gundam', 'rating':7.76}]
inputAnimes = pd.DataFrame(userInput)
print(inputAnimes)

                                       name  rating
0            Mobile Suit Gundam: The Origin    8.42
1                Mobile Suit Gundam Unicorn    8.40
2            Mobile Suit Gundam Thunderbolt    8.00
3  Mobile Suit Gundam: Iron-Blooded Orphans    7.96
4                             Turn A Gundam    7.76


In [106]:
inputId = anime_df[anime_df['name'].isin(inputAnimes['name'].tolist())]
inputAnimes = pd.merge(inputId, inputAnimes)
inputAnimes = inputAnimes[['anime_id','name','rating']]
print(inputAnimes)

   anime_id                                      name  rating
0     10937            Mobile Suit Gundam: The Origin    8.42
1      6336                Mobile Suit Gundam Unicorn    8.40
2     31973            Mobile Suit Gundam Thunderbolt    8.00
3     31251  Mobile Suit Gundam: Iron-Blooded Orphans    7.96
4        95                             Turn A Gundam    7.76


In [107]:
userSubset = rating_df[rating_df['anime_id'].isin(inputAnimes['anime_id'].tolist())]
print(userSubset.groupby('anime_id').count())

          user_id  rating
anime_id                 
95            489     489
6336          838     838
31251        1207    1207
31973         304     304


In [108]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['anime_id'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((31251,),          user_id  anime_id  rating
3813          39     31251      -1
5988          54     31251      -1
9966         123     31251      10
13248        166     31251       8
15444        198     31251      -1
...          ...       ...     ...
7763833    73099     31251      10
7778924    73206     31251      10
7779003    73208     31251       6
7781934    73234     31251       6
7789323    73288     31251      10

[1207 rows x 3 columns]), ((6336,),          user_id  anime_id  rating
8919         109      6336      10
18775        234      6336       7
21554        261      6336       7
31663        352      6336       8
39601        435      6336       9
...          ...       ...     ...
7769119    73135      6336       8
7778805    73206      6336       9
7781527    73234      6336       8
7783991    73255      6336       8
7792648    73320      6336       8

[838 rows x 3 columns]), ((95,),          user_id  anime_id  rating
14111        183        95       9
19905  

In [109]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='anime_id')
    inputAnimes = inputAnimes.sort_values(by='anime_id')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputAnimes[inputAnimes['anime_id'].isin(group['anime_id'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [110]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex   user_id
0        -0.061062  (31251,)
1         0.028134   (6336,)
2         0.036062     (95,)
3         0.010786  (31973,)


In [111]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

   similarityIndex   user_id
2         0.036062     (95,)
1         0.028134   (6336,)
3         0.010786  (31973,)
0        -0.061062  (31251,)


In [112]:
# The Datatypes needed to be change due to the incompatibily of doing the ".merge"
topUsers['user_id'] = topUsers['user_id'].astype(str).str.extract('(\d+)').astype('int64')

topUsersRating = pd.merge(topUsers, rating_df, on='user_id', how='inner')
print(topUsersRating.head(100))

  topUsers['user_id'] = topUsers['user_id'].astype(str).str.extract('(\d+)').astype('int64')


    similarityIndex  user_id  anime_id  rating
0          0.036062       95        20       7
1          0.036062       95        43       8
2          0.036062       95       164       9
3          0.036062       95       185       6
4          0.036062       95       186       6
..              ...      ...       ...     ...
95         0.036062       95     16417      10
96         0.036062       95     16592      -1
97         0.036062       95     18229      -1
98         0.028134     6336         1       9
99         0.028134     6336         5       8

[100 rows x 4 columns]


In [113]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  user_id  anime_id  rating  weightedRating
0         0.036062       95        20       7        0.252434
1         0.036062       95        43       8        0.288496
2         0.036062       95       164       9        0.324558
3         0.036062       95       185       6        0.216372
4         0.036062       95       186       6        0.216372


In [114]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('anime_id').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

          sum_similarityIndex  sum_weightedRating
anime_id                                         
1                   -0.032928            0.314266
5                   -0.032928            0.286132
6                    0.028134            0.225070
15                   0.038920            0.332933
20                   0.003134            0.482299


In [115]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Taking the Weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['anime_id'] = tempTopUsersRating.index
print(recommendation_df.head(10))

          weighted average recommendation score  anime_id
anime_id                                                 
1                                     -9.543934         1
5                                     -8.689541         5
6                                      8.000000         6
15                                     8.554278        15
20                                   153.908381        20
30                                    -6.126361        30
31                                    -1.000000        31
32                                    -1.000000        32
43                                   -13.982282        43
44                                     9.000000        44


In [116]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

          weighted average recommendation score  anime_id
anime_id                                                 
9756                                 158.968419      9756
20                                   153.908381        20
19815                                 10.000000     19815
4181                                  10.000000      4181
550                                   10.000000       550
...                                         ...       ...
199                                  -16.867233       199
14513                                -17.793937     14513
1575                                 -19.064538      1575
10620                                -32.181407     10620
11111                                -37.255624     11111

[247 rows x 2 columns]


In [117]:
recommended_anime=anime_df.loc[anime_df['anime_id'].isin(recommendation_df['anime_id'])]

#we don't want to recommend the same movie
recommended_anime=recommended_anime.loc[~recommended_anime.anime_id.isin(userSubset['anime_id'])]

print(recommended_anime)

       anime_id                                name  \
1          5114    Fullmetal Alchemist: Brotherhood   
3          9253                         Steins;Gate   
10         4181                Clannad: After Story   
13         2904  Code Geass: Hangyaku no Lelouch R2   
15          199       Sen to Chihiro no Kamikakushi   
...         ...                                 ...   
6396       8939                   Bohemian Rhapsody   
6971       1442                     Alexander Senki   
11118      2238                        Fuyu no Semi   
11139       719                        Ai no Kusabi   
12096       724                               Enzai   

                                                   genre   type episodes  \
1      Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64   
3                                       Sci-Fi, Thriller     TV       24   
10     Drama, Fantasy, Romance, Slice of Life, Supern...     TV       24   
13     Action, Drama, Mecha, Milita