In [32]:
import pandas as pd
import numpy as np
from math import sqrt

In [33]:
anime_df = pd.read_csv('archive/anime.csv')
rating_df = pd.read_csv('archive/rating.csv')
print(anime_df.info())
anime_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [51]:
userInput = [{'name':'Shingeki no Kyojin', 'rating':10},
             {'name':'Tokyo Ghoul √A', 'rating':2},
             {'name':'Pokemon XY', 'rating':2},
             {'name':'Kimi no Na wa.', 'rating':10},
             {'name':'Mushishi', 'rating':9}]
inputAnime = pd.DataFrame(userInput)
print(inputAnime)

                 name  rating
0  Shingeki no Kyojin      10
1      Tokyo Ghoul √A       2
2          Pokemon XY       2
3      Kimi no Na wa.      10
4            Mushishi       9


In [52]:
inputAnime['name'] = inputAnime['name'].str.strip()
anime_df['name'] = anime_df['name'].str.strip()

# merge df
result_df = pd.merge(anime_df[['anime_id', 'name']], inputAnime, on='name')

# column selection
result_df = result_df[['anime_id', 'name', 'rating']].rename(columns={'anime_id': 'anime_id'})

print("Final DataFrame:")
print(result_df)

Final DataFrame:
   anime_id                name  rating
0     32281      Kimi no Na wa.      10
1       457            Mushishi       9
2     16498  Shingeki no Kyojin      10
3     19291          Pokemon XY       2
4     27899      Tokyo Ghoul √A       2


In [53]:
# merge rating dataframe with result dataframe on anime id
merged_ratings = pd.merge(rating_df, result_df, on='anime_id')

# group by anime id and count  occurrences
similar_taste_counts = merged_ratings.groupby('anime_id').size().reset_index(name='count')

print(similar_taste_counts)

   anime_id  count
0       457   7004
1     16498  29584
2     19291    364
3     27899  11549
4     32281   2199


In [75]:
# merge with rating_df to get actual ratings
merged_ratings = pd.merge(similar_taste_counts, rating_df, on='anime_id')

# filter out rows with negative ratings
merged_ratings = merged_ratings[merged_ratings['rating'] >= 0]

# group by 'user_id'
userSubsetGroup = merged_ratings.groupby('user_id')

# define function to get the count of anime_ids each user has in common with the input
def count_common_anime(x):
    return len(x)

# sort users by the count of common anime_ids in descending order
userSubsetGroup = sorted(userSubsetGroup, key=count_common_anime, reverse=True)

# select the top 100 users
userSubsetGroup = userSubsetGroup[0:100]

# print the userSubsetGroup
for group in userSubsetGroup:
    user_id, user_data = group
    print(f"User ID: {user_id}")
    
    # display anime info
    for _, row in user_data.iterrows():
        print(f"  Anime ID: {row['anime_id']}, Count: {row['count']}, Rating: {row['rating']}")
    
    print()  


User ID: 3
  Anime ID: 16498, Count: 29584, Rating: 10
  Anime ID: 27899, Count: 11549, Rating: 8

User ID: 5
  Anime ID: 27899, Count: 11549, Rating: 2

User ID: 11
  Anime ID: 16498, Count: 29584, Rating: 9

User ID: 12
  Anime ID: 16498, Count: 29584, Rating: 10

User ID: 14
  Anime ID: 16498, Count: 29584, Rating: 8

User ID: 17
  Anime ID: 16498, Count: 29584, Rating: 9
  Anime ID: 27899, Count: 11549, Rating: 4

User ID: 18
  Anime ID: 27899, Count: 11549, Rating: 7

User ID: 21
  Anime ID: 457, Count: 7004, Rating: 10
  Anime ID: 16498, Count: 29584, Rating: 8

User ID: 22
  Anime ID: 457, Count: 7004, Rating: 10

User ID: 24
  Anime ID: 16498, Count: 29584, Rating: 8

User ID: 25
  Anime ID: 16498, Count: 29584, Rating: 9

User ID: 27
  Anime ID: 16498, Count: 29584, Rating: 9

User ID: 29
  Anime ID: 16498, Count: 29584, Rating: 8

User ID: 30
  Anime ID: 16498, Count: 29584, Rating: 9
  Anime ID: 27899, Count: 11549, Rating: 9

User ID: 33
  Anime ID: 16498, Count: 29584, Rat

In [76]:
# error handling, printing column names
print(inputAnime.columns)


Index(['name', 'rating'], dtype='object')


In [79]:
# Storing Pearson Correlation in a distionary.
pearsonCorrelationDict = {}

for name, group in userSubsetGroup:
    
    # sort input and current user group to avoid complications
    group = group.sort_values(by='anime_id')
    result_df = result_df.sort_values(by='anime_id')
    
    # get n for formula
    
    nRatings = len(group)
    
    # get review scores for movies in common
    temp_df = result_df[result_df['anime_id'].isin(group['anime_id'].tolist())]
    
    #put current user group review in list
    tempGroupList = group['rating'].tolist()
    
    #temporary buffer var
    tempRatingList = temp_df['rating'].tolist()
    
    # calc pearson correlation
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    # if  denominator is different than zero then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [80]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  user_id
0              1.0        3
1              0.0        5
2              0.0       11
3              0.0       12
4              0.0       14


In [81]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  user_id
0               1.0        3
16              1.0       38
94              1.0      228
86              1.0      211
85              1.0      210


In [82]:
print("Columns in topUsers:", topUsers.columns)

Columns in topUsers: Index(['similarityIndex', 'user_id'], dtype='object')


In [83]:

topUsersRating=topUsers.merge(rating_df, left_on='user_id', right_on='user_id', how='inner')
print(topUsersRating.head(100))

    similarityIndex  user_id  anime_id  rating
0               1.0        3        20       8
1               1.0        3       154       6
2               1.0        3       170       9
3               1.0        3       199      10
4               1.0        3       225       9
..              ...      ...       ...     ...
95              1.0       38        30      10
96              1.0       38        32      10
97              1.0       38        47       7
98              1.0       38        71       3
99              1.0       38        72       1

[100 rows x 4 columns]


In [84]:
# multiply similarity of user rating
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  user_id  anime_id  rating  weightedRating
0              1.0        3        20       8             8.0
1              1.0        3       154       6             6.0
2              1.0        3       170       9             9.0
3              1.0        3       199      10            10.0
4              1.0        3       225       9             9.0


In [86]:
# group by anime id and sum
tempTopUsersRating = topUsersRating.groupby('anime_id').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

          sum_similarityIndex  sum_weightedRating
anime_id                                         
1                    5.049861           49.498605
5                    1.453821           13.084389
6                    3.000000           24.000000
7                    1.000000            9.000000
15                   1.000000           10.000000


In [87]:
# new dataframe
recommendation_df = pd.DataFrame()

# weighted advantage
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['anime_id'] = tempTopUsersRating.index
print(recommendation_df.head(10))

          weighted average recommendation score  anime_id
anime_id                                                 
1                                      9.801975         1
5                                      9.000000         5
6                                      8.000000         6
7                                      9.000000         7
15                                    10.000000        15
16                                     9.000000        16
18                                     9.000000        18
19                                    10.000000        19
20                                     7.850041        20
22                                     7.000000        22


In [88]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

          weighted average recommendation score  anime_id
anime_id                                                 
19363                                      10.0     19363
28735                                      10.0     28735
10681                                      10.0     10681
15335                                      10.0     15335
13833                                      10.0     13833
...                                         ...       ...
32608                                       NaN     32608
33046                                       NaN     33046
33161                                       NaN     33161
33221                                       NaN     33221
33558                                       NaN     33558

[2040 rows x 2 columns]


In [89]:
recommended_anime=anime_df.loc[anime_df['anime_id'].isin(recommendation_df['anime_id'])]

# ensures movie is unique
recommended_anime=recommended_anime.loc[~recommended_anime.anime_id.isin(merged_ratings['anime_id'])]

print(recommended_anime)

       anime_id                                               name  \
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama°   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
5         32935  Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...   
...         ...                                                ...   
12174      2349               Bondage Game: Shinsou no Reijoutachi   
12193      2592                                             Kimera   
12209      5391                                      Pico to Chico   
12213      4866                                Pico x CoCo x Chico   
12217      1639                                       Boku no Pico   

                                                   genre type episodes  \
1      Action, Adventure, Drama, Fantasy, Magic, Mili...   TV       64   
2      Acti