In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix

## Load the data from the Excel sheets

In [2]:
ANIME_CSV_PATH = "./datasets/anime.csv"
RATING_CSV_PATH = "./datasets/rating.csv"

In [3]:
anime_df = pd.read_csv(ANIME_CSV_PATH)
rating_df = pd.read_csv(RATING_CSV_PATH)

## Preprocess the data

In [4]:
rating_df.rating.replace({-1: np.nan}, inplace=True)

In [5]:
anime_tv_df = anime_df[anime_df['type'] == 'TV']

In [6]:
merged_df = pd.merge(anime_tv_df, rating_df, on='anime_id', suffixes=['_anime', '_user'])

In [7]:
merged_df = merged_df[['user_id', 'name', 'rating_user']]

In [8]:
merged_limit_df = merged_df[merged_df.user_id <= 10000]

create pivot table for merged data

In [9]:
user_anime_matrix = merged_limit_df.pivot_table(index=['user_id'], columns=['name'], values='rating_user')

In [10]:
norm_user_anime_matrix = user_anime_matrix.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

In [11]:
norm_user_anime_matrix.fillna(0, inplace=True)
norm_user_anime_matrix = norm_user_anime_matrix.T
norm_user_anime_matrix = norm_user_anime_matrix.loc[:, (norm_user_anime_matrix != 0).any(axis=0)]

create sparse matrix for more efficient computation

In [12]:
anime_user_csr = csr_matrix(norm_user_anime_matrix.values)

## Cosine Similarity

In [13]:
item_similarity = cosine_similarity(anime_user_csr)
user_similarity = cosine_similarity(anime_user_csr.T)

In [14]:
item_sim_df = pd.DataFrame(item_similarity, index=norm_user_anime_matrix.index, columns=norm_user_anime_matrix.index)
user_sim_df = pd.DataFrame(user_similarity, index=norm_user_anime_matrix.columns, columns=norm_user_anime_matrix.columns)

In [15]:
item_sim_df.head()

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.0,0.174484,0.252684,-0.005793,0.049693,0.045349,0.007642,0.0,0.03334,-0.031345,...,-0.015856,0.0,-0.022097,-0.011505,0.0,0.0,0.027313,0.025449,-0.046627,-0.04133
.hack//Sign,0.174484,1.0,0.159591,0.020331,0.052251,0.028638,-0.002524,0.0,0.035391,-0.035378,...,-0.00776,0.0,-0.011179,-0.015013,-0.009433,-0.016342,-0.010947,0.025278,-0.008932,-0.03574
.hack//Tasogare no Udewa Densetsu,0.252684,0.159591,1.0,0.046787,0.026241,0.039369,-0.009706,0.0,0.001175,0.004135,...,0.009735,0.0,-0.021021,-0.02429,0.0,0.0,0.007509,0.020202,-0.04699,-0.028177
009-1,-0.005793,0.020331,0.046787,1.0,0.000486,0.011115,0.06593,0.0,0.057343,-0.074427,...,0.007241,0.0,0.003136,0.007257,0.0,0.0,0.0,-0.032581,0.007404,0.015191
07-Ghost,0.049693,0.052251,0.026241,0.000486,1.0,0.083305,0.009228,-0.009119,0.066026,-0.027582,...,0.007254,0.0,-0.032382,-0.033779,6e-05,4.4e-05,0.016763,0.017641,-0.032393,-0.035078


In [16]:
user_sim_df.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,0.063993,0.167627,0.020365,0.125977,0.04523309,0.108629,0.005399,0.077388,0.248132,...,-0.109099,-0.145602,-0.017873,0.043,0.054199,0.027335,-0.015971,0.075051,0.041128,0.068404
5,0.063993,1.0,0.072457,0.016326,0.031464,0.005346814,0.094016,0.106924,0.007067,0.13026,...,0.004349,-0.05357,0.04424,0.01279,0.103616,0.168159,-0.014871,0.07504,0.022199,0.008838
7,0.167627,0.072457,1.0,-0.013548,0.020401,-0.01515745,0.061591,0.122852,0.048526,0.165473,...,0.0,0.0,0.000631,0.02043,0.068743,0.091902,0.019716,0.12744,0.05094,-0.013605
8,0.020365,0.016326,-0.013548,1.0,-0.052705,0.01892828,0.002786,-0.055035,0.0,0.001459,...,-0.228218,0.0,0.019007,0.043206,0.035957,-0.036623,0.0,-0.041096,-0.018445,0.106221
10,0.125977,0.031464,0.020401,-0.052705,1.0,-1.665335e-16,0.008811,0.044415,0.0,0.071511,...,0.0,0.0,0.0,0.208672,-0.015703,0.040062,0.0,0.259914,0.058329,-0.050271


In [17]:
def get_rating_by_user_anime(user_id, anime_name):
    watched_anime = user_anime_matrix.loc[user_id]
    watched_anime = pd.DataFrame({
        'name': watched_anime.index,
        'rating': watched_anime.values
    })
    watched_anime.fillna(0, inplace=True)
    
    return watched_anime[watched_anime['name'] == anime_name]['rating'].values[0]

In [18]:
def get_watched_rating_mean_by_user(user_id):
    watched_anime = user_anime_matrix.loc[user_id]
    watched_anime = pd.DataFrame({
        'name': watched_anime.index,
        'rating': watched_anime.values
    })
    watched_anime.dropna(inplace=True)
    
    mean = watched_anime['rating'].sum() / watched_anime['rating'].size
    
    return mean

In [19]:
def pred_user_based(user_id, anime_name):
    watched_anime = user_anime_matrix.loc[user_id]
    watched_anime = pd.DataFrame({
        'name': watched_anime.index,
        'rating': watched_anime.values
    })
    watched_anime.dropna(inplace=True)
    r_mean = get_watched_rating_mean_by_user(user_id)
    
    sim_users = user_sim_df.sort_values(by=user_id, ascending=False)[user_id][1:11]
    sim_users = pd.DataFrame({
        'user_id': sim_users.index,
        'sim': sim_users.values
    })
    sum_sim_users = sim_users['sim'].sum()
    
    sum_mean_users = 0
    for idx, row in sim_users.iterrows():
        mean_user = get_watched_rating_mean_by_user(row['user_id'])
        sum_mean_users += row['sim'] * (get_rating_by_user_anime(row['user_id'], anime_name) - mean_user)
    
    return r_mean + (sum_mean_users/sum_sim_users)

In [20]:
user_id = user_sim_df.sample(1).index.values[0]

In [21]:
anime_name = item_sim_df.sample(1).index.values[0]

In [22]:
pred_user_based(user_id, anime_name)

-0.10799773308745486

In [23]:
item_sim_df.sort_values(by=anime_name, ascending=False)[anime_name][1:11]

name
Ninku                        0.192487
Hipira-kun                   0.178156
Ninja Senshi Tobikage        0.137912
Wild Arms: Twilight Venom    0.134218
El Cazador de la Bruja       0.132821
Yuugo: Koushounin            0.132298
Majin Bone                   0.131252
Tears to Tiara               0.126493
Genji Monogatari Sennenki    0.123803
Elf wo Karu Mono-tachi       0.120004
Name: Arad Senki: Slap Up Party, dtype: float64