In [11]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix

## Load the data from the CSV files

In [12]:
# Locations of the two input CSV files, relative to the notebook's working directory.
ANIME_CSV_PATH, RATING_CSV_PATH = (
    "./datasets/anime.csv",
    "./datasets/rating.csv",
)

In [13]:
# Read the anime catalogue and the user ratings into DataFrames, in that order.
anime_df, rating_df = (
    pd.read_csv(csv_path) for csv_path in (ANIME_CSV_PATH, RATING_CSV_PATH)
)

## Preprocess the data

In [14]:
# Replace the -1 sentinel in the `rating` column with NaN so it is skipped by the
# NaN-aware reductions used later (presumably -1 marks "no rating given" -- confirm
# against the dataset description).
#
# The result is assigned back to the column instead of calling
# `rating_df.rating.replace(..., inplace=True)`: an in-place call on a Series pulled
# out of the frame is chained assignment, which leaves `rating_df` untouched under
# pandas Copy-on-Write and raises a FutureWarning in pandas 2.2.
rating_df["rating"] = rating_df["rating"].replace({-1: np.nan})

In [15]:
# Keep only the catalogue rows whose `type` column is exactly 'TV'.
anime_tv_df = anime_df.loc[anime_df['type'].eq('TV')]

In [16]:
# Join TV anime with the user ratings on `anime_id` (default inner join).
# Column names present in both frames get the '_anime' / '_user' suffix.
merged_df = pd.merge(
    left=anime_tv_df,
    right=rating_df,
    on='anime_id',
    suffixes=['_anime', '_user'],
)

In [17]:
# Narrow the merged frame to the three columns the pivot table needs:
# who rated, which anime, and the rating that user gave.
merged_df = merged_df.loc[:, ['user_id', 'name', 'rating_user']]

In [18]:
# Restrict the data to users with user_id <= 20000
# (presumably to keep the pivot table a manageable size -- confirm).
merged_limit_df = merged_df.loc[merged_df['user_id'].le(20000)]

Create a pivot table from the merged data

In [19]:
# Build the user x anime rating matrix: one row per user_id, one column per anime
# name. pivot_table's default aggregation (mean) collapses any duplicate
# (user, anime) pairs; combinations with no rating are NaN.
user_anime_matrix = pd.pivot_table(
    merged_limit_df,
    values='rating_user',
    index=['user_id'],
    columns=['name'],
)

In [None]:
# Normalize each user's ratings: subtract that user's mean rating and divide by
# the user's rating range (max - min).
#
# Written with vectorized row-wise reductions instead of `apply(..., axis=1)`,
# which calls a Python lambda once per user. The reductions skip NaN (unrated
# anime), matching np.mean/np.max/np.min on a Series. A user whose ratings are
# all identical has range 0, so their row becomes 0/0 = NaN, as before.
norm_user_anime_matrix = user_anime_matrix.sub(
    user_anime_matrix.mean(axis=1), axis=0
).div(
    user_anime_matrix.max(axis=1) - user_anime_matrix.min(axis=1), axis=0
)

In [None]:
# 1. Treat missing normalized ratings as 0.
# 2. Transpose so rows are anime names and columns are user ids.
# 3. Drop every column (user) whose entries are all 0.
# NOTE: the result is bound to the same name it was read from, so re-running this
# cell on its own transposes the matrix again; re-run from the normalization cell.
norm_user_anime_matrix = (
    norm_user_anime_matrix
    .fillna(0)
    .T
    .loc[:, lambda frame: frame.ne(0).any(axis=0)]
)

Create a sparse matrix for more efficient computation

In [None]:
# Convert the normalized matrix to SciPy's compressed sparse row format,
# keeping the same row and column order as the DataFrame.
anime_user_csr = csr_matrix(norm_user_anime_matrix.to_numpy())

## Cosine Similarity

In [None]:
# cosine_similarity compares the rows of its input pairwise.
# The matrix as built gives one similarity table; its transpose gives the other.
item_similarity, user_similarity = (
    cosine_similarity(matrix) for matrix in (anime_user_csr, anime_user_csr.T)
)

In [None]:
# Wrap the raw similarity arrays in labelled DataFrames.
# Item similarities are labelled on both axes by the matrix's row labels,
# user similarities by its column labels.
item_sim_df = pd.DataFrame(
    data=item_similarity,
    index=norm_user_anime_matrix.index,
    columns=norm_user_anime_matrix.index,
)
user_sim_df = pd.DataFrame(
    data=user_similarity,
    index=norm_user_anime_matrix.columns,
    columns=norm_user_anime_matrix.columns,
)

In [None]:
# Display the item-item cosine similarity table as the cell output.
item_sim_df

In [None]:
# Display the user-user cosine similarity table as the cell output.
user_sim_df