In [257]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [258]:
df = pd.read_csv('/Users/shuai/Downloads/animelist_reduced.csv')
df.head()


Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,47,36793,0,1,5
1,47,41433,0,1,0
2,47,38753,8,1,6
3,47,40128,0,1,0
4,47,14669,0,1,0


In [259]:
grouped_df = df.groupby('user_id').agg(
    anime_watched_amount=pd.NamedAgg(column='anime_id', aggfunc='count')
)

less_than_10 = grouped_df[grouped_df['anime_watched_amount'] <= 10].index
# remove users who have watched less than 10 anime

# 


In [262]:
df_filtered = df[~df['user_id'].isin(less_than_10)]

In [282]:
grouped_df = df.groupby('anime_id').agg(
    anime_watched_amount=pd.NamedAgg(column='user_id', aggfunc='count')
)

In [264]:
df2 = pd.read_csv('/Users/shuai/Downloads/anime_with_synopsis.csv')
df2.rename(columns={'MAL_ID': 'anime_id'}, inplace=True)

In [265]:
df2 = df2[['anime_id', 'Name', 'Genres']]

In [266]:
df2.head()

Unnamed: 0,anime_id,Name,Genres
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space"
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural"


In [269]:
merged_df = pd.merge(df_filtered, df2[['anime_id', 'Genres']], on='anime_id', how='left')


ratings = merged_df[['user_id', 'anime_id', 'Genres', 'rating']]
ratings.head()

Unnamed: 0,user_id,anime_id,Genres,rating
0,47,36793,"Romance, School, Shoujo",0
1,47,41433,"Action, Sci-Fi",0
2,47,38753,"Comedy, Drama, Romance, School, Shounen",8
3,47,40128,"Drama, Historical, Romance, Seinen, Slice of Life",0
4,47,14669,"Supernatural, Drama, Romance, School",0


In [270]:
ratings['Genres'] = ratings['Genres'].fillna('Hentai')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['Genres'] = ratings['Genres'].fillna('Hentai')


In [300]:
df_unique_anime_id = ratings.drop_duplicates(subset='anime_id', keep='first')


In [303]:
df_unique_anime_id = df_unique_anime_id[['anime_id', 'Genres']]

In [273]:
ratings.groupby('user_id').count()

Unnamed: 0_level_0,anime_id,Genres,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47,862,862,862
68,96,96,96
205,983,983,983
239,50,50,50
407,226,226,226
...,...,...,...
353231,254,254,254
353274,131,131,131
353350,164,164,164
353381,389,389,389


In [306]:
vec = CountVectorizer()
genres_vec = vec.fit_transform(df_unique_anime_id['Genres'])

# Display resulting feature vectors
genres_vectorized = pd.DataFrame(genres_vec.todense(),columns=vec.get_feature_names_out(),index=df_unique_anime_id.anime_id)
genres_vectorized.head()

Unnamed: 0_level_0,action,adventure,ai,arts,cars,comedy,dementia,demons,drama,ecchi,...,shounen,slice,space,sports,super,supernatural,thriller,unknown,vampire,yaoi
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36793,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41433,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38753,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
40128,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
14669,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [309]:
# Build similarity marrix of movies based on similarity of genres
csmatrix = cosine_similarity(genres_vec)
csmatrix = pd.DataFrame(csmatrix,columns=df_unique_anime_id.anime_id,index=df_unique_anime_id.anime_id)
csmatrix.head()

anime_id,36793,41433,38753,40128,14669,38328,2251,9513,34572,11633,...,42331,44848,40437,42641,42485,42247,42246,45616,48239,48171
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36793,1.0,0.0,0.516398,0.218218,0.57735,0.0,0.0,0.235702,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41433,0.0,1.0,0.0,0.0,0.0,0.866025,0.258199,0.235702,0.258199,0.235702,...,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
38753,0.516398,0.0,1.0,0.338062,0.67082,0.0,0.2,0.547723,0.4,0.182574,...,0.223607,0.0,0.0,0.0,0.258199,0.0,0.0,0.0,0.0,0.0
40128,0.218218,0.0,0.338062,1.0,0.377964,0.0,0.169031,0.0,0.0,0.154303,...,0.566947,0.0,0.46291,0.0,0.218218,0.0,0.0,0.0,0.0,0.0
14669,0.57735,0.0,0.67082,0.377964,1.0,0.0,0.223607,0.408248,0.0,0.204124,...,0.0,0.0,0.0,0.0,0.288675,0.0,0.0,0.0,0.0,0.0


In [310]:
X = ratings.drop(labels=['rating','Genres'],axis=1)
y = ratings['rating']
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)



In [313]:
def predict_rating(user_item_pair,simtable=csmatrix,X_train=X_train, y_train=y_train):
    anime_to_rate = user_item_pair['anime_id']
    user = user_item_pair['user_id']
    # Filter similarity matrix to only movies already reviewed by user
    anime_watched = X_train.loc[X_train['user_id']==user, 'anime_id'].tolist()
    simtable_filtered = simtable.loc[anime_to_rate,anime_watched]
    # Get the most similar movie already watched to current movie to rate
    most_similar_watched = simtable_filtered.index[np.argmax(simtable_filtered)]
    # Get user's rating for most similar movie
    idx = X_train.loc[(X_train['user_id']==user) & (X_train['anime_id']==most_similar_watched)].index.values[0]
    most_similar_rating = y_train.loc[idx]
    return most_similar_rating

In [314]:
# Get the predicted ratings for each movie in the validation set and calculate the RMSE
ratings_valset = X_val.apply(lambda x: predict_rating(x),axis=1)
val_rmse = np.sqrt(mean_squared_error(y_val,ratings_valset))
print('RMSE of predicted ratings is {:.3f}'.format(val_rmse))

RMSE of predicted ratings is 4.168


In [None]:
def generate_recommendations(user,simtable,ratings):
    # Get top rated movie by user
    user_ratings = df.loc[ratings['user_id']==user]
    user_ratings = user_ratings.sort_values(by='rating',axis=0,ascending=False)
    topratedanime = user_ratings.iloc[0,:]['anime_id']
    topratedanime_title = movies.loc[movies['anime_id']==topratedanime,'title'].values[0]
    # Find most similar movies to the user's top rated movie
    sims = simtable.loc[topratedanime,:]
    mostsimilar = sims.sort_values(ascending=False).index.values
    # Get 10 most similar movies excluding the movie itself
    mostsimilar = mostsimilar[1:11]
    # Get titles of movies from ids
    mostsimmovies_names = []
    for m in mostsimilar:
        mostsimmovies_names.append(movies.loc[movies['movieId']==m,'title'].values[0])
    return topratedanime_title, mostsimmovies_names

Unnamed: 0,user_id,anime_id
1121973,226361,2963
1711670,351393,34577
1192979,241705,5
1583428,319084,437
1422152,285700,37869
...,...,...
152315,32246,34820
963395,199459,32
117952,25380,12967
1484405,298085,2403
