In [347]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [348]:
df = pd.read_csv('/Users/shuai/Desktop/animelist_reduced.csv')
df.head()


Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,15,223,10,2,153
1,15,225,5,2,64
2,15,502,10,2,1
3,15,72,7,2,12
4,15,594,9,2,1


In [349]:
grouped_df = df.groupby('user_id').agg(
    anime_watched_amount=pd.NamedAgg(column='anime_id', aggfunc='count')
)

# remove users who have watched less than 10 anime
less_than_10 = grouped_df[grouped_df['anime_watched_amount'] <= 10].index


In [350]:
df_filtered = df[~df['user_id'].isin(less_than_10)]

In [353]:
grouped_df = df.groupby('anime_id').agg(
    anime_watched_amount=pd.NamedAgg(column='user_id', aggfunc='count')
)

In [354]:
df2 = pd.read_csv('/Users/shuai/Downloads/anime_with_synopsis.csv')
df2.rename(columns={'MAL_ID': 'anime_id'}, inplace=True)

In [355]:
df2 = df2[['anime_id', 'Name', 'Genres']]

In [356]:
df2.head()

Unnamed: 0,anime_id,Name,Genres
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space"
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural"


In [357]:
merged_df = pd.merge(df_filtered, df2[['anime_id', 'Genres']], on='anime_id', how='left')


ratings = merged_df[['user_id', 'anime_id', 'Genres', 'rating']]
ratings.head()

Unnamed: 0,user_id,anime_id,Genres,rating
0,34,13767,"Comedy, Drama, Romance, Shounen Ai",9
1,34,1735,"Action, Adventure, Comedy, Super Power, Martia...",10
2,34,9624,"Ecchi, Parody, Romance, Seinen",6
3,34,53,"Harem, Slice of Life, Comedy, Drama, Romance",7
4,34,5112,"Comedy, Ecchi, Fantasy, Romance",7


In [358]:
ratings['Genres'] = ratings['Genres'].fillna('Hentai')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['Genres'] = ratings['Genres'].fillna('Hentai')


In [359]:
df_unique_anime_id = ratings.drop_duplicates(subset='anime_id', keep='first')


In [360]:
df_unique_anime_id = df_unique_anime_id[['anime_id', 'Genres']]

In [362]:
vec =CountVectorizer( )
genres_vec = vec.fit_transform(df_unique_anime_id['Genres'])

# Display resulting feature vectors
genres_vectorized = pd.DataFrame(genres_vec.todense(),columns=vec.get_feature_names_out(),index=df_unique_anime_id.anime_id)
genres_vectorized.head()

Unnamed: 0_level_0,action,adventure,ai,arts,cars,comedy,dementia,demons,drama,ecchi,...,shounen,slice,space,sports,super,supernatural,thriller,unknown,vampire,yaoi
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13767,0,0,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1735,1,1,0,1,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
9624,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
53,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
5112,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [363]:
# Build similarity marrix of movies based on similarity of genres
csmatrix = cosine_similarity(genres_vec)
csmatrix = pd.DataFrame(csmatrix,columns=df_unique_anime_id.anime_id,index=df_unique_anime_id.anime_id)
csmatrix.head()

anime_id,13767,1735,9624,53,5112,6287,8676,9925,15085,6547,...,41401,20083,37058,31317,3778,34502,38562,22053,2280,27491
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13767,1.0,0.316228,0.223607,0.507093,0.447214,0.316228,0.365148,0.0,0.2,0.4,...,0.316228,0.258199,0.0,0.447214,0.223607,0.223607,0.223607,0.223607,0.316228,0.258199
1735,0.316228,1.0,0.0,0.133631,0.176777,0.25,0.144338,0.0,0.0,0.316228,...,0.5,0.204124,0.353553,0.353553,0.176777,0.353553,0.176777,0.176777,0.25,0.204124
9624,0.223607,0.0,1.0,0.188982,0.5,0.353553,0.204124,0.0,0.223607,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0
53,0.507093,0.133631,0.188982,1.0,0.377964,0.267261,0.771517,0.0,0.338062,0.338062,...,0.267261,0.218218,0.0,0.377964,0.188982,0.188982,0.755929,0.188982,0.267261,0.218218
5112,0.447214,0.176777,0.5,0.377964,1.0,0.707107,0.408248,0.0,0.447214,0.223607,...,0.353553,0.288675,0.0,0.5,0.25,0.25,0.25,0.25,0.353553,0.57735


In [364]:
X = ratings.drop(labels=['rating','Genres'],axis=1)
y = ratings['rating']
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)



In [365]:
def predict_rating(user_item_pair,simtable=csmatrix,X_train=X_train, y_train=y_train):
    anime_to_rate = user_item_pair['anime_id']
    user = user_item_pair['user_id']
    # Filter similarity matrix to only movies already reviewed by user
    anime_watched = X_train.loc[X_train['user_id']==user, 'anime_id'].tolist()
    simtable_filtered = simtable.loc[anime_to_rate,anime_watched]
    # Get the most similar movie already watched to current movie to rate
    most_similar_watched = simtable_filtered.index[np.argmax(simtable_filtered)]
    # Get user's rating for most similar movie
    idx = X_train.loc[(X_train['user_id']==user) & (X_train['anime_id']==most_similar_watched)].index.values[0]
    most_similar_rating = y_train.loc[idx]
    return most_similar_rating

In [366]:
# Get the predicted ratings for each movie in the validation set and calculate the RMSE
ratings_valset = X_val.apply(lambda x: predict_rating(x),axis=1)
val_rmse = np.sqrt(mean_squared_error(y_val,ratings_valset))
print('RMSE of predicted ratings is {:.3f}'.format(val_rmse))

RMSE of predicted ratings is 1.797


In [388]:
df2.head()

Unnamed: 0,anime_id,Name,Genres
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space"
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural"


In [396]:
df2.to_csv('/Users/shuai/Desktop/anime_mapping.csv', index=False)

In [394]:
def generate_recommendations(user,simtable,ratings):
    # Get top rated movie by user
    user_ratings = ratings.loc[ratings['user_id']==user]
    user_ratings = user_ratings.sort_values(by='rating',axis=0,ascending=False)
    topratedanime = user_ratings.iloc[0,:]['anime_id']
    topratedanime_title = df2.loc[df2['anime_id']==topratedanime,'Name'].values[0]
    # Find most similar movies to the user's top rated movie
    sims = simtable.loc[topratedanime,:]
    mostsimilar = sims.sort_values(ascending=False).index.values
    # Get 10 most similar movies excluding the movie itself
    mostsimilar = mostsimilar[1:11]
    # Get titles of movies from ids
    mostsimanime_names = []
    for anime in mostsimilar:
        mostsimanime_names.append(df2.loc[df2['anime_id']==anime,'Name'].values[0])
    return topratedanime_title, mostsimanime_names

In [390]:
ratings.head()

Unnamed: 0,user_id,anime_id,Genres,rating
0,34,13767,"Comedy, Drama, Romance, Shounen Ai",9
1,34,1735,"Action, Adventure, Comedy, Super Power, Martia...",10
2,34,9624,"Ecchi, Parody, Romance, Seinen",6
3,34,53,"Harem, Slice of Life, Comedy, Drama, Romance",7
4,34,5112,"Comedy, Ecchi, Fantasy, Romance",7


In [395]:
user = 34
topratedmovie, recs = generate_recommendations(user,simtable=csmatrix,ratings=ratings)
print("User's highest rated movie was {}".format(topratedmovie))
for i,rec in enumerate(recs):
  print('Recommendation {}: {}'.format(i,rec))

User's highest rated movie was Ranma ½
Recommendation 0: Ninja Hattori-kun
Recommendation 1: Ranma ½: Tendou-ke no Oyobidenai Yatsura!
Recommendation 2: Ranma ½: 1994 Music Calendar
Recommendation 3: Ranma ½: Battle ga Ippai 29-nin no Korinai Yatsura
Recommendation 4: Namiuchigiwa no Muromi-san
Recommendation 5: Ranma ½: Totteoki Talk - Best of Memories
Recommendation 6: Namiuchigiwa no Muromi-san: Pangea Chou Tairiku no Muromi-san
Recommendation 7: Dragon Ball Z: Zenbu Misemasu Toshi Wasure Dragon Ball Z!
Recommendation 8: Ranma ½: DoCo Music Video
Recommendation 9: Yawara!: Sore Yuke Koshinuke Kids!!
