In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
anime = pd.read_csv('../input/anime-recommendations-database/anime.csv')
anime['genre'] = anime['genre'].str.lower()

In [3]:
display(anime.head())

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"drama, romance, school, supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"action, adventure, drama, fantasy, magic, mili...",TV,64,9.26,793665
2,28977,Gintama°,"action, comedy, historical, parody, samurai, s...",TV,51,9.25,114262
3,9253,Steins;Gate,"sci-fi, thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"action, comedy, historical, parody, samurai, s...",TV,51,9.16,151266


## Data Exploration

### Best rated anime

In [4]:
display(anime.sort_values('rating', ascending=False)[['name','rating','genre','members']].head())
print(len(anime))

Unnamed: 0,name,rating,genre,members
10464,Taka no Tsume 8: Yoshida-kun no X-Files,10.0,"comedy, parody",13
10400,Spoon-hime no Swing Kitchen,9.6,"adventure, kids",47
9595,Mogura no Motoro,9.5,slice of life,62
0,Kimi no Na wa.,9.37,"drama, romance, school, supernatural",200630
9078,Kahei no Umi,9.33,historical,44


12294


### Les animés avec le plus de membres

In [5]:
a = anime.sort_values('members',ascending = False).iloc[:600]
fig = px.bar(a , x='name', y='members')
fig.show()

### Genres d'animés et leur répartition

In [6]:
import re
genre = anime['genre'].tolist()
genre = ([str(genre[i]).split() for i in range(len(genre))])
genre = [re.sub(',','',item) for sublist in genre for item in sublist]
genre_list = list(set(genre))
print(genre_list)

['action', 'yaoi', 'parody', 'josei', 'kids', 'demons', 'thriller', 'mecha', 'military', 'sci-fi', 'adventure', 'romance', 'super', 'power', 'life', 'mystery', 'martial', 'drama', 'school', 'space', 'shounen', 'magic', 'game', 'comedy', 'psychological', 'hentai', 'cars', 'music', 'samurai', 'vampire', 'arts', 'ecchi', 'of', 'sports', 'historical', 'ai', 'harem', 'nan', 'horror', 'fantasy', 'dementia', 'seinen', 'shoujo', 'yuri', 'supernatural', 'slice', 'police']


In [7]:
def add_count(line, dico):
    for genre in line:
        if genre in dico.keys():
            dico[genre] += 1
        else:
            dico[genre] = 0
        
def genre_over_anime_distrib(mylist):
    # Hashmap (key,values) -> (genre, nb of anime belonging to this genre)
    count_genres = {}
    # Counting    
    for genres in anime['genre']:
        if isinstance(genres,str):
            add_count(genres.split(', '), count_genres)
    return(count_genres)

count_genres = genre_over_anime_distrib(genre_list)



In [8]:
df_count_genre = pd.DataFrame.from_dict(count_genres, orient='index', columns=['count'])

In [9]:
fig = px.bar(df_count_genre , x=df_count_genre.index, y='count')
fig.show()

## User table

Many users don't review the shows they've watched. We could drop these records. But looking at the following distribution of ratings, we notice that most of them are positive. This means that even if the show is not rated, it is most probably a positive signal. i.e the user started to watch this show because on the paper at least, after investigating the plot of the show, it matches his tastes. Therefore, we should consider imputing unrated shows, with the median of the user



In [70]:
user = pd.read_csv('../input/anime-recommendations-database/rating.csv')
display(user[290:302])

Unnamed: 0,user_id,anime_id,rating
290,4,18153,-1
291,4,18229,-1
292,4,18507,-1
293,4,18679,-1
294,4,19769,-1
295,4,20053,-1
296,4,20431,-1
297,4,20689,-1
298,4,20931,-1
299,4,20973,-1


In [55]:
import plotly.figure_factory as ff
dfg = user.groupby('rating').count().reset_index()
fig = px.bar(dfg, x = 'rating', y='user_id')
fig.show()

In [14]:
display(user['rating'].replace({-1: np.nan}).dropna().describe())

count    6.337241e+06
mean     7.808497e+00
std      1.572496e+00
min      1.000000e+00
25%      7.000000e+00
50%      8.000000e+00
75%      9.000000e+00
max      1.000000e+01
Name: rating, dtype: float64

In [72]:
user_median = user.copy() #deep copy
user_median['rating']= user_median['rating'].replace({-1: np.nan}).dropna() #get median of ratings per user
user_median = user_median.groupby('user_id').median()['rating']
user = pd.merge(user, user_median, how = 'inner', on='user_id')
user.rename(columns={'rating_x': 'rating', 'rating_y' : 'median'}, inplace=True)
user['median'] = user['median'].replace({np.nan:-1})

user_id
1        10.0
2        10.0
3         7.0
4         NaN
5         5.0
         ... 
73512     8.5
73513     8.0
73514    10.0
73515     9.0
73516     9.0
Name: rating, Length: 73515, dtype: float64

Unnamed: 0,user_id,anime_id,rating,median
0,1,20,-1,10.0
1,1,24,-1,10.0
2,1,79,-1,10.0
3,1,226,-1,10.0
4,1,241,-1,10.0
...,...,...,...,...
7813732,73515,16512,7,9.0
7813733,73515,17187,9,9.0
7813734,73515,22145,10,9.0
7813735,73516,790,9,9.0


In [79]:
def medianise(med,rat):
    if rat == -1 and med ==-1:
        return(8)
    elif rat ==-1 and med !=1:
        return(med)
    else:
        return(rat)
    
        

In [80]:
user['rating'] = user.apply(lambda row : medianise(row['median'], row['rating']) , axis=1)


Unnamed: 0,user_id,anime_id,rating,median
0,1,20,10.0,10.0
1,1,24,10.0,10.0
2,1,79,10.0,10.0
3,1,226,10.0,10.0
4,1,241,10.0,10.0
...,...,...,...,...
7813732,73515,16512,7.0,9.0
7813733,73515,17187,9.0,9.0
7813734,73515,22145,10.0,9.0
7813735,73516,790,9.0,9.0


In [84]:
user.drop(['median'], axis=1, inplace=True)
dfg = user.groupby('rating').count().reset_index()
fig = px.bar(dfg, x = 'rating', y='user_id')
fig.show()

## Deep Learning approach

In [None]:
df_raw = pd.merge(user,anime, how='inner', on='anime_id')
df_raw.rename(columns={'rating_x': 'rating', 'rating_y' : 'rating_m'}, inplace=True)
display(df_raw.head())

In [None]:
n_users = df_raw['user_id'].nunique()
n_anime = df_raw['anime_id'].nunique()
print("There are {} users and {} anime in the dataset".format(n_users,n_anime))


In [None]:
df = df_raw.drop(['name','genre','type','episodes','members'], axis=1)
display(df.head())
print(len(df))

In [None]:
anime_popularity = df.drop(['user_id','rating_m'], axis=1).groupby("anime_id").count().rename(columns={'rating' : 'view-count'})
display(anime_popularity.head())
print(len(anime_popularity))
anime_popularity = anime_popularity.sort_values('view-count',ascending=False)

In [None]:
fig = px.line(anime_popularity.values)
fig.show()

In [None]:
non_popular_anime_ids = anime_popularity[anime_popularity['view-count']<100]
non_popular_anime_ids = non_popular_anime_ids.index.tolist()
mask = df['anime_id'].isin(non_popular_anime_ids)
df = df.loc[~mask]

In [None]:
df = df[df.rating !=-1]

In [None]:
n_users = df['user_id'].nunique()
n_anime = df['anime_id'].nunique()
print("There are {} users and {} anime in the dataset".format(n_users,n_anime))

On a plus des ids unique incrémental, donc ca va être compliqué pour le embedding, on va les refaire, ça ira plus vite. 

In [None]:
# Create a list of all unique users and anime :
unique_users_ids_list = list(np.sort(df.user_id.unique()))
unique_anime_ids_list = list(np.sort(df.anime_id.unique()))

def ids_encode(list_ids):
    dic = {}
    count = 0
    for id in list_ids:   
        dic[id] = count
        count+=1
    return(dic)

ids_encode_users = ids_encode(unique_users_ids_list)
ids_encode_anime = ids_encode(unique_anime_ids_list)

In [None]:
df['anime_id'] = df['anime_id'].map(ids_encode_anime)
df['user_id'] = df['user_id'].map(ids_encode_users)

In [None]:
print(df.head())

In [None]:
display(df.head())

In [None]:
import tensorflow as tf

In [None]:
embedding_dim = 30
n_users = 73495
n_anime = 4591
epoch = 20
learning_rate = 0.00001
batch_size = 64

In [None]:
def format_data():
    user_id = df['user_id'].to_numpy()
    anime_id = df['anime_id'].to_numpy()
    labels = df['rating'].to_numpy()
    
    #dataset_x = tf.data.Dataset.from_tensor_slices((user_id,anime_id))
    #dataset_y = tf.data.Dataset.from_tensor_slices(labels)
    #dataset = tf.data.Dataset.zip((dataset_x,dataset_y)).shuffle(buffer_size=1400, seed=101).batch(batch_size)
    #dataset = dataset.prefetch(2)
    
    return(user_id, anime_id, labels)

In [None]:
x_user, x_anime, y = format_data()

In [None]:
def build_model(embedding_dim, n_users, n_anime, learning_rate):
    input_user = tf.keras.layers.Input(shape=(1,), dtype='int32', name='input_user')
    user_embedding = tf.keras.layers.Embedding(n_users + 1, embedding_dim)(input_user)
    user_flatten = tf.keras.layers.Flatten()(user_embedding)
    
    input_anime = tf.keras.layers.Input(shape=(1,), dtype='int32', name='input_anime')
    anime_embedding = tf.keras.layers.Embedding(n_anime + 1, embedding_dim)(input_anime)
    anime_flatten = tf.keras.layers.Flatten()(anime_embedding)
    
    concat = tf.keras.layers.Concatenate(axis=-1)([user_flatten,anime_flatten])
    
    dense_1 = tf.keras.layers.Dense(units=64, activation='relu')(concat)
    dropout_1 = tf.keras.layers.Dropout(0.2)(dense_1)
    batchnorm_1= tf.keras.layers.BatchNormalization()(dropout_1)
    
    dense_2 = tf.keras.layers.Dense(units=32, activation='relu')(batchnorm_1)
    dropout_2 = tf.keras.layers.Dropout(0.2)(dense_2)
    batchnorm_2= tf.keras.layers.BatchNormalization()(dropout_2)
    
    dense_3 = tf.keras.layers.Dense(units=16, activation='relu')(batchnorm_2)
    dense_4 = tf.keras.layers.Dense(units=4, activation='relu')(dense_3)
    
    output_dense = tf.keras.layers.Dense(units=1, activation='sigmoid')(dense_4)
    output = tf.keras.layers.Lambda(lambda x: x * 10.0)(output_dense)
    
    met = tf.keras.metrics.RootMeanSquaredError(name='root_mean_squared_error', dtype=None)
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model = tf.keras.Model(inputs=[input_user,input_anime], outputs=output)
    
    
    model.compile(loss='mean_squared_logarithmic_error', optimizer=opt, metrics=[met])
    model.summary()
    return(model)
    
    
                                                   

In [None]:
model = build_model(embedding_dim, n_users, n_anime, learning_rate)


In [None]:

model.fit(x=[x_user, x_anime], y = y ,batch_size = 64, epochs = 5)