In [0]:
%load_ext autoreload
%autoreload 2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Utils

In [3]:
import keras, math
import numpy as np
import keras.backend as K
import matplotlib.pyplot as plt
from pathlib import Path

def reset_weights(model):
    session = K.get_session()
    for layer in model.layers: 
        if hasattr(layer, 'kernel_initializer'): layer.kernel.initializer.run(session=session)
        if hasattr(layer, 'bias'):
            if layer.bias and hasattr(layer, 'bias_initializer'): layer.bias.initializer.run(session=session)



Using TensorFlow backend.


#Importamos la data y la procesamos

In [0]:
import pandas as pd
ratings = pd.read_csv("/content/drive/My Drive/ML/myanimelist/animelists_reduced.csv")

In [0]:
users= pd.read_csv("/content/drive/My Drive/ML/myanimelist/users_reduced.csv")

In [0]:
animes= pd.read_csv("/content/drive/My Drive/ML/myanimelist/animes_reduced.csv")

In [7]:
# Nos quedamos solo con su username y id
users.drop(columns = ['user_watching','user_completed','user_onhold','user_dropped','user_plantowatch','user_days_spent_watching','gender','location',
                      'birth_date','access_rank','join_date','last_online','stats_mean_score','stats_rewatched','stats_episodes'], inplace = True)
users.head()  

Unnamed: 0,username,user_id
0,karthiga,2255153
1,Damonashu,37326
2,bskai,228342
3,terune_uzumaki,327311
4,Bas_G,5015094


In [8]:
#Nos quedamos con los atributos que nos interesan de cada Data Frame
ratings.drop(columns=['my_watched_episodes', 'my_start_date', 'my_finish_date', 'my_status', 'my_rewatching', 'my_rewatching_ep', 'my_last_updated', 'my_tags'], inplace = True)
ratings.head()

Unnamed: 0,username,anime_id,my_score
0,karthiga,21,9
1,karthiga,59,7
2,karthiga,74,7
3,karthiga,120,7
4,karthiga,178,7


In [9]:
# Nos quedamos con los siguiente atributos en anime
# anime_id	title	type	source	episodes	score	scored_by genre
print(len(animes))
animes = animes.drop(columns = ['title_english', 'title_japanese', 'title_synonyms', 'image_url', 'status', 'airing', 'aired_string', 'aired', 'duration', 'rating', 'rank','popularity', 'members']) 
animes = animes.drop(columns = ['favorites', 'background', 'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio', 'opening_theme', 'ending_theme', 'duration_min', 'aired_from_year'])

#animes.head()


5768


In [10]:
# Filtramos los animes que solo son de TV
anime_tv = animes[animes['type']=='TV']
anime_tv.head()

Unnamed: 0,anime_id,title,type,source,episodes,score,scored_by,genre
0,11013,Inu x Boku SS,TV,Manga,12,7.63,139250,"Comedy, Supernatural, Romance, Shounen"
1,2104,Seto no Hanayome,TV,Manga,26,7.89,91206,"Comedy, Parody, Romance, School, Shounen"
2,5262,Shugo Chara!! Doki,TV,Manga,51,7.55,37129,"Comedy, Magic, School, Shoujo"
3,721,Princess Tutu,TV,Original,38,8.21,36501,"Comedy, Drama, Magic, Romance, Fantasy"
4,12365,Bakuman. 3rd Season,TV,Manga,25,8.67,107767,"Comedy, Drama, Romance, Shounen"


In [11]:
# Solo cambiamos el nombre de la columna my_score por comodidad
ratings.rename(columns= {'my_score' : 'score'}, inplace = True)
ratings.head()

Unnamed: 0,username,anime_id,score
0,karthiga,21,9
1,karthiga,59,7
2,karthiga,74,7
3,karthiga,120,7
4,karthiga,178,7


In [12]:
# Unimos los dos Data Frames
merged = ratings.merge(anime_tv, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])
merged.head()

Unnamed: 0,username,anime_id,score_user,title,type,source,episodes,score,scored_by,genre
0,karthiga,21,9,One Piece,TV,Manga,0,8.54,423868,"Action, Adventure, Comedy, Super Power, Drama,..."
1,Damonashu,21,10,One Piece,TV,Manga,0,8.54,423868,"Action, Adventure, Comedy, Super Power, Drama,..."
2,bskai,21,8,One Piece,TV,Manga,0,8.54,423868,"Action, Adventure, Comedy, Super Power, Drama,..."
3,Slimak,21,10,One Piece,TV,Manga,0,8.54,423868,"Action, Adventure, Comedy, Super Power, Drama,..."
4,MistButterfly,21,0,One Piece,TV,Manga,0,8.54,423868,"Action, Adventure, Comedy, Super Power, Drama,..."


In [13]:
# Nos quedamos con 4*10^4 usuarios para agilizar los calculos
merged = merged[['username', 'title', 'anime_id','score_user']]
merged = merged.merge(users, left_on = 'username', right_on = 'username')
merged.head()

Unnamed: 0,username,title,anime_id,score_user,user_id
0,karthiga,One Piece,21,9,2255153
1,karthiga,Chobits,59,7,2255153
2,karthiga,Gakuen Alice,74,7,2255153
3,karthiga,Fruits Basket,120,7,2255153
4,karthiga,Ultra Maniac,178,7,2255153


In [15]:

# Experimentalmente con 10^4 datos no da RTE
data = merged.copy()
data.head()

Unnamed: 0,username,title,anime_id,score_user,user_id
0,karthiga,One Piece,21,9,2255153
1,karthiga,Chobits,59,7,2255153
2,karthiga,Gakuen Alice,74,7,2255153
3,karthiga,Fruits Basket,120,7,2255153
4,karthiga,Ultra Maniac,178,7,2255153


In [16]:
data.shape

(111837, 5)

In [17]:
data = data[data.score_user != 0] #Los 0 indican que un anime no fue calificado, por lo que procedemos a eliminarlos
data.shape

(69329, 5)

In [0]:
data.user_id = data.user_id.astype('category').cat.codes.values
data.anime_id = data.anime_id.astype('category').cat.codes.values

In [0]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)

In [20]:
print("cantidad de usuarios en data: ",len(data.user_id.unique()))
print("cantidad de usuarios en train: ",len(train.user_id.unique()))
print("cantidad de usuarios en test: ",len(test.user_id.unique()))

print("cantidad de animes en data: ",len(data.anime_id.unique()))
print("cantidad de animes en train: ",len(train.anime_id.unique()))
print("cantidad de animes en test: ",len(test.anime_id.unique()))

cantidad de usuarios en data:  495
cantidad de usuarios en train:  491
cantidad de usuarios en test:  479
cantidad de animes en data:  2462
cantidad de animes en train:  2413
cantidad de animes en test:  2016


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras import Model
from keras.layers import Input, Flatten, Embedding, Dot, Dropout, Concatenate, Dense, BatchNormalization
from IPython.display import SVG
from keras.optimizers import Adam
from keras.utils.vis_utils import model_to_dot
n_users, n_animes = len(data.user_id.unique()),len(data.anime_id.unique())
n_latent_factors = 3
print(n_animes)

2462


#Neural Network

In [0]:
n_latent_factors_user = 75
n_latent_factors_anime = 75

anime_input = Input(shape=[1],name='Item')
anime_embedding = Embedding(n_animes + 1, n_latent_factors_anime, name='Anime-Embedding')(anime_input)
anime_vec = Flatten(name='FlattenAnimes')(anime_embedding)
anime_vec = Dropout(0.2)(anime_vec)

user_input = Input(shape=[1],name='User')
user_vec = Flatten(name='FlattenUsers')(Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding')(user_input))
user_vec = Dropout(0.2)(user_vec)

concat = Concatenate()([anime_vec, user_vec])
concat_dropout = Dropout(0.2)(concat)
dense = Dense(200,name='FullyConnected')(concat_dropout)
#normal1= BatchNormalization()(dense)
dropout_1 = Dropout(0.2,name='Dropout1')(dense)
dense_2 = Dense(100,name='FullyConnected-1')(dropout_1)
#normal2= BatchNormalization()(dense_2)
dropout_2 = Dropout(0.2,name='Dropout2')(dense_2)
dense_3 = Dense(50,name='FullyConnected-2')(dropout_2)
#normal3= BatchNormalization()(dense_3)
dropout_3 = Dropout(0.2,name='Dropout3')(dense_3)
dense_4 = Dense(20,name='FullyConnected-3',activation='relu')(dropout_3)
#normal4= BatchNormalization()(dense_4)
#dropout_4 = Dropout(0.5,name='Dropout4')(dense_4)

result = Dense(1, activation='linear',name='Activation')(dense_4)
adam = Adam(lr=0.001)
model = Model([anime_input, user_input], result)


In [30]:
#DESCOMENTAR ESTA LINEA SI SE QUIERE USAR PESOS GUARDADOS ANTERIORMENTE
#model.load_weights("weights.best.hdf5")
model.compile(optimizer=adam,loss= 'mean_absolute_error')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Anime-Embedding (Embedding)     (None, 1, 75)        184725      Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 75)        37200       User[0][0]                       
__________________________________________________________________________________________________
FlattenAni

In [0]:
from keras.callbacks import ModelCheckpoint
filepath="weights2.best.hdf5"
checkpoint= ModelCheckpoint(filepath,monitor="val_loss",verbose=1,save_best_only=True, mode="min")



In [0]:
reset_weights(model)
history = model.fit([train.anime_id, train.user_id], train.score_user, epochs=20,callbacks=[checkpoint], verbose=1, validation_split=0.2)


Train on 44370 samples, validate on 11093 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.08638, saving model to weights2.best.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 1.08638 to 1.07616, saving model to weights2.best.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 1.07616 to 1.03165, saving model to weights2.best.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 1.03165 to 1.02411, saving model to weights2.best.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 1.02411 to 1.02314, saving model to weights2.best.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 1.02314 to 1.01557, saving model to weights2.best.hdf5
Epoch 7/20

Epoch 00007: val_loss improved from 1.01557 to 1.00841, saving model to weights2.best.hdf5
Epoch 8/20

Epoch 00008: val_loss did not improve from 1.00841
Epoch 9/20

Epoch 00009: val_loss did not improve from 1.00841
Epoch 10/20

In [27]:

from sklearn.metrics import mean_absolute_error

y_true = test.score_user

print(mean_absolute_error(y_true, model.predict([test.anime_id, test.user_id])))

0.9894957460573737


In [0]:
def recommendByUser(user_id):
  dataUser= test[test.user_id==user_id]
  animesId= dataUser.anime_id.values
  animesName= dataUser.title.values
  score_predicted= model.predict([dataUser.user_id, animesId])
  scores=[]
  for i in range(len(score_predicted)):
    scores.append((animesName[i],np.round(score_predicted[i],2)[0]))
  scores=sorted(scores,key=itemgetter(1), reverse=True)

  return scores
  

In [0]:
from operator import itemgetter
userId=5
scores= recommendByUser(userId)
for i in range(len(scores)):
  print("animeId y predScore: ", scores[i])
  animes=test[test.user_id==userId]
  print("score real:",np.round(animes[animes.title==scores[i][0]].score_user.values[0],2))
  print("")

animeId y predScore:  ('Mushishi Zoku Shou', 9.08)
score real: 8

animeId y predScore:  ('Shigatsu wa Kimi no Uso', 8.82)
score real: 9

animeId y predScore:  ('Usagi Drop', 8.77)
score real: 10

animeId y predScore:  ('Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou', 8.76)
score real: 9

animeId y predScore:  ('Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.', 8.76)
score real: 9

animeId y predScore:  ('Re:Zero kara Hajimeru Isekai Seikatsu', 8.72)
score real: 9

animeId y predScore:  ('Kuroshitsuji', 8.71)
score real: 9

animeId y predScore:  ('Ore Monogatari!!', 8.69)
score real: 9

animeId y predScore:  ('Baccano!', 8.61)
score real: 8

animeId y predScore:  ('Gosick', 8.61)
score real: 9

animeId y predScore:  ('Toradora!', 8.59)
score real: 9

animeId y predScore:  ('Kuroko no Basket 3rd Season', 8.59)
score real: 9

animeId y predScore:  ('One Punch Man', 8.58)
score real: 9

animeId y predScore:  ('D.Gray-man', 8.57)
score real: 9

animeId y predScore:  ('K