# Sistemas de recomendação
### Resumo

Sistemas de recomendação estão como parte do desáfio, entretanto, este será meu primeiro contato com este tipo aplicação.

Este notebook será de estudo, desta forma, vou compartilhar tudo material que eu utilizei para aprender o que eu fizer por aqui

## Tratamentos Iniciais

In [1]:
#Imports iniciais

#Nesse caso, vou utilizar a biblioteca lightfm para desenvolver o sistema.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightfm import LightFM
from scipy.sparse import csr_matrix

  "LightFM was compiled without OpenMP support. "


In [2]:
#Vou usar um dataset oriundo do site my anime list para criar um sistema de recomendação de animes.
df_anime = pd.read_csv('anime.csv')
df_ratings =pd.read_csv('rating.csv')

In [3]:
#alguns caras vieram com rating -1, retirei eles da base
df_ratings = df_ratings[df_ratings['rating'] >0]
df_ratings = df_ratings.reset_index(drop=True)
df_ratings = df_ratings.drop_duplicates()

In [4]:
#Alguns users estão com dois ratings registrados para o mesmo anime, vamos precisar tratar esses casos 
df_ratings.groupby(['user_id','anime_id']).count().reset_index().sort_values('rating',ascending=False)

Unnamed: 0,user_id,anime_id,rating
3637145,42653,2001,2
3637143,42653,1575,2
3637174,42653,11757,2
3637197,42653,22319,2
3637200,42653,23283,2
...,...,...,...
2112409,24542,10049,1
2112408,24542,9969,1
2112407,24542,9745,1
2112406,24542,9690,1


In [5]:
#Um exemplo do que havia dito. Vou considerar o rating de indice maior, partindo do pressuposto que as linhas com maior indice são de avaliações mais recentes, o que não sei se é verdade
#mas irrelevante nesse momento

df_ratings[df_ratings['user_id'] == 3].sort_values('rating')

Unnamed: 0,user_id,anime_id,rating
53,3,12671,3
67,3,19315,3
82,3,23333,4
59,3,16512,5
69,3,20021,5
...,...,...,...
84,3,24415,10
22,3,1535,10
44,3,9989,10
32,3,5114,10


In [6]:
#fazer uma cópia da base por precaução
df_ratings = df_ratings.drop_duplicates(subset=['anime_id','user_id'],keep='last')
df_test = df_ratings

In [7]:
df_test.shape

(6337234, 3)

In [8]:
#vamos diminuir um pouco mais a base e colocar animes que tiveram pelo menos 20 avaliações
#e usuários ativos, que serão usuários que já viram pelo menos 3 animes

anime_votes = df_test.groupby('anime_id')['rating'].agg('count')
active_user = df_test.groupby('user_id')['rating'].agg('count')
user_index = active_user[active_user > 3]
anime_index = anime_votes[anime_votes > 10]


df_test = df_test[df_test['user_id'].isin(user_index)]
df_test = df_test[df_test['anime_id'].isin(anime_index)]
df_test.shape

(16135, 3)

## Gerando e Fitando o Modelo

In [9]:
#agora preciso montar uma matriz de user id, movieid e ratings respectivamente.

df_final = df_test.pivot(index='user_id',columns='anime_id',values='rating')
df_final = df_final.fillna(0)
df_final

anime_id,15,16,17,18,19,20,22,24,25,26,...,9930,10180,10516,10592,11933,11979,12131,12403,14211,22071
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,6.0,0.0,6.0,6.0,0.0,6.0,5.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,8.0,0.0,0.0,0.0,7.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2621,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2632,0.0,0.0,0.0,7.0,0.0,7.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
2689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
n_users, n_items = df_final.shape
csr_final = csr_matrix(df_final.values)

In [11]:
#train and test split
from lightfm.cross_validation import random_train_test_split
cross_val = random_train_test_split(csr_final,.2)

In [12]:
# Instantiate and train the model
model = LightFM(loss='warp')
model.fit(cross_val[0], epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x15d80043148>

### Métricas de Validação

In [13]:
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

print("train precision: %.2f" % precision_at_k(model, cross_val[0], k=5).mean())
print("train auc: %.2f" % auc_score(model, cross_val[0]).mean())
print("\n-------------------------\n")
print("test precision: %.2f" % precision_at_k(model, cross_val[1], k=5).mean())
print("test auc: %.2f" % auc_score(model,cross_val[1]).mean())

train precision: 0.41
train auc: 0.97

-------------------------

test precision: 0.09
test auc: 0.90


### Teste de intuição

In [68]:
user_test_id = 2

list_final = pd.DataFrame({'anime_id':df_final.columns.values,'y_hat':model.predict(user_test_id,np.arange(n_items))})
list_final.sort_values('y_hat',ascending=False,inplace=True)

In [69]:
list_final.join(df_anime.set_index('anime_id')['name'], on='anime_id').head(10)

Unnamed: 0,anime_id,y_hat,name
998,4224,2.411305,Toradora!
179,226,2.359119,Elfen Lied
5,20,2.183173,Naruto
219,269,2.093498,Bleach
584,849,1.889514,Suzumiya Haruhi no Yuuutsu
288,355,1.868657,Shakugan no Shana
289,356,1.797885,Fate/stay night
88,121,1.784742,Fullmetal Alchemist
220,270,1.712625,Hellsing
958,2966,1.678581,Ookami to Koushinryou


In [70]:
test_usuario = df_ratings[df_ratings['user_id'] == 8].sort_values('rating',ascending=False)
test_usuario.join(df_anime.set_index('anime_id')['name'], on='anime_id').head(10)

Unnamed: 0,user_id,anime_id,rating,name
901,8,6702,10,Fairy Tail
906,8,10793,10,Guilty Crown
899,8,269,9,Bleach
900,8,355,9,Shakugan no Shana
903,8,7593,9,Kiss x Sis (TV)
908,8,11757,9,Sword Art Online
909,8,13759,9,Sakurasou no Pet na Kanojo
910,8,15583,9,Date A Live
904,8,8630,8,Hidan no Aria
907,8,11241,7,Brave 10
