# Sistema de Recomendação

In [1]:
# importando pacotes necessários
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### Importando as bases de dados
Temos duas bases: uma com os filmes, outra com as classificações dos filmes (ratings)

In [2]:
# Load the movies dataset, keeping only the id and title columns
movies_df = pd.read_csv(
    './data/movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Number of rows and columns in the movies dataset
movies_df.shape

# the displayed output reports 9742 movies

(9742, 2)

In [5]:
# Load the ratings dataset: one row per (userId, movieId) pair with its rating
rating_df = pd.read_csv(
    './data/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
# Number of rows and columns in the ratings dataset
rating_df.shape

# the displayed output reports a little over 100k ratings

(100836, 3)

In [8]:
# Attach each movie's title to its ratings so everything lives in one DataFrame
# (default inner join on movieId: ratings without a matching movie are dropped)
df = rating_df.merge(movies_df, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [10]:
# Keep only the rating rows that have a title.
# NOTE(review): the inner merge that builds `df` already drops ratings without a
# matching movie, so this presumably only matters if movies.csv itself contains
# missing titles - confirm.
combine_movie_rating = df[df['title'].notna()]

In [14]:
# Count how many ratings each movie received (group by title, count the ratings).
# Grouping is done on the title text, so rows that share a title are counted
# together regardless of their movieId.
movie_ratingCount = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .reset_index()
)

In [18]:
# Rename the count column from 'rating' to 'totalRatingCount'
movie_ratingCount = movie_ratingCount.rename(columns = {'rating': 'totalRatingCount'})

In [19]:
# Preview the per-title rating counts
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [20]:
# Bring the per-title rating count back onto every rating row (left join on title)
rating_with_totalRatingCount = combine_movie_rating.merge(
    movie_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [21]:
# Look at the distribution of the number of ratings per movie.
# set_option changes pandas' display setting itself, so floats are rendered with
# 3 decimal places from this cell onwards, not only in this output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
movie_ratingCount['totalRatingCount'].describe()

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64

In [24]:
# Keep only movies with at least `popularity_threshold` ratings so that rarely
# rated titles do not pollute the recommendations
popularity_threshold = 50
rating_popular_movie = rating_with_totalRatingCount[
    rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
]

In [25]:
# Number of distinct movie titles left after the popularity filter
# (dropna=False keeps the count identical to len(unique()))
rating_popular_movie['title'].nunique(dropna=False)

450

In [26]:
# Pivot into a movie x user table: one row per title, one column per userId,
# cells hold the rating. pivot_table aggregates duplicate (title, userId) pairs
# with its default aggregation; pairs with no rating are filled with 0.
movie_features_df = (
    rating_popular_movie
    .pivot_table(index='title', columns='userId', values='rating')
    .fillna(0)
)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [27]:
# The movie x user table is very sparse (mostly zeros, few actual ratings), so
# store its values in compressed sparse row (CSR) form instead of a dense array.
# csr_matrix and NearestNeighbors are imported in the notebook's top import cell,
# so every dependency is visible in one place and the cell order cannot hide one.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

# Nearest-neighbour model over the movie rows: brute-force search using cosine
# distance between the rating vectors of two movies.
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [28]:
# Pick a pseudo-random movie. The generator is seeded so that Restart & Run All
# always selects the same movie and the recommendations below are reproducible;
# change QUERY_SEED to explore other movies.
QUERY_SEED = 42
rng = np.random.default_rng(QUERY_SEED)
query_index = int(rng.integers(movie_features_df.shape[0]))

# Ask for 6 neighbours: the query row is itself part of the fitted data, so it is
# expected to come back among them, leaving 5 other movies as recommendations.
# kneighbors expects a 2-D input, hence the reshape of the single row to (1, n_users).
query_vector = movie_features_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = 6)
print(f'Filme: {movie_features_df.index[query_index]} - índice {query_index}')

Filme: Inglourious Basterds (2009) - índice 217


In [29]:
# Header: the movie the recommendations are for
print('Recomendações para {0}:\n'.format(movie_features_df.index[query_index]))

# kneighbors returns arrays with one row per query; flatten them once to 1-D
neighbor_distances = distances.flatten()
neighbor_indices = indices.flatten()

# Exclude the query movie by its index rather than by assuming it sits at
# position 0: when another movie ties with it at the same cosine distance
# (identical or proportional rating vectors) the query may not come first.
# The slice keeps the number of printed recommendations at n_neighbors - 1
# even if the query is not among the returned neighbours.
recommendations = [
    (movie_idx, distance)
    for movie_idx, distance in zip(neighbor_indices, neighbor_distances)
    if movie_idx != query_index
][:len(neighbor_indices) - 1]

for rank, (movie_idx, distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, com a distância de {2}:'.format(rank, movie_features_df.index[movie_idx], distance))

Recomendações para Inglourious Basterds (2009):

1: Django Unchained (2012), com a distância de 0.3470427393913269:
2: Inception (2010), com a distância de 0.35389697551727295:
3: Shutter Island (2010), com a distância de 0.4295194745063782:
4: Dark Knight, The (2008), com a distância de 0.44073379039764404:
5: Social Network, The (2010), com a distância de 0.4667876958847046:
