# SISTEMA DE RECOMENDAÇÃO DE FILMES

In [1]:
# Importando os pacotes a serem utilizados
import pandas as pd
import numpy as np

### Tratamento de dados: Filmes

In [2]:
# Importar o arquivo com os filmes e visualizar as primeiras linhas
filmes = pd.read_csv('movies_metadata.csv', low_memory = False)
filmes.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [3]:
# Seleciona somente as variaveis que iremos utilizar
filmes = filmes [['id','original_title','original_language','vote_count']]

# Renomeia as variaveis
filmes.rename(columns = {'id':'ID_FILME','original_title':'TITULO','original_language':'LINGUAGEM','vote_count':'QT_AVALIACOES'}, inplace = True)

# Exibe as primeiras linhas do arquivo tratado
filmes.head()

Unnamed: 0,ID_FILME,TITULO,LINGUAGEM,QT_AVALIACOES
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [4]:
# Verificando se há valores nulos
filmes.isna().sum()

ID_FILME          0
TITULO            0
LINGUAGEM        11
QT_AVALIACOES     6
dtype: int64

In [5]:
# Como são poucos os valores nulos iremos remover porque não terá impacto nenhum
filmes.dropna(inplace = True)

In [7]:
# Verificando se há valores nulos
filmes.isna().sum()

ID_FILME         0
TITULO           0
LINGUAGEM        0
QT_AVALIACOES    0
dtype: int64

In [8]:
# Visualizando o DataFrame Filmes
filmes.head()

Unnamed: 0,ID_FILME,TITULO,LINGUAGEM,QT_AVALIACOES
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [9]:
# Visualizando o DataFrame Filmes
filmes.head()

Unnamed: 0,ID_FILME,TITULO,LINGUAGEM,QT_AVALIACOES
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [10]:
# Vamos agrupar e visualizar a quantidade de filmes pela linguagem
filmes_linguagem = filmes['LINGUAGEM'].value_counts()
filmes_linguagem.head(20)

LINGUAGEM
en    32267
fr     2438
it     1529
ja     1349
de     1080
es      994
ru      826
hi      508
ko      444
zh      409
sv      384
pt      316
cn      313
fi      297
nl      248
da      225
pl      219
tr      150
cs      130
el      113
Name: count, dtype: int64

In [11]:
# Selecionar somente os filmes da linguagem EN (English)
filmes = filmes[filmes['LINGUAGEM'] == 'en']

In [12]:
# Visualizar os tipos de dados das variaveis
filmes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32267 entries, 0 to 45465
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID_FILME       32267 non-null  object 
 1   TITULO         32267 non-null  object 
 2   LINGUAGEM      32267 non-null  object 
 3   QT_AVALIACOES  32267 non-null  float64
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [13]:
# Precisamos converter a variavel ID_FILME em inteiro
filmes['ID_FILME'] = filmes['ID_FILME'].astype(int)

In [14]:
# Verificando a quantidade de filmes pelo tamanho do arquivo
filmes.shape

(32267, 4)

In [15]:
filmes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32267 entries, 0 to 45465
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID_FILME       32267 non-null  int32  
 1   TITULO         32267 non-null  object 
 2   LINGUAGEM      32267 non-null  object 
 3   QT_AVALIACOES  32267 non-null  float64
dtypes: float64(1), int32(1), object(2)
memory usage: 1.1+ MB


### Tratamento de dados: Avaliações

In [16]:
# Importando o arquivo de avaliações e avaliando as primeiras linhas
avaliacoes = pd.read_csv('ratings.csv')
avaliacoes.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [17]:
# Seleciona somente as variaveis que iremos utilizar
avaliacoes = avaliacoes [['userId','movieId','rating']]

# Renomeia as variaveis
avaliacoes.rename(columns = {'userId':'ID_USUARIO','movieId':'ID_FILME','rating':'AVALIACAO'}, inplace = True)

# Exibe as primeiras linhas do arquivo tratado
avaliacoes.head()

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [18]:
# Verificando se há valores nulos
avaliacoes.isna().sum()

ID_USUARIO    0
ID_FILME      0
AVALIACAO     0
dtype: int64

In [19]:
# Verificando a quantidade de avaliacoes por usuarios
avaliacoes['ID_USUARIO'].value_counts()

ID_USUARIO
45811     18276
8659       9279
270123     7638
179792     7515
228291     7410
          ...  
30155         1
9641          1
164717        1
243426        1
234625        1
Name: count, Length: 270896, dtype: int64

In [20]:
# Vamos pegar o ID_USUARIO somente de usuários que fizeram mais de 999 avaliações
qt_avaliacoes = avaliacoes['ID_USUARIO'].value_counts() > 999
y = qt_avaliacoes[qt_avaliacoes].index
y.shape

(2509,)

In [21]:
# Visualizando os usuarios selecionados
y

Index([ 45811,   8659, 270123, 179792, 228291, 243443,  98415, 229879,  98787,
       172224,
       ...
       227649, 244253, 257117,  30733, 196384,  53075, 220764, 214328,  14354,
       182812],
      dtype='int64', name='ID_USUARIO', length=2509)

In [22]:
# visualizando o tamanho do dataset Avaliações
avaliacoes.shape

(26024289, 3)

In [23]:
# Pegando somente avaliacoes dos usuarios que avaliaram mais de 999 vezes
avaliacoes = avaliacoes[avaliacoes['ID_USUARIO'].isin(y)]

In [24]:
# visualizando o tamanho do dataset Avaliações
avaliacoes.shape

(3844582, 3)

In [25]:
# Visualizando os DataFrame Avaliacoes
avaliacoes.head()

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO
17291,229,1,3.0
17292,229,2,3.0
17293,229,4,2.0
17294,229,5,1.0
17295,229,7,2.0


In [26]:
# Visualizar os tipos de dados das variaveis
avaliacoes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3844582 entries, 17291 to 26023521
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   ID_USUARIO  int64  
 1   ID_FILME    int64  
 2   AVALIACAO   float64
dtypes: float64(1), int64(2)
memory usage: 117.3 MB


## Filmes e Avaliações: DataFrames unidos

In [27]:
# Concatenando os dataframes
avaliacoes_e_filmes = avaliacoes.merge(filmes, on = 'ID_FILME')
avaliacoes_e_filmes.head()

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO,TITULO,LINGUAGEM,QT_AVALIACOES
0,229,5,1.0,Four Rooms,en,539.0
1,229,12,1.0,Finding Nemo,en,6292.0
2,229,16,3.0,Dancer in the Dark,en,392.0
3,229,21,3.0,The Endless Summer,en,23.0
4,229,25,4.0,Jarhead,en,776.0


In [28]:
# Verificando a quantidade de filmes com avaliacoes pelo tamanho do arquivo
avaliacoes_e_filmes.shape

(1007919, 6)

In [29]:
# Verificando se há valores nulos
avaliacoes_e_filmes.isna().sum()

ID_USUARIO       0
ID_FILME         0
AVALIACAO        0
TITULO           0
LINGUAGEM        0
QT_AVALIACOES    0
dtype: int64

In [30]:
# Vamos visualizar as primeiras 20 linhas do arquivo
avaliacoes_e_filmes.head(20)

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO,TITULO,LINGUAGEM,QT_AVALIACOES
0,229,5,1.0,Four Rooms,en,539.0
1,229,12,1.0,Finding Nemo,en,6292.0
2,229,16,3.0,Dancer in the Dark,en,392.0
3,229,21,3.0,The Endless Summer,en,23.0
4,229,25,4.0,Jarhead,en,776.0
5,229,70,3.0,Million Dollar Baby,en,2519.0
6,229,77,3.0,Memento,en,4168.0
7,229,85,3.0,Raiders of the Lost Ark,en,3949.0
8,229,93,2.0,Anatomy of a Murder,en,207.0
9,229,106,4.0,Predator,en,2129.0


In [31]:
# Vamos descartar os valores duplicados, para que não tenha problemas de termos o mesmo usuário avaliando o mesmo filme diversas vezes
avaliacoes_e_filmes.drop_duplicates(['ID_USUARIO','ID_FILME'], inplace = True)

In [32]:
# Visualizando se houve alteração na quantidade de registros
avaliacoes_e_filmes.shape

(1007619, 6)

In [33]:
# Vamos excluir a variavel ID_FILME porque não iremos utiliza-la
del avaliacoes_e_filmes['ID_FILME']

In [34]:
# DataFrame sem a variavel ID_FILME
avaliacoes_e_filmes.head(20)

Unnamed: 0,ID_USUARIO,AVALIACAO,TITULO,LINGUAGEM,QT_AVALIACOES
0,229,1.0,Four Rooms,en,539.0
1,229,1.0,Finding Nemo,en,6292.0
2,229,3.0,Dancer in the Dark,en,392.0
3,229,3.0,The Endless Summer,en,23.0
4,229,4.0,Jarhead,en,776.0
5,229,3.0,Million Dollar Baby,en,2519.0
6,229,3.0,Memento,en,4168.0
7,229,3.0,Raiders of the Lost Ark,en,3949.0
8,229,2.0,Anatomy of a Murder,en,207.0
9,229,4.0,Predator,en,2129.0


In [35]:
# Agora precisamos fazer um PIVOT. O que queremos é que cada ID_USUARIO seja uma variavel com o respectivo valor de nota para cada filme avaliado
filmes_pivot = avaliacoes_e_filmes.pivot_table(columns = 'ID_USUARIO', index = 'TITULO', values = 'AVALIACAO')
# Avaliar o arquivo transformado para PIVOT 
filmes_pivot.head(20)

ID_USUARIO,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
TITULO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,,,,2.0,,,,3.0,,,...,,,,,2.0,,,,,
$5 a Day,,,,,,,,,,,...,,,,,2.5,2.0,,,,
'Gator Bait,,,,,,,,,,,...,,,,,,,,,,5.0
'R Xmas,,,3.5,,,,,,,,...,,,,,2.0,,,,3.0,
'Twas the Night Before Christmas,,,,,,,,,,,...,,,,,,,,,,
(A)Sexual,,,,,,,,,,,...,,,,,,,,,,
...And the Pursuit of Happiness,,,,,,,,,,,...,,,,,1.5,,,,,5.0
10 Items or Less,,,,,,,,,,,...,,,,,3.0,,,,,
10 Things I Hate About You,,,,,,,,,,,...,,2.5,,3.0,3.0,,,,,
"10,000 BC",,,,,,,,,,,...,,,,,,,,,,


In [36]:
# Os valores que são nulos iremos preencher com ZERO
filmes_pivot.fillna(0, inplace = True)
filmes_pivot.head()

ID_USUARIO,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
TITULO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
$5 a Day,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.0,0.0,0.0,0.0,0.0
'Gator Bait,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
'R Xmas,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0
'Twas the Night Before Christmas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Vamos importar o csr_matrix do pacote SciPy
# Esse método possibilita criarmos uma matriz sparsa
from scipy.sparse import csr_matrix
# Vamos transformar o nosso dataset em uma matriz sparsa
filmes_sparse = csr_matrix(filmes_pivot)

In [38]:
# Tipo do objeto
type(filmes_sparse)

scipy.sparse._csr.csr_matrix

In [39]:
# Vamos importar o algoritmo KNN do SciKit Learn
from sklearn.neighbors import NearestNeighbors 

In [40]:
# Criando e treinando o modelo preditivo
modelo = NearestNeighbors(algorithm = 'brute')
modelo.fit(filmes_sparse)

## Previsões de sugestões de filmes

In [41]:
# Toy Story
distances, sugestions = modelo.kneighbors(filmes_pivot.filter(items = ['Toy Story'], axis=0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]]) 

Index(['Toy Story', 'Eragon', 'K-PAX',
       'Things to Do in Denver When You're Dead', 'Meet the Fockers'],
      dtype='object', name='TITULO')


In [42]:
# Star Wars
distances, sugestions = modelo.kneighbors(filmes_pivot.filter(items = ['Star Wars'], axis=0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]]) 

Index(['Star Wars', 'Aliens vs Predator: Requiem', 'Night on Earth',
       'The Living Daylights', '2001: A Space Odyssey'],
      dtype='object', name='TITULO')


In [43]:
# Como se Fosse a Primeira Vez
distances, sugestions = modelo.kneighbors(filmes_pivot.filter(items = ['50 First Dates'], axis=0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]]) 

Index(['50 First Dates', 'Ghost World', 'Adventures in Dinosaur City',
       'Santa Fe Trail', 'Unguarded'],
      dtype='object', name='TITULO')


In [44]:
# Os Caça-Fantasmas
distances, sugestions = modelo.kneighbors(filmes_pivot.filter(items = ['Ghostbusters'], axis=0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]]) 

Index(['Ghostbusters', 'Annabel Takes a Tour', 'The Last Outpost',
       'Cheap Thrills', 'Brokeback Mountain'],
      dtype='object', name='TITULO')


# FIM