In [115]:
import pandas as pd
import numpy as np

pd.options.display.max_colwidth
pd.options.display.max_info_columns

100

In [23]:
#Importar o arquivo com os filmes e visualizar as primeiras linhas

#False = Importar o arquivo mais rapido pois ele torna o conteúdo do arquivo como String
movies = pd.read_csv('movies_metadata.csv', low_memory = False)
movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [26]:
# Importando o arquivo de avaliação e avaliando as primeiras linhas

rating = pd.read_csv('ratings.csv')
rating.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523


## Pré Processamento dos Dados

In [27]:
# Filtrando somente as colunas necessários e renomeando nome das variáveis

# Selecionar somente as variaveis que utilizarei
movies = movies[['id', 'original_title', 'original_language', 'vote_count']]

#Renomear as variaveis
movies.rename(columns = {'id': 'ID_FILME', 'original_title':'TITULO', 'original_language' : 'LINGUAGEM', 'vote_count' : 'QT_AVALIACOES'}, inplace = True)

movies.head(3)

Unnamed: 0,ID_FILME,TITULO,LINGUAGEM,QT_AVALIACOES
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0


In [28]:
rating = rating[['userId', 'movieId', 'rating']]

rating.rename(columns = {'userId' : 'ID_USUARIO', 'movieId' : 'ID_FILME', 'rating' : 'AVALIACAO'}, inplace = True)

rating.head(3)

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0


In [29]:
# Verificando se há valores nulos

movies.isna().sum()

ID_FILME          0
TITULO            0
LINGUAGEM        11
QT_AVALIACOES     6
dtype: int64

In [31]:
# Removendo valores nulos

movies.dropna(inplace = True)

In [32]:
#Verificando se há valores nulos ainda

movies.isna().sum()

ID_FILME         0
TITULO           0
LINGUAGEM        0
QT_AVALIACOES    0
dtype: int64

In [33]:
#Verificando a quantidade de avaliações por usuario

rating['ID_USUARIO'].value_counts()

45811     18276
8659       9279
270123     7638
179792     7515
228291     7410
          ...  
30155         1
9641          1
164717        1
243426        1
234625        1
Name: ID_USUARIO, Length: 270896, dtype: int64

In [34]:
# Vamos pegar o ID_USUARIO somente de usuários que fizeram mais de 999 avaliações

qt_rating = rating['ID_USUARIO'].value_counts() > 999
y = qt_rating[qt_rating].index
y.shape

(2509,)

In [35]:
# Visualizando os usuarios selecionados
y

Int64Index([ 45811,   8659, 270123, 179792, 228291, 243443,  98415, 229879,
             98787, 172224,
            ...
            227649, 244253, 257117,  30733, 196384,  53075, 220764, 214328,
             14354, 182812],
           dtype='int64', length=2509)

In [36]:
rating.shape

(26024289, 3)

In [37]:
# Pegando as avaliacoes dos usuarios que avaliaram mais de 999 vezes
rating = rating[rating['ID_USUARIO'].isin(y)]

rating.shape

(3844582, 3)

In [40]:
movies.head()

Unnamed: 0,ID_FILME,TITULO,LINGUAGEM,QT_AVALIACOES
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [41]:
# Filmes que possuem somente mais de 999 avaliacoes
movies = movies[movies['QT_AVALIACOES'] > 999]

In [44]:
# Vamos agrupar e visualizar a quantidade de filmes pela linguagem
movies_language = movies['LINGUAGEM'].value_counts()
movies_language.head(15)

en    1100
fr       5
ja       5
it       3
ko       2
pt       1
de       1
es       1
cn       1
sv       1
id       1
Name: LINGUAGEM, dtype: int64

In [45]:
# Selecionar somente os filmes da linguagem EN
movies = movies[movies['LINGUAGEM'] == 'en']

In [48]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID_FILME       1100 non-null   object 
 1   TITULO         1100 non-null   object 
 2   LINGUAGEM      1100 non-null   object 
 3   QT_AVALIACOES  1100 non-null   float64
dtypes: float64(1), object(3)
memory usage: 43.0+ KB


In [49]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3844582 entries, 17291 to 26023521
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   ID_USUARIO  int64  
 1   ID_FILME    int64  
 2   AVALIACAO   float64
dtypes: float64(1), int64(2)
memory usage: 117.3 MB


## Convertendo tipo de variáveis

In [51]:
movies['ID_FILME'] = movies['ID_FILME'].astype(int)
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID_FILME       1100 non-null   int32  
 1   TITULO         1100 non-null   object 
 2   LINGUAGEM      1100 non-null   object 
 3   QT_AVALIACOES  1100 non-null   float64
dtypes: float64(1), int32(1), object(2)
memory usage: 38.7+ KB


In [52]:
movies.shape

(1100, 4)

## Concatenando os dataframes

In [54]:
rating_and_movies = rating.merge(movies, on = 'ID_FILME')
rating_and_movies.head()

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO,TITULO,LINGUAGEM,QT_AVALIACOES
0,229,12,1.0,Finding Nemo,en,6292.0
1,741,12,3.0,Finding Nemo,en,6292.0
2,1932,12,0.5,Finding Nemo,en,6292.0
3,3437,12,2.0,Finding Nemo,en,6292.0
4,3694,12,1.5,Finding Nemo,en,6292.0


In [57]:
rating_and_movies.shape

(189882, 6)

In [58]:
rating_and_movies.isna().sum()

ID_USUARIO       0
ID_FILME         0
AVALIACAO        0
TITULO           0
LINGUAGEM        0
QT_AVALIACOES    0
dtype: int64

In [120]:
rating_and_movies.head(10)


Unnamed: 0,ID_USUARIO,AVALIACAO,TITULO,LINGUAGEM,QT_AVALIACOES
0,229,1.0,Finding Nemo,en,6292.0
1,741,3.0,Finding Nemo,en,6292.0
2,1932,0.5,Finding Nemo,en,6292.0
3,3437,2.0,Finding Nemo,en,6292.0
4,3694,1.5,Finding Nemo,en,6292.0
5,3950,1.0,Finding Nemo,en,6292.0
6,4294,2.0,Finding Nemo,en,6292.0
7,4540,2.0,Finding Nemo,en,6292.0
8,4916,3.0,Finding Nemo,en,6292.0
9,4932,2.0,Finding Nemo,en,6292.0


In [61]:
# Removendo duplicados para evitar usuários que avaliaram o mesmo filme mais de uma vez

rating_and_movies.drop_duplicates(['ID_USUARIO', 'ID_FILME'], inplace = True)

In [62]:
rating_and_movies.shape

(189882, 6)

In [63]:
# Excluindo a variavel ID_FILME
del rating_and_movies['ID_FILME']

In [121]:
rating_and_movies.head(10)

Unnamed: 0,ID_USUARIO,AVALIACAO,TITULO,LINGUAGEM,QT_AVALIACOES
0,229,1.0,Finding Nemo,en,6292.0
1,741,3.0,Finding Nemo,en,6292.0
2,1932,0.5,Finding Nemo,en,6292.0
3,3437,2.0,Finding Nemo,en,6292.0
4,3694,1.5,Finding Nemo,en,6292.0
5,3950,1.0,Finding Nemo,en,6292.0
6,4294,2.0,Finding Nemo,en,6292.0
7,4540,2.0,Finding Nemo,en,6292.0
8,4916,3.0,Finding Nemo,en,6292.0
9,4932,2.0,Finding Nemo,en,6292.0


## Criar uma pivot
movies_pivot = rating_and_movies.pivot_table(columns = 'ID_USUARIO', index = 'TITULO', values= 'AVALIACAO')

movies_pivot.head(200)

In [122]:
movies_pivot.fillna(0, inplace = True)
movies_pivot.head()

ID_USUARIO,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
TITULO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0
127 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.0,0.0,0.0,0.0,0.0
2 Fast 2 Furious,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## CSR_MATRIX do Pacote SciPy

In [96]:
from scipy.sparse import csr_matrix

movies_sparse = csr_matrix(movies_pivot)

type(movies_sparse)

scipy.sparse._csr.csr_matrix

In [97]:
# Algoritmo KNN do SciKit Learn
from sklearn.neighbors import NearestNeighbors

modelo = NearestNeighbors(algorithm = 'brute')
modelo.fit(movies_sparse)

NearestNeighbors(algorithm='brute')

## Previsão de Sugestão de Filmes

In [98]:
# Escolha um filme para ver a previsão

distances, sugestions = modelo.kneighbors(movies_pivot.filter(items = ['2 Fast 2 Furious'], axis = 0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(movies_pivot.index[sugestions[i]])

Index(['2 Fast 2 Furious', 'Bambi', 'The Matrix Reloaded',
       'Brokeback Mountain', 'Lord of War'],
      dtype='object', name='TITULO')
