<a href="https://colab.research.google.com/github/ClaudioDiporty/recommendation_system_movies_v2/blob/main/recommendation_system_movies_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importação das Bibliotecas



In [121]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from os import error
from scipy.sparse import csr_matrix

# Análise Exploratória dos Dados

In [None]:
# Load the movie catalogue from the CSV stored on Google Drive.
# NOTE(review): the absolute path assumes Drive is already mounted at
# /content/drive; no mount step is visible in this notebook - confirm.
# The space before the slash in "01-ARQUIVOS GOOGLE COLAB /" is part of the path.
movies = pd.read_csv('/content/drive/MyDrive/01-ARQUIVOS GOOGLE COLAB /RECOMMENDATION_SYSTEM/dataset_movies_v2/movies.csv')

In [None]:
# Preview the first two rows of the movies table.
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [None]:
# (rows, columns) of the movies table.
movies.shape

(10329, 3)

In [None]:
# Load the user ratings from the CSV stored on Google Drive.
# NOTE(review): the absolute path assumes Drive is already mounted at
# /content/drive; no mount step is visible in this notebook - confirm.
# The space before the slash in "01-ARQUIVOS GOOGLE COLAB /" is part of the path.
ratings = pd.read_csv('/content/drive/MyDrive/01-ARQUIVOS GOOGLE COLAB /RECOMMENDATION_SYSTEM/dataset_movies_v2/ratings.csv')

In [None]:
# Preview the first two rows of the raw ratings table.
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807


In [None]:
# (rows, columns) of the raw ratings table.
ratings.shape

(105339, 4)

# Pré Processamento

In [None]:
# Keep only the columns the model uses.
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]

In [None]:
# Preview the ratings table after the column selection.
ratings.head(2)

Unnamed: 0,userId,movieId,rating
0,1,16,4.0
1,1,24,1.5


In [None]:
# Switch the movie key column to snake_case.
movies = movies.rename({'movieId': 'movie_id'}, axis='columns')

In [None]:
# Preview the movies table after the rename.
movies.head(2)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [None]:
# Switch the ratings key columns to snake_case so movie_id matches the
# column name used in the movies table.
ratings = ratings.rename({'userId': 'user_id', 'movieId': 'movie_id'}, axis='columns')

In [None]:
# Preview the ratings table after the rename.
ratings.head(2)

Unnamed: 0,user_id,movie_id,rating
0,1,16,4.0
1,1,24,1.5


In [None]:
# Count how many ratings each user submitted.
ratings['user_id'].value_counts()
# e.g. user_id 668 submitted 5678 ratings

668    5678
575    2837
458    2086
232    1421
310    1287
       ... 
58       20
51       20
288      20
388      20
257      20
Name: user_id, Length: 668, dtype: int64

In [None]:
# Boolean Series indexed by user_id: True where the user has more than 10 ratings.
x = ratings['user_id'].value_counts().gt(10)

In [None]:
# user_ids whose mask entry is True, i.e. users with more than 10 ratings;
# the printed shape is how many such users there are.
y = x.loc[x].index
print(y.shape)

(668,)


In [None]:
# Keep only the ratings made by users listed in y (those with more than
# 10 ratings). Series.isin returns a boolean Series telling, element by
# element, whether the value appears in the collection passed in.
ratings = ratings.loc[ratings['user_id'].isin(y)]

# Juntando as tabelas (Join ou Merge)

In [None]:
# Attach the movie metadata to every rating; movie_id is the key column
# shared by both tables.
ratings_with_movies = pd.merge(movies, ratings, on='movie_id')

In [None]:
# Put the identifier columns first, followed by the descriptive ones.
ratings_with_movies = ratings_with_movies.loc[
    :, ['user_id', 'movie_id', 'title', 'genres', 'rating']
]

In [None]:
# Preview the first three rows of the merged table.
ratings_with_movies.head(3)

Unnamed: 0,user_id,movie_id,title,genres,rating
0,2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0
1,5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
2,8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0


In [None]:
# (rows, columns) of the merged table.
ratings_with_movies.shape

(105339, 5)

In [None]:
# Count how many ratings each title received; reset_index turns the title
# index back into a regular column.
number_rating = (
    ratings_with_movies
    .groupby('title')['rating']
    .agg('count')
    .reset_index()
)

In [None]:
# Preview the per-title rating counts (the count column is still named 'rating').
number_rating.head(3)

Unnamed: 0,title,rating
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),1


In [None]:
# Give the count column a name that will not clash with the per-user
# 'rating' column once the tables are merged.
number_rating = number_rating.rename(columns={'rating': 'number_of_ratings'})

In [None]:
# Preview the per-title rating counts after the rename.
number_rating.head(3)

Unnamed: 0,title,number_of_ratings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),1


In [None]:
# Build the final table: every rating row gains the total number of ratings
# of its title (joined on the title column).
final_rating = pd.merge(ratings_with_movies, number_rating, on='title')

In [None]:
# Preview the first five rows of the final table.
final_rating.head(5)

Unnamed: 0,user_id,movie_id,title,genres,rating,number_of_ratings
0,2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,232
1,5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,232
2,8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,232
3,11,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,232
4,14,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,232


In [None]:
# Reorder the columns for easier reading.
final_rating = final_rating.loc[
    :, ['user_id', 'movie_id', 'rating', 'title', 'genres', 'number_of_ratings']
]

In [None]:
# Preview the final table after reordering the columns.
final_rating.head(3)

Unnamed: 0,user_id,movie_id,rating,title,genres,number_of_ratings
0,2,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,232
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,232
2,8,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,232


In [None]:
# (rows, columns) of the final table before any popularity filter.
final_rating.shape

(105339, 6)

In [None]:
# Keep only rows whose title has at least 50 ratings.
final_rating = final_rating.loc[final_rating['number_of_ratings'].ge(50)]

In [None]:
# (rows, columns) after keeping only titles with at least 50 ratings.
final_rating.shape

(43566, 6)

In [None]:
# Keep a single rating per (user_id, title) pair so no user counts more than
# once for the same film (the first occurrence is kept, as before).
# The result is reassigned instead of using inplace=True: final_rating was
# itself produced by filtering another frame, and mutating such a frame in
# place is the pattern pandas can flag with SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(subset=['user_id', 'title'])

In [None]:
# (rows, columns) after dropping duplicate (user_id, title) pairs.
final_rating.shape

(43566, 6)

# Transpor a Tabela Final usando pivot
#### Transpor os usuários em colunas, pois as avaliações dadas serão as variáveis da máquina preditiva

In [None]:
# Reshape to one row per title and one column per user, with the rating as
# the cell value; combinations that do not occur become NaN.
movies_pivot = final_rating.pivot(
    index='title',
    columns='user_id',
    values='rating',
)

In [None]:
# Preview the first five titles of the title x user table.
movies_pivot.head(5)

user_id,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),,,,,,5.0,,,,5.0,...,2.5,,,,,,,4.0,,2.0
12 Angry Men (1957),,,,5.0,,,,,,,...,4.5,,,,,,,5.0,,4.5
2001: A Space Odyssey (1968),,,,,,,,,,,...,,,,5.0,,,,,,3.0
28 Days Later (2002),,,,,,,2.5,,,,...,,,,,,,,,,3.5
300 (2007),,,,,,,,,,,...,3.5,,,,,,,,,2.5


In [None]:
# (number of titles, number of users) in the pivoted table.
movies_pivot.shape

(455, 666)

In [None]:
# Replace every NaN (no rating for that title/user pair) with 0.
movies_pivot = movies_pivot.fillna(0)

In [None]:
# Preview the pivoted table after the NaN values were replaced by 0.
movies_pivot.head(5)

user_id,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.5
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5


In [122]:
# Training matrix: store the zero-filled title x user table in compressed
# sparse row format.
movies_sparse = csr_matrix(movies_pivot.to_numpy())

# Criação do Sistema de Recomendação

In [139]:
# Fit a brute-force nearest-neighbour index over the title rows, comparing
# titles by Manhattan distance between their rating vectors.
model = NearestNeighbors(metric='manhattan', algorithm='brute')
model.fit(movies_sparse)

NearestNeighbors(algorithm='brute', metric='manhattan')

In [140]:
# (number of titles, number of users) of the table the model was fitted on.
movies_pivot.shape

(455, 666)

In [141]:
# Request 455 rows, i.e. every row of movies_pivot according to the (455, 666)
# shape shown just above; the notebook display still elides the middle rows.
movies_pivot.head(455)

user_id,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.5
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wizard of Oz, The (1939)",0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,5.0
X-Men (2000),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0
X2: X-Men United (2003),0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
You've Got Mail (1998),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.5,3.0


In [142]:
# Query the fitted model with the ratings row at position 452 of movies_pivot.
# Selecting with a one-element list keeps the row two-dimensional
# (1 x number of users), the layout kneighbors expects for a single query.
# NOTE(review): this row is presumably also part of the data the model was
# fitted on, so the queried title itself is expected among the suggestions.
distances, suggestions = model.kneighbors(movies_pivot.iloc[[452]].to_numpy())

In [143]:
# suggestions holds one row of neighbour positions per query; map each row
# back to the movie titles stored in the pivot index.
for neighbor_positions in suggestions:
  print(movies_pivot.index[neighbor_positions])

Index(['X2: X-Men United (2003)', 'Spider-Man 2 (2004)',
       'Matrix Revolutions, The (2003)', 'Signs (2002)',
       'Star Wars: Episode III - Revenge of the Sith (2005)'],
      dtype='object', name='title')
