# Objetivo do Projeto: 
## Criar um sistema de recomendação de filmes.

#IMPORTS

In [1]:
# Importando a base de dados de filmes para o colab
!wget -O moviedataset.zip https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip
print('unziping ...')
!unzip -o -j moviedataset.zip 

--2022-07-11 21:24:02--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 160301210 (153M) [application/zip]
Saving to: ‘moviedataset.zip’


2022-07-11 21:24:08 (27.8 MB/s) - ‘moviedataset.zip’ saved [160301210/160301210]

unziping ...
Archive:  moviedataset.zip
  inflating: links.csv               
  inflating: movies.csv              
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: tags.csv                


In [4]:
# Bibliotecas utilizadas
import pandas            as pd
import numpy             as np

import matplotlib.pyplot as plt

from math import sqrt
%matplotlib inline

## Loading data

In [3]:
# Carregando o dataframe de filmes
df_movies = pd.read_csv('movies.csv')

# Carregando o dataframe de notas
df_ratings = pd.read_csv('ratings.csv')

# Visualizando o dataframe de filmes
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#DATA PREPARATION

In [5]:
# Extraindo o ano do filme do título
df_movies['year'] = df_movies.title.str.extract('(\(\d\d\d\d\))',expand=False)

# Removendo o parentese do ano
df_movies['year'] = df_movies.year.str.extract('(\d\d\d\d)',expand=False)

# Removendo os anos do título
df_movies['title'] = df_movies.title.str.replace('(\(\d\d\d\d\))', '')

# Certificando que não há nenhum espaço nos títulos
df_movies['title'] = df_movies['title'].apply(lambda x: x.strip())
df_movies.head()

  


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [6]:
# Separano os gêneros
df_movies['genres'] = df_movies.genres.str.split('|')
df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [7]:
# Dataframe que sera criado a tabela de genero
df_movies_with_gender = df_movies.copy()

# Para cada linha no dataframe, itere através da lista de gêneros e coloque um 1 na coluna correspondente
for index, row in df_movies.iterrows():
    for genre in row['genres']:
        df_movies_with_gender.at[index, genre] = 1

# Preencher os valores NaN com 0 para mostrar que um filme não tem o gênero daquela coluna
df_movies_with_gender = df_movies_with_gender.fillna(0)
df_movies_with_gender.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Visualizando o dataframe de notas
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [9]:
# Removendo a coluna timestamp
df_ratings = df_ratings.drop('timestamp', 1)
df_ratings.head()

  


Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


#RECOMMENDATION

In [10]:
# Criando os dados de input do usuario
user_input = [ {'title':'Interstellar',         'rating':9},
               {'title':'Alien',                'rating':6.7},
               {'title':'Inglourious Basterds', 'rating':8}] 
input_movies = pd.DataFrame(user_input)
input_movies

Unnamed: 0,title,rating
0,Interstellar,9.0
1,Alien,6.7
2,Inglourious Basterds,8.0


In [11]:
# Filtrando os filmes por título
input_id = df_movies[df_movies['title'].isin(input_movies['title'].tolist())]

# Em seguida, mesclá-lo para que possamos obter o movieId. Está implicitamente mesclando por título.
input_movies = pd.merge(input_id , input_movies)

# Descartando informações que não usaremos do dataframe de entrada
input_movies = input_movies.drop('genres', 1).drop('year', 1)

# Final input dataframe
input_movies

  


Unnamed: 0,movieId,title,rating
0,1214,Alien,6.7
1,68157,Inglourious Basterds,8.0
2,109487,Interstellar,9.0


In [12]:
# Filtrando os filmes da entrada
user_movies = df_movies_with_gender[df_movies_with_gender['movieId'].isin(input_movies['movieId'].tolist())]
user_movies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
1188,1214,Alien,"[Horror, Sci-Fi]",1979,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13668,68157,Inglourious Basterds,"[Action, Drama, War]",2009,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
23044,109487,Interstellar,"[Sci-Fi, IMAX]",2014,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Redefinindo o índice para evitar problemas futuros
user_movies = user_movies.reset_index(drop=True)

# Eliminando problemas desnecessários para economizar memória e evitar problemas
user_genre_table = user_movies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
user_genre_table

  """


Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
#visualizando as notas
input_movies['rating']

0    6.7
1    8.0
2    9.0
Name: rating, dtype: float64

In [15]:
# Produto escalar para obter pesos
user_profile = user_genre_table.transpose().dot(input_movies['rating'])

# O perfil do usuário
user_profile

Adventure              0.0
Animation              0.0
Children               0.0
Comedy                 0.0
Fantasy                0.0
Romance                0.0
Drama                  8.0
Action                 8.0
Crime                  0.0
Thriller               0.0
Horror                 6.7
Mystery                0.0
Sci-Fi                15.7
IMAX                   9.0
Documentary            0.0
War                    8.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [16]:
# Agora vamos pegar os gêneros de cada filme em nosso dataframe original
genre_table = df_movies_with_gender.set_index(df_movies_with_gender['movieId'])

# E largue as informações desnecessárias
genre_table = genre_table.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
genre_table.head()

  """


Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Multiplique os gêneros pelos pesos e, em seguida, obtenha a média ponderada
df_recomendation_table = ((genre_table*user_profile).sum(axis=1))/(user_profile.sum())
df_recomendation_table.head()

movieId
1    0.000000
2    0.000000
3    0.000000
4    0.144404
5    0.000000
dtype: float64

In [19]:
# Classifique nossas recomendações em ordem decrescente
df_recomendation_table= df_recomendation_table.sort_values(ascending=False)

# Basta dar uma olhada nos valores
df_recomendation_table.head()

movieId
79132     0.734657
90249     0.734657
103651    0.734657
103253    0.734657
60684     0.734657
dtype: float64

In [20]:
# A tabela final de recomendação
df_movies.loc[df_movies['movieId'].isin(df_recomendation_table.head(10).keys())]

Unnamed: 0,movieId,title,genres,year
12873,60684,Watchmen,"[Action, Drama, Mystery, Sci-Fi, Thriller, IMAX]",2009
15562,79132,Inception,"[Action, Crime, Drama, Mystery, Sci-Fi, Thrill...",2010
17393,87520,Transformers: Dark of the Moon,"[Action, Adventure, Sci-Fi, War, IMAX]",2011
18030,90249,Real Steel,"[Action, Drama, Sci-Fi, IMAX]",2011
19799,97724,"Glamorous Life of Sachiko Hanai, The (Hatsujô ...","[Action, Comedy, Drama, Fantasy, Mystery, Sci-...",2003
21272,103253,Elysium,"[Action, Drama, Sci-Fi, IMAX]",2013
21403,103651,Tai Chi Hero,"[Action, Comedy, Drama, Fantasy, Sci-Fi, IMAX]",2012
24565,115479,"Whip Hand, The","[Action, Adventure, Crime, Drama, Sci-Fi, Thri...",1951
31062,140293,Mobile Suit Gundam II: Soldiers of Sorrow,"[Action, Adventure, Animation, Drama, Sci-Fi, ...",1981
31063,140295,Mobile Suit Gundam III: Encounters in Space,"[Action, Adventure, Animation, Drama, Sci-Fi, ...",1982
