# Sistemas de recomendação não personalizada para filmes

Os datasets em questão podem ser encontrados nos links a seguir:

https://drive.google.com/file/d/1eqcn9uc0oEQxmYxT9lFqqRdVQAJfOpHC/view?usp=drive_link

https://drive.google.com/open?id=1GLDArjzPbdy6WFcpBNX5XfSFf3XW4kxE&usp=drive_copy

https://drive.google.com/file/d/1LWSCE7O-trTMfs8x8GJSkmdmVF85NEyH/view?usp=drive_link

# Importação de bibliotecas e módulos

In [None]:
import os
import re
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Importação e visualização de dados

In [None]:
# Read the three parquet tables stored under /content, in the order
# movies, users, ratings.
movies, users, ratings = (
    pd.read_parquet(path)
    for path in (
        '/content/movies.parquet',
        '/content/users.parquet',
        '/content/ratings.parquet',
    )
)

In [None]:
# Preview the movies table.
movies

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [None]:
# Preview the ratings table.
ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
# Preview the users table.
users

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


# Agrupamento dos dados numa única tabela

In [None]:
# Attach every rating to its movie row (default inner join on item_id).
data = pd.merge(movies, ratings, on='item_id')

In [None]:
# Add the rating user's attributes (default inner join on user_id).
data = pd.merge(data, users, on='user_id')

In [None]:
# Preview the merged movies + ratings + users table.
data

Unnamed: 0,item_id,title,genres,user_id,rating,timestamp,gender,age,occupation,zip_code
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008,F,50,9,55117
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496,M,25,12,11413
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952,M,25,17,61614
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474,F,35,1,95370
...,...,...,...,...,...,...,...,...,...,...
1000204,3952,"Contender, The (2000)",Drama|Thriller,5812,4,992072099,F,25,7,92120
1000205,3952,"Contender, The (2000)",Drama|Thriller,5831,3,986223125,M,25,1,92120
1000206,3952,"Contender, The (2000)",Drama|Thriller,5837,4,1011902656,M,25,7,60607
1000207,3952,"Contender, The (2000)",Drama|Thriller,5927,1,979852537,M,35,14,10003


# Obtendo informações gerais dos dados

In [None]:
# Print column dtypes, non-null counts and memory usage of the merged table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   item_id     1000209 non-null  int64 
 1   title       1000209 non-null  object
 2   genres      1000209 non-null  object
 3   user_id     1000209 non-null  int64 
 4   rating      1000209 non-null  int64 
 5   timestamp   1000209 non-null  int64 
 6   gender      1000209 non-null  object
 7   age         1000209 non-null  int64 
 8   occupation  1000209 non-null  int64 
 9   zip_code    1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 76.3+ MB


In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
data.describe()

Unnamed: 0,item_id,user_id,rating,timestamp,age,occupation
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,1865.54,3024.512,3.581564,972243700.0,29.73831,8.036138
std,1096.041,1728.413,1.117102,12152560.0,11.75198,6.531336
min,1.0,1.0,1.0,956703900.0,1.0,0.0
25%,1030.0,1506.0,3.0,965302600.0,25.0,2.0
50%,1835.0,3070.0,4.0,973018000.0,25.0,7.0
75%,2770.0,4476.0,4.0,975220900.0,35.0,14.0
max,3952.0,6040.0,5.0,1046455000.0,56.0,20.0


# Criando uma coluna de data a partir da coluna timestamp

In [None]:
def convert_timestamp(x):
  """Convert a Unix timestamp to the calendar date it falls on in UTC.

  The conversion is pinned to UTC so the resulting date does not depend on
  the timezone configured on the machine that runs the notebook.

  Args:
    x: Seconds since 1970-01-01 00:00:00 UTC.

  Returns:
    A ``datetime.date`` for that instant, evaluated in UTC.
  """
  return datetime.fromtimestamp(x, tz=timezone.utc).date()

# Derive a calendar-date column from the raw integer timestamps.
data['date'] = data['timestamp'].map(convert_timestamp)
data

Unnamed: 0,item_id,title,genres,user_id,rating,timestamp,gender,age,occupation,zip_code,date
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067,2001-01-06
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008,F,50,9,55117,2000-12-31
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496,M,25,12,11413,2000-12-31
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952,M,25,17,61614,2000-12-31
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474,F,35,1,95370,2000-12-31
...,...,...,...,...,...,...,...,...,...,...,...
1000204,3952,"Contender, The (2000)",Drama|Thriller,5812,4,992072099,F,25,7,92120,2001-06-09
1000205,3952,"Contender, The (2000)",Drama|Thriller,5831,3,986223125,M,25,1,92120,2001-04-02
1000206,3952,"Contender, The (2000)",Drama|Thriller,5837,4,1011902656,M,25,7,60607,2002-01-24
1000207,3952,"Contender, The (2000)",Drama|Thriller,5927,1,979852537,M,35,14,10003,2001-01-18


# Extraindo o ano de lançamento e o título a partir da coluna `title` e colocando a coluna de gêneros em formato de lista

In [None]:
# Release year written as "(YYYY)" at the very end of the title,
# e.g. "Toy Story (1995)".
_TRAILING_YEAR = re.compile(r'\((\d{4})\)\s*$')
# Fallback: the first run of four digits anywhere in the string.
_ANY_FOUR_DIGITS = re.compile(r'\d{4}')


def ano(x):
  """Extract the release year from a title such as ``"Toy Story (1995)"``.

  The parenthesised year at the end of the title is preferred, so digits
  that belong to the name itself are not mistaken for the year
  (``"2001: A Space Odyssey (1968)"`` gives 1968, not 2001). When the
  title has no trailing ``(YYYY)``, the first run of four digits is used.

  Args:
    x: Title string.

  Returns:
    The year as an ``int``.

  Raises:
    ValueError: If ``x`` contains no run of four digits.
  """
  match = _TRAILING_YEAR.search(x) or _ANY_FOUR_DIGITS.search(x)
  if match is None:
    raise ValueError(f'no four-digit year found in title: {x!r}')
  # Group 1 holds the digits for the parenthesised form; the fallback
  # pattern has no groups, so the whole match (group 0) is the year.
  return int(match.group(match.lastindex or 0))

def titulo(x):
    """Return the part of ``x`` before its first ``(``, stripped of surrounding whitespace."""
    before_paren, _, _ = x.partition('(')
    return before_paren.strip()

def listador(x):
  """Split a ``|``-delimited string into the list of its parts."""
  parts = x.split(sep='|')
  return parts

# The year has to be read from the raw title first: the next statement
# overwrites the title column with the year-less text.
data['year'] = data['title'].map(ano)
data['title'] = data['title'].map(titulo)
# Turn the pipe-separated genre string into a list of genre names.
data['genres'] = data['genres'].map(listador)

data

Unnamed: 0,item_id,title,genres,user_id,rating,timestamp,gender,age,occupation,zip_code,date,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1,5,978824268,F,1,10,48067,2001-01-06,1995
1,1,Toy Story,"[Animation, Children's, Comedy]",6,4,978237008,F,50,9,55117,2000-12-31,1995
2,1,Toy Story,"[Animation, Children's, Comedy]",8,4,978233496,M,25,12,11413,2000-12-31,1995
3,1,Toy Story,"[Animation, Children's, Comedy]",9,5,978225952,M,25,17,61614,2000-12-31,1995
4,1,Toy Story,"[Animation, Children's, Comedy]",10,5,978226474,F,35,1,95370,2000-12-31,1995
...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,3952,"Contender, The","[Drama, Thriller]",5812,4,992072099,F,25,7,92120,2001-06-09,2000
1000205,3952,"Contender, The","[Drama, Thriller]",5831,3,986223125,M,25,1,92120,2001-04-02,2000
1000206,3952,"Contender, The","[Drama, Thriller]",5837,4,1011902656,M,25,7,60607,2002-01-24,2000
1000207,3952,"Contender, The","[Drama, Thriller]",5927,1,979852537,M,35,14,10003,2001-01-18,2000


## Extraindo os N filmes mais vistos (com maior número de avaliações)

In [None]:
                                                    def topNsaw(df:'dataframe',N:'number of movies in output'):
  data2 = (
          data.groupby('title')['user_id'].count().reset_index()
          .sort_values(by='user_id',ascending=False)
          )
  data2.columns = ['title','number_ratings']
  return data2.head(N)

# Show the ten titles with the most ratings in the merged table.
print('Os dez filmes mais vistos são:')
display(topNsaw(data,10))

Os dez filmes mais vistos são:


Unnamed: 0,title,number_ratings
126,American Beauty,3428
3110,Star Wars: Episode IV - A New Hope,2991
3111,Star Wars: Episode V - The Empire Strikes Back,2990
3112,Star Wars: Episode VI - Return of the Jedi,2883
1768,Jurassic Park,2672
2854,Saving Private Ryan,2653
3250,Terminator 2: Judgment Day,2649
2084,"Matrix, The",2590
256,Back to the Future,2583
2947,"Silence of the Lambs, The",2578


## Top N - Extraindo os N filmes mais bem avaliados com um número de avaliações maior ou igual a um valor mínimo Nmin

In [None]:
def topNratings(df: pd.DataFrame, N: int, Nmin: int) -> pd.DataFrame:
  """Return the N best-rated titles among those with at least Nmin ratings.

  NOTE(review): rows are grouped by the ``title`` string alone, so distinct
  movies that share the same title text are averaged together — confirm
  whether grouping by ``item_id`` is wanted instead.

  Args:
    df: Ratings table with at least the columns ``title`` and ``rating``.
    N: Number of titles to return.
    Nmin: Minimum number of ratings a title needs to be eligible
      (inclusive: titles with exactly Nmin ratings are kept).

  Returns:
    A DataFrame with columns ``title`` and ``rating`` (the mean rating),
    sorted by ``rating`` in descending order and truncated to N rows. The
    index holds each title's position among the eligible titles in
    alphabetical order.
  """
  # One pass over the caller's frame gives both the rating count and the
  # mean per title; no need to merge the counts back onto every row.
  per_title = df.groupby('title')['rating'].agg(['count', 'mean'])
  eligible = per_title.loc[per_title['count'] >= Nmin, 'mean']
  ranked = (
    eligible.rename('rating').reset_index()
    .sort_values(by='rating', ascending=False)
  )
  return ranked.head(N)

# Show the ten best-rated titles with a threshold of 100 ratings.
# NOTE: topNratings filters with number_ratings >= Nmin, so titles with
# exactly 100 ratings are included even though the message says "mais de 100".
print('Os dez filmes mais bem avaliados com mais de 100 avaliações são:')
display(topNratings(data,10,100))


Os dez filmes mais bem avaliados com mais de 100 avaliações são:


Unnamed: 0,title,rating
1585,Seven Samurai,4.56051
1603,"Shawshank Redemption, The",4.554558
736,"Godfather, The",4.524966
381,"Close Shave, A",4.520548
1889,"Usual Suspects, The",4.517106
1567,Schindler's List,4.510417
1975,"Wrong Trousers, The",4.507937
1744,Sunset Blvd.,4.491489
1451,Raiders of the Lost Ark,4.477725
1468,Rear Window,4.47619
