# Coleta de Dados do IMDB

## Download e descompressão dos arquivos:

In [46]:
import os
import wget
import gzip
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import shutil

In [47]:
%matplotlib inline

In [48]:
os.chdir("../microdados")
os.getcwd()

'/home/antero/Documentos/tcc/microdados'

In [49]:
url_download_title = "https://datasets.imdbws.com/title.basics.tsv.gz"
wget.download(url_download_title)

'title.basics.tsv (2).gz'

In [50]:
url_download_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
wget.download(url_download_ratings)

'title.ratings.tsv (2).gz'

In [51]:
with gzip.open('title.basics.tsv.gz', 'rb') as f_in:
    with open('title.basics.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [52]:
with gzip.open('title.ratings.tsv.gz', 'rb') as f_in:
    with open('title.ratings.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

## Lendo dados dos arquivos TSV e importando para DataFrame Pandas

In [53]:
df_imdb_basics = pd.read_csv('title.basics.tsv', sep='\t', na_values='\\N', low_memory=False)
df_imdb_ratings = pd.read_csv('title.ratings.tsv', sep='\t', na_values='\\N', low_memory=False)

### Explorando os DataFrames

In [54]:
df_imdb_basics.shape

(7979322, 9)

In [55]:
df_imdb_ratings.shape

(1161026, 3)

In [56]:
df_imdb_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7979322 entries, 0 to 7979321
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         float64
 5   startYear       float64
 6   endYear         float64
 7   runtimeMinutes  object 
 8   genres          object 
dtypes: float64(3), object(6)
memory usage: 547.9+ MB


In [57]:
df_imdb_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161026 entries, 0 to 1161025
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1161026 non-null  object 
 1   averageRating  1161026 non-null  float64
 2   numVotes       1161026 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 26.6+ MB


In [58]:
df_imdb_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short"


In [59]:
df_imdb_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1708
1,tt0000002,6.1,210
2,tt0000003,6.5,1466
3,tt0000004,6.1,123
4,tt0000005,6.2,2268


### Unindo os DataFrames

In [60]:
df_imdb = df_imdb_basics.merge(df_imdb_ratings, on='tconst', how='inner')

In [61]:
df_imdb.shape

(1161026, 11)

In [62]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1161026 entries, 0 to 1161025
Data columns (total 11 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   tconst          1161026 non-null  object 
 1   titleType       1161026 non-null  object 
 2   primaryTitle    1161026 non-null  object 
 3   originalTitle   1161026 non-null  object 
 4   isAdult         1161026 non-null  float64
 5   startYear       1160838 non-null  float64
 6   endYear         31844 non-null    float64
 7   runtimeMinutes  847069 non-null   object 
 8   genres          1138689 non-null  object 
 9   averageRating   1161026 non-null  float64
 10  numVotes        1161026 non-null  int64  
dtypes: float64(4), int64(1), object(6)
memory usage: 106.3+ MB


In [63]:
df_imdb.describe()

Unnamed: 0,isAdult,startYear,endYear,averageRating,numVotes
count,1161026.0,1160838.0,31844.0,1161026.0,1161026.0
mean,0.02008396,2000.332,2003.512279,6.90279,953.2102
std,2.650343,21.33694,15.831654,1.400439,16124.55
min,0.0,1874.0,1933.0,1.0,5.0
25%,0.0,1993.0,1995.0,6.1,9.0
50%,0.0,2008.0,2008.0,7.1,20.0
75%,0.0,2015.0,2016.0,7.9,79.0
max,2020.0,2021.0,2022.0,10.0,2403003.0


In [64]:
df_imdb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short",5.7,1708
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short",6.1,210
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4,"Animation,Comedy,Romance",6.5,1466
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short",6.1,123
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short",6.2,2268


### Filtrando Dados

#### Filtrtando colunas desnecessárias

In [65]:
dropcols_imdb=['tconst','endYear', 'runtimeMinutes', 'isAdult']
df_imdb.drop(columns=dropcols_imdb, inplace=True)

In [66]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1161026 entries, 0 to 1161025
Data columns (total 7 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   titleType      1161026 non-null  object 
 1   primaryTitle   1161026 non-null  object 
 2   originalTitle  1161026 non-null  object 
 3   startYear      1160838 non-null  float64
 4   genres         1138689 non-null  object 
 5   averageRating  1161026 non-null  float64
 6   numVotes       1161026 non-null  int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 70.9+ MB


In [67]:
df_imdb.describe()

Unnamed: 0,startYear,averageRating,numVotes
count,1160838.0,1161026.0,1161026.0
mean,2000.332,6.90279,953.2102
std,21.33694,1.400439,16124.55
min,1874.0,1.0,5.0
25%,1993.0,6.1,9.0
50%,2008.0,7.1,20.0
75%,2015.0,7.9,79.0
max,2021.0,10.0,2403003.0


#### Filtando linhas desnecessárias

In [68]:
df_imdb['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvSeries', 'tvMovie', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame'], dtype=object)

In [69]:
df_imdb=df_imdb.loc[df_imdb['titleType'].isin(['movie','short','tvMovie','tvShort'])]

In [70]:
df_imdb.shape

(447104, 7)

In [71]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 447104 entries, 0 to 1161023
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   titleType      447104 non-null  object 
 1   primaryTitle   447104 non-null  object 
 2   originalTitle  447104 non-null  object 
 3   startYear      447057 non-null  float64
 4   genres         435583 non-null  object 
 5   averageRating  447104 non-null  float64
 6   numVotes       447104 non-null  int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 27.3+ MB


In [72]:
df_imdb.head()

Unnamed: 0,titleType,primaryTitle,originalTitle,startYear,genres,averageRating,numVotes
0,short,Carmencita,Carmencita,1894.0,"Documentary,Short",5.7,1708
1,short,Le clown et ses chiens,Le clown et ses chiens,1892.0,"Animation,Short",6.1,210
2,short,Pauvre Pierrot,Pauvre Pierrot,1892.0,"Animation,Comedy,Romance",6.5,1466
3,short,Un bon bock,Un bon bock,1892.0,"Animation,Short",6.1,123
4,short,Blacksmith Scene,Blacksmith Scene,1893.0,"Comedy,Short",6.2,2268


In [73]:
df_imdb.startYear.unique()

array([1894., 1892., 1893., 1895., 1896., 1898., 1897., 1900., 1899.,
       1901., 1902., 1903., 1904., 1912., 1907., 1905., 1906., 1908.,
       1910., 1909., 1914., 1911., 1913., 1919., 1916., 1917., 1915.,
       1918., 1936., 1925., 1920., 1921., 1922., 1923., 2019., 1924.,
       1927., 1929., 2000., 1926., 1993., 1935., 2014., 1928., 2004.,
       1942., 1930., 2011., 1931., 1932., 1939., 1937., 1950., 1933.,
       1938., 1951., 1934., 1945., 1946., 1940., 1944., 1947., 1941.,
       1952., 1970., 1957., 1943., 1959., 1948., 2001., 1949., 1953.,
       2008., 1954., 1965., 1983., 1980., 1973., 1961., 1995., 1962.,
       1958., 1955., 1956., 1977., 1960., 1964., 1967., 1963., 1969.,
       1968., 1971., 1972., 1966., 2021., 1976., 1990., 1979., 1974.,
       1981., 2020., 1988., 1978., 1989., 1975., 1986., 1985., 2009.,
       1987., 2010., 2018., 1984., 1982.,   nan, 1991., 1999., 2005.,
       1998., 1992., 2002., 1994., 1996., 2017., 1997., 2016., 2006.,
       2003., 2007.,

In [74]:
df_imdb=df_imdb.dropna(subset=['startYear'])

In [75]:
df_imdb.startYear = df_imdb.startYear.astype(int)

In [76]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 447057 entries, 0 to 1161023
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   titleType      447057 non-null  object 
 1   primaryTitle   447057 non-null  object 
 2   originalTitle  447057 non-null  object 
 3   startYear      447057 non-null  int64  
 4   genres         435537 non-null  object 
 5   averageRating  447057 non-null  float64
 6   numVotes       447057 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 27.3+ MB


df_imdb.startYear.unique()

In [77]:
df_imdb.describe()

Unnamed: 0,startYear,averageRating,numVotes
count,447057.0,447057.0,447057.0
mean,1993.888357,6.391454,2024.896
std,27.064741,1.407064,25006.06
min,1874.0,1.0,5.0
25%,1981.0,5.6,10.0
50%,2005.0,6.5,26.0
75%,2014.0,7.3,126.0
max,2021.0,10.0,2403003.0


### Tratando dados duplicados e ausentes

In [78]:
df_winner = df_winner.drop_duplicates(subset=['film','winner'])
df_winner=df_winner[df_winner.winner.isin([True])]
df_winner.winner.value_counts()

NameError: name 'df_winner' is not defined

In [None]:
plt.figure(figsize=(10, 6))
sns.heatmap(df_imdb.isnull(), yticklabels=False, cbar=False, cmap='viridis').set(title='Gráfico 1 - Mapa de calor de dados ausentes - Dados IMDB');

In [None]:
df_imdb.genres.isna().mean().round(4) * 100

In [None]:
df_imdb.duplicated().sum()

In [None]:
df_imdb.loc[df_imdb.duplicated(), :]

#### Salvando Variáveis em Arquivo

In [None]:
os.chdir("../tratados")
os.getcwd()

In [None]:
with open('imdb.pkl', mode = 'wb') as f:
  pickle.dump(df_imdb, f)