In [31]:
import pandas as pd
import numpy as np
import os

In [32]:
# Folder locations, given as relative paths: RAW_PATH is where the input CSV
# files are read from; PROCESSED_PATH is the sibling "processed" folder.
RAW_PATH = "../dados/raw/"
PROCESSED_PATH = "../dados/processed/"

In [33]:
# Load the raw CSV files into DataFrames.
movies_csv = os.path.join(RAW_PATH, "movies_metadata.csv")
credits_csv = os.path.join(RAW_PATH, "credits.csv")

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
movies = pd.read_csv(movies_csv, low_memory=False)
credits = pd.read_csv(credits_csv)
# ratings.csv is currently not loaded; the call is kept here for reference:
# ratings = pd.read_csv(os.path.join(RAW_PATH, "ratings.csv"))

# Quick sanity check: (rows, columns) of each table.
print("Movies shape:", movies.shape)
print("Credits shape:", credits.shape)
# print("Ratings shape:", ratings.shape)

Movies shape: (45466, 24)
Credits shape: (45476, 3)


## Movies

### Selecionando Colunas

In [34]:
# List every column of the raw movies table before picking a subset.
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [35]:
# Columns of interest: everything else in the raw movies table is discarded.
cols_interesse = [
    "id", "title", "release_date", "runtime", "budget", "revenue",
    "popularity", "vote_average", "vote_count", "original_language", "status"
]
# .copy() detaches the subset from the raw frame, so the column assignments
# made in later cells operate on an independent DataFrame and do not trigger
# pandas' SettingWithCopyWarning.
movies = movies[cols_interesse].copy()
# Show dtypes and non-null counts of the reduced table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 45466 non-null  object 
 1   title              45460 non-null  object 
 2   release_date       45379 non-null  object 
 3   runtime            45203 non-null  float64
 4   budget             45466 non-null  object 
 5   revenue            45460 non-null  float64
 6   popularity         45461 non-null  object 
 7   vote_average       45460 non-null  float64
 8   vote_count         45460 non-null  float64
 9   original_language  45455 non-null  object 
 10  status             45379 non-null  object 
dtypes: float64(4), object(7)
memory usage: 3.8+ MB


### Conversão de tipos

In [36]:
# Probe whether "id" can be cast to int as-is. A few rows hold date strings
# instead of numeric ids, so the cast raises ValueError. The error is caught
# and printed so the notebook keeps running from top to bottom; `movies` is
# left untouched here, and the real type conversion is performed in a later
# cell once the offending rows have been filtered out.
try:
    movies["id"].astype(int)
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: invalid literal for int() with base 10: '1997-08-20'

In [37]:
# Look for non-numeric values in the "id" column: a row is flagged when its
# id, rendered as text, is not made up solely of digits.
id_is_numeric = movies["id"].astype(str).str.isdigit()
invalid_ids = movies.loc[~id_is_numeric]

print("Linhas com id inválido:", len(invalid_ids))
display(invalid_ids.loc[:, ["id", "title", "release_date"]].head(10))

Linhas com id inválido: 3


Unnamed: 0,id,title,release_date
19730,1997-08-20,,1
29503,2012-09-29,,12
35587,2014-01-01,,22


In [38]:
# Keep only the rows whose id is entirely numeric, then cast each column to
# its proper dtype in a single chained step. Values that cannot be parsed as
# a date or a number become NaT/NaN (errors="coerce") instead of raising.
numeric_id_rows = movies["id"].astype(str).str.isdigit()
movies = movies.loc[numeric_id_rows].assign(
    id=lambda df: df["id"].astype(int),
    release_date=lambda df: pd.to_datetime(df["release_date"], errors="coerce"),
    budget=lambda df: pd.to_numeric(df["budget"], errors="coerce"),
    revenue=lambda df: pd.to_numeric(df["revenue"], errors="coerce"),
    popularity=lambda df: pd.to_numeric(df["popularity"], errors="coerce"),
)
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45463 entries, 0 to 45465
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 45463 non-null  int64         
 1   title              45460 non-null  object        
 2   release_date       45376 non-null  datetime64[ns]
 3   runtime            45203 non-null  float64       
 4   budget             45463 non-null  int64         
 5   revenue            45460 non-null  float64       
 6   popularity         45460 non-null  float64       
 7   vote_average       45460 non-null  float64       
 8   vote_count         45460 non-null  float64       
 9   original_language  45452 non-null  object        
 10  status             45379 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 4.2+ MB


### Valores Ausentes

In [39]:
# Keep only rows with no missing value in any column, then rebuild a
# contiguous 0..n-1 index.
rows_complete = movies.notna().all(axis=1)
movies = movies.loc[rows_complete].reset_index(drop=True)
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45043 entries, 0 to 45042
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 45043 non-null  int64         
 1   title              45043 non-null  object        
 2   release_date       45043 non-null  datetime64[ns]
 3   runtime            45043 non-null  float64       
 4   budget             45043 non-null  int64         
 5   revenue            45043 non-null  float64       
 6   popularity         45043 non-null  float64       
 7   vote_average       45043 non-null  float64       
 8   vote_count         45043 non-null  float64       
 9   original_language  45043 non-null  object        
 10  status             45043 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 3.8+ MB


In [40]:
# Summary statistics of the movies table after rows with missing values were dropped.
movies.describe()

Unnamed: 0,id,release_date,runtime,budget,revenue,popularity,vote_average,vote_count
count,45043.0,45043,45043.0,45043.0,45043.0,45043.0,45043.0,45043.0
mean,107219.271985,1992-05-02 06:29:57.802100224,94.231623,4263376.0,11313040.0,2.946408,5.637648,110.887996
min,2.0,1874-12-09 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0
25%,26231.0,1978-09-14 00:00:00,85.0,0.0,0.0,0.399826,5.0,3.0
50%,59231.0,2001-08-17 00:00:00,95.0,0.0,0.0,1.143067,6.0,10.0
75%,153883.5,2010-12-13 12:00:00,107.0,0.0,0.0,3.737918,6.8,35.0
max,469172.0,2020-12-16 00:00:00,1256.0,380000000.0,2787965000.0,547.488298,10.0,14075.0
std,111636.35112,,38.309228,17500470.0,64620290.0,6.027265,1.896656,493.470332


In [41]:
# Restrict the check to the numeric columns.
num_cols = movies.select_dtypes(include="number").columns

# For each numeric column, count the rows whose value is exactly 0,
# listing the largest counts first.
zero_counts = movies[num_cols].eq(0).sum().sort_values(ascending=False)

print("Contagem de registros com valor 0 em cada coluna numérica:\n")
print(zero_counts)

Contagem de registros com valor 0 em cada coluna numérica:

revenue         37643
budget          36170
vote_average     2828
vote_count       2730
runtime          1517
popularity         38
id                  0
dtype: int64


In [42]:
# budget and revenue are 0 in the large majority of rows (see the zero counts
# printed above), so both columns are removed from the table.
movies = movies.drop(["budget", "revenue"], axis="columns")

In [43]:
# Keep a row only if none of these numeric columns is 0, then renumber the
# index from 0.
num_cols = ["runtime", "popularity", "vote_average", "vote_count"]
mask_valid = movies[num_cols].ne(0).all(axis="columns")
movies = movies.loc[mask_valid].reset_index(drop=True)

In [44]:
# Re-check the summary statistics now that zero-valued rows have been removed.
movies.describe()

Unnamed: 0,id,release_date,runtime,popularity,vote_average,vote_count
count,41034.0,41034,41034.0,41034.0,41034.0,41034.0
mean,99439.79768,1992-07-06 21:31:50.279280640,98.127236,3.199024,6.022969,121.447897
min,2.0,1874-12-09 00:00:00,1.0,1e-06,0.5,1.0
25%,23920.25,1979-02-09 00:00:00,87.0,0.511563,5.3,4.0
50%,52849.5,2001-10-02 12:00:00,96.0,1.331697,6.1,12.0
75%,132713.5,2010-12-26 12:00:00,108.0,4.315517,6.9,40.0
max,468707.0,2017-09-14 00:00:00,1256.0,547.488298,10.0,14075.0
std,108148.847369,,34.6577,6.244917,1.240943,515.787391


### Outliers

## Credits

In [10]:
# Show the credits table's columns, dtypes and non-null counts.
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [11]:
# Preview the first three rows of the credits table.
credits.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
