In [None]:
# CLEAN + FEATURE ENGINEERING
import ast
import pandas as pd
import numpy as np
from datetime import datetime

# Work on a copy so the loaded `movies_metadata` frame is never mutated.
# 1) Remove fully identical rows first, then (when an `id` column exists)
#    rows that repeat an id.
df = movies_metadata.copy().drop_duplicates()
if 'id' in df:
    df = df.drop_duplicates(subset='id')

# 2) Coerce the numeric columns; values that cannot be parsed become NaN.
for col in ('budget', 'revenue', 'runtime', 'popularity', 'vote_average', 'vote_count'):
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Treat a budget/revenue of 0 as missing (optional - many zeros mean
# "not reported"). The `<col>_missing` flag is computed before the zeros are
# overwritten so it still records which rows were 0 or NaN.
for col in ('budget', 'revenue'):
    is_zero = df[col] == 0
    df[f'{col}_missing'] = is_zero | df[col].isna()
    # Series.mask returns a new (float-upcast if needed) series, avoiding the
    # `.loc[...] = np.nan` setitem that pandas deprecates on integer columns.
    df[col] = df[col].mask(is_zero)

# 3) Release date and age of the movie.
# Year that `movie_age` is measured against. Kept as a named constant so it
# can be updated in one place (or swapped for the current year).
REFERENCE_YEAR = 2025

# Unparseable dates become NaT, so the derived year/month/age are NaN there.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['movie_age'] = REFERENCE_YEAR - df['release_year']

# 4) Genre extraction (main genre, genre count; multi-hot can be built from the list)
def parse_genres(x):
    """Return the genre names found in a stringified list of genre dicts.

    `x` is evaluated with ``ast.literal_eval``; every element that is a dict
    containing a ``'name'`` key contributes that key's value, in order.
    Any failure while parsing or iterating yields an empty list.
    """
    names = []
    try:
        for entry in ast.literal_eval(x):
            if isinstance(entry, dict) and 'name' in entry:
                names.append(entry['name'])
    except Exception:
        # Best effort: malformed input contributes no genres at all.
        return []
    return names

# Parse each row's genres (a missing value is treated as an empty list).
df['genre_list'] = df['genres'].fillna('[]').map(parse_genres)
# Main genre is the first listed one ('Unknown' when there is none);
# n_genres is how many genres the row has.
df['main_genre'] = df['genre_list'].map(lambda names: names[0] if names else 'Unknown')
df['n_genres'] = df['genre_list'].map(len)

# 5) Director extraction from the crew column of the credits dataset.
# Lookup table: credits id -> raw crew value (expected to be a JSON-like
# string). When an id repeats, the last row wins.
credits_map = dict(zip(credits['id'], credits['crew']))
def get_director(crew_str):
    """Return the director's name from a stringified crew list, or None.

    `crew_str` is expected to be the text form of a Python list of dicts,
    each optionally carrying ``'job'``, ``'department'`` and ``'name'`` keys.

    A crew member whose ``job`` is exactly ``'Director'`` always wins. Only
    when no such entry exists is the first member of the ``'Directing'``
    department used as a fallback, so that an assistant director or script
    supervisor listed earlier cannot shadow the actual director.

    Returns None when the input cannot be parsed, is not a list/tuple, or
    contains no matching crew member.
    """
    try:
        crew = ast.literal_eval(crew_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The exceptions ast.literal_eval documents for malformed input.
        return None

    if not isinstance(crew, (list, tuple)):
        return None

    fallback = None
    for person in crew:
        if not isinstance(person, dict):
            continue  # ignore stray non-dict entries instead of giving up
        if person.get('job') == 'Director':
            return person.get('name')
        if fallback is None and person.get('department') == 'Directing':
            fallback = person

    return fallback.get('name') if fallback is not None else None

# Only look directors up when the credits mapping actually has entries.
# NOTE(review): the lookup only matches if df['id'] and the credits ids share
# the same dtype/values - confirm; otherwise every director comes out missing.
if not credits_map:
    df['director'] = None
    df['has_director'] = False
else:
    crew_raw = df['id'].map(credits_map)  # NaN for ids absent from credits
    df['crew_str'] = crew_raw
    df['director'] = crew_raw.fillna('[]').apply(get_director)
    df['has_director'] = df['director'].notna()

# 6) Top-cast count (optional): how many cast entries a movie has, capped.
# Only meaningful when the credits data provides a cast field.
def count_actors(cast_str, top_n=3):
    """Return the number of entries in a stringified cast list, capped at `top_n`.

    `cast_str` is evaluated with ``ast.literal_eval``. Any failure while
    parsing, measuring or comparing yields 0.
    """
    try:
        n_members = len(ast.literal_eval(cast_str))
        capped = top_n if top_n < n_members else n_members
    except Exception:
        # Best effort: an unreadable cast counts as no actors.
        return 0
    return capped

# With no credits rows there is nothing to count; otherwise map each movie id
# to its raw cast value and count the entries (capped at 3).
if credits.empty:
    df['n_top_cast'] = 0
else:
    # id -> raw cast value; when an id repeats, the last row wins.
    credits_cast_map = dict(zip(credits['id'], credits['cast']))
    df['cast_str'] = df['id'].map(credits_cast_map)
    # Ids without a credits row yield NaN, which is treated as an empty cast.
    df['n_top_cast'] = df['cast_str'].fillna('[]').apply(count_actors, top_n=3)

# 7) Transformations: log1p for the skewed numeric columns.
# Missing values are filled with 0 first, so they come out as log1p(0) == 0
# (adjust if a different imputation is wanted).
for col in ('budget', 'revenue', 'popularity', 'vote_count'):
    if col in df.columns:
        df[f'{col}_log1p'] = np.log1p(df[col].fillna(0))

# 8) Classification target: high_rating.
# The cut-off lives in a named constant so it is configurable in one place.
HIGH_RATING_THRESHOLD = 7.0
# NOTE(review): NaN >= threshold evaluates to False, so movies without a
# vote_average are labeled 0 - confirm that is the intended treatment.
df['high_rating'] = (df['vote_average'] >= HIGH_RATING_THRESHOLD).astype(int)

# 9) Discard columns that are not fed directly to the models (free text,
#    URLs, and the raw crew/cast helper columns). Only the ones actually
#    present are dropped.
drop_cols = ['overview','poster_path','homepage','status','tagline','crew_str','cast_str']
df.drop(columns=[name for name in drop_cols if name in df.columns], inplace=True)

# Final preview (`display` is presumably the notebook's rich-output helper).
display(df.head())
print("Filas finales:", len(df))
