# In [None]:
import pandas as pd
import ast

# Flatten a column whose cells hold lists and/or dicts (or their string repr)
def desanidar_columna(df, columna, key=None):
    """Flatten the nested values of ``df[columna]`` into plain values.

    Each cell is handled independently:

    * A string is first parsed with ``ast.literal_eval``; if parsing fails
      the cell becomes ``None``.
    * A list becomes a ``', '``-joined string. With a truthy ``key`` only the
      dict items contribute, each through ``str(item.get(key, ''))``;
      without a key every item contributes ``str(item)``.
    * A dict becomes ``dict.get(key)`` when ``key`` is truthy, otherwise
      ``str(dict)``.
    * Anything else becomes ``None``.

    Args:
        df: DataFrame that contains the column.
        columna: Name of the column to flatten.
        key: Optional dictionary key to extract from each dict.

    Returns:
        A Series with one flattened value per row of ``df[columna]``.
    """
    def _flatten(raw):
        value = raw
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: nothing to flatten.
                return None

        if isinstance(value, dict):
            return value.get(key) if key else str(value)
        if not isinstance(value, list):
            return None

        if key:
            parts = [str(item.get(key, '')) for item in value if isinstance(item, dict)]
        else:
            parts = [str(item) for item in value]
        return ', '.join(parts)

    return df[columna].apply(_flatten)

# Explicit dtypes for the numeric movie columns, to reduce memory use.
# NOTE(review): with these dtypes read_csv is expected to raise if one of the
# columns holds a non-numeric value, or a missing value in the int32 column —
# confirm the CSV is clean before relying on this.
dtype_dict = {
    'budget': 'float32',
    'popularity': 'float32',
    'revenue': 'float32',
    'vote_average': 'float32',
    'vote_count': 'int32'
}

# Load the movies data
movies_df = pd.read_csv('datasets/movies_dataset.csv', dtype=dtype_dict)


def _collection_id(raw):
    """Return the integer ``id`` of a stringified collection dict, else None.

    The cell is parsed once. Non-string cells, unparseable strings, empty or
    non-dict literals, and dicts without a usable ``'id'`` all yield None
    instead of raising.
    """
    if not isinstance(raw, str):
        return None
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return None
    if not parsed or not isinstance(parsed, dict):
        return None
    try:
        return int(parsed['id'])
    except (KeyError, TypeError, ValueError):
        return None


def _first_actors(names, limit=5):
    """Keep the first ``limit`` names of a ``', '``-joined string.

    Non-string input (e.g. the None produced for an unparseable cast cell)
    is passed through as None rather than raising AttributeError.
    """
    if not isinstance(names, str):
        return None
    return ', '.join(names.split(', ')[:limit])


# Transformations on movies_df
movies_df['belongs_to_collection'] = movies_df['belongs_to_collection'].apply(_collection_id)
movies_df['genres'] = desanidar_columna(movies_df, 'genres', 'name')
movies_df['production_companies'] = desanidar_columna(movies_df, 'production_companies', 'name')
movies_df['production_countries'] = desanidar_columna(movies_df, 'production_countries', 'iso_3166_1')
movies_df['spoken_languages'] = desanidar_columna(movies_df, 'spoken_languages', 'iso_639_1')

# Make the movies 'id' numeric; unparseable ids become <NA> (nullable Int64)
movies_df['id'] = pd.to_numeric(movies_df['id'], errors='coerce').astype('Int64')

# Load the credits data
credits_df = pd.read_csv('datasets/credits.csv', dtype={'id': 'Int64'})

# Flatten the cast column into a ', '-joined string of names
credits_df['cast'] = desanidar_columna(credits_df, 'cast', 'name')

# Actors: the first five names of the flattened cast.
# NOTE(review): a name that itself contains ', ' would be split here — verify
# against the data if exact names matter.
credits_df['actors'] = credits_df['cast'].apply(_first_actors)

def extract_directors(crew_str):
    """Return the names of the crew members whose job is ``'Director'``.

    Args:
        crew_str: String holding the Python-literal representation of a list
            of crew dicts (each with at least ``'job'`` and ``'name'``).

    Returns:
        The director names joined with ``', '`` (an empty string when the
        list has no director), or ``None`` when ``crew_str`` cannot be parsed
        or does not describe a list.
    """
    # Only the parse sits inside the try, so errors in the filtering logic
    # below are never silently swallowed.
    try:
        crew_list = ast.literal_eval(crew_str)
    except (ValueError, SyntaxError, TypeError):
        return None

    if not isinstance(crew_list, (list, tuple)):
        return None

    directors = []
    for member in crew_list:
        # Skip malformed entries instead of raising AttributeError/KeyError.
        if not isinstance(member, dict) or member.get('job') != 'Director':
            continue
        name = member.get('name')
        if name is not None:
            directors.append(str(name))
    return ', '.join(directors)

credits_df['directors'] = credits_df['crew'].apply(extract_directors)

# Keep only the columns needed for the merge. The explicit copy makes
# credits_df an independent frame, so the assignment below does not write
# into a slice of the original (SettingWithCopyWarning).
credits_df = credits_df[['id', 'actors', 'directors']].copy()

# Make sure both 'id' columns share the nullable Int64 dtype before merging
movies_df['id'] = movies_df['id'].astype('Int64')
credits_df['id'] = pd.to_numeric(credits_df['id'], errors='coerce').astype('Int64')

# Join the datasets.
# NOTE(review): if credits_df holds repeated ids, a left merge repeats the
# movie row once per match — verify whether credits needs de-duplicating.
merged_df = pd.merge(movies_df, credits_df, on='id', how='left')

# Additional transformations on merged_df
# revenue / budget: missing or unparseable values count as 0
for col in ('revenue', 'budget'):
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce').fillna(0)

# Parse the dates first and drop afterwards, so rows whose release_date is
# missing OR cannot be parsed (coerced to NaT) are both removed.
merged_df['release_date'] = pd.to_datetime(merged_df['release_date'], format='%Y-%m-%d', errors='coerce')
merged_df = merged_df.dropna(subset=['release_date']).copy()
merged_df['release_year'] = merged_df['release_date'].dt.year.astype('Int64')

# Cast the numeric columns to their target dtypes AFTER the transformations
numeric_columns = ['budget', 'popularity', 'revenue', 'vote_average', 'vote_count']
for col in numeric_columns:
    converted = pd.to_numeric(merged_df[col], errors='coerce')
    if dtype_dict[col] == 'int32':
        # A plain integer dtype cannot hold NaN, so fill before casting.
        converted = converted.fillna(0)
    merged_df[col] = converted.astype(dtype_dict[col])

# Return on investment: revenue / budget where budget > 0, otherwise 0.
# Vectorised instead of a row-wise apply; the division is done in float64.
budget = merged_df['budget'].astype('float64')
revenue = merged_df['revenue'].astype('float64')
merged_df['return'] = (revenue / budget).where(budget > 0, 0)

# Drop unused columns
columns_to_drop = ['video', 'imdb_id', 'adult', 'original_title', 'poster_path', 'homepage']
merged_df = merged_df.drop(columns=columns_to_drop)

# Store 'belongs_to_collection' as a nullable integer so missing ids stay <NA>
merged_df['belongs_to_collection'] = merged_df['belongs_to_collection'].astype('Int64')

# Save the cleaned dataset in Parquet format
merged_df.to_parquet('rootParaGit/dataset_limpio.parquet', index=False)
