**Data Engineering**

[Repositorio](https://github.com/soyHenry/PI_ML_OPS/tree/PT?tab=readme-ov-file)

**Se define el autoguardado cada 60 segundos**

In [None]:
autosave 60

**Se importan las librerías necesarias para el proyecto**

In [2]:
import pandas as pd
import numpy as np
import ast

In [None]:
# Load the datasets.
# 'credits.parquet' is a Parquet file, so it must be read with read_parquet;
# read_csv would try to parse the binary file as text.
df_movies = pd.read_csv('movies_dataset.csv')
df_credits = pd.read_parquet('credits.parquet')

In [None]:
# Display the freshly loaded movies frame (the cell's last expression is rendered)
df_movies

In [None]:
# Remove the columns that are not used in the rest of the notebook
columns_to_drop = [
    'video',
    'imdb_id',
    'adult',
    'original_title',
    'poster_path',
    'homepage',
]
df_movies = df_movies.drop(columns=columns_to_drop)


In [None]:
# Replace missing 'revenue' and 'budget' values with 0
df_movies = df_movies.fillna({'revenue': 0, 'budget': 0})

In [None]:
# Keep only the rows whose 'release_date' is not null
df_movies = df_movies[df_movies['release_date'].notna()]

In [None]:
# Parse 'release_date' into datetimes (displayed as YYYY-mm-dd). Values that cannot be
# parsed become NaT because of errors='coerce', and those rows are then dropped.
# .assign() builds a new frame instead of writing a column into the frame produced by
# the earlier row filtering, which can trigger pandas' SettingWithCopyWarning.
df_movies = (
    df_movies
    .assign(release_date=pd.to_datetime(df_movies['release_date'], errors='coerce'))
    .dropna(subset=['release_date'])
)

In [None]:
# Create the 'release_year' column from the year component of 'release_date'.
# .assign() returns a new frame rather than writing into the frame left by the
# preceding row filtering, which can trigger pandas' SettingWithCopyWarning.
df_movies = df_movies.assign(release_year=df_movies['release_date'].dt.year)

In [None]:
# Cast 'budget' to an integer dtype so it can be compared and divided numerically
df_movies = df_movies.astype({'budget': 'int'})

# Create the 'return' column: revenue / budget, or 0 when the budget is not positive.
# The division is done on whole columns; .where() keeps the quotient only where
# budget > 0 and puts 0 everywhere else (including the divide-by-zero results).
df_movies['return'] = (
    df_movies['revenue']
    .div(df_movies['budget'])
    .where(df_movies['budget'] > 0, 0)
)

In [None]:

# Unnest every nested column (stored as stringified Python literals) into flat columns.
# The output has one row per (genre, company, language, country) combination of each movie.
NESTED_COLUMNS = [
    'belongs_to_collection',
    'genres',
    'production_companies',
    'spoken_languages',
    'production_countries',
]


def _parse_nested_list(value, placeholder):
    """Parse a stringified list of dicts, never returning an empty list.

    Args:
        value: Cell content; a string holding a Python list literal, or a null.
        placeholder: Dict used as the single element when the cell is null or
            the parsed list is empty.

    Returns:
        The parsed list, or ``[placeholder]`` when there is nothing to iterate.
        Without the placeholder, one empty list would make the cartesian product
        empty and the movie would silently disappear from the output.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` for malformed text.
    """
    items = ast.literal_eval(value) if pd.notna(value) else []
    return items or [placeholder]


datos = []
for indice, fila in df_movies.iterrows():
    # Unnest belongs_to_collection (a single dict, or null)
    collection_id = None
    collection_name = None
    if pd.notna(fila['belongs_to_collection']):
        collection_data = ast.literal_eval(fila['belongs_to_collection'])
        collection_id = collection_data['id']
        collection_name = collection_data['name']

    # Unnest the list-valued columns; missing/empty lists become one all-None entry
    genres_list = _parse_nested_list(fila['genres'], {'id': None, 'name': None})
    companies_list = _parse_nested_list(fila['production_companies'], {'id': None, 'name': None})
    languages_list = _parse_nested_list(fila['spoken_languages'], {'iso_639_1': None, 'name': None})
    countries_list = _parse_nested_list(fila['production_countries'], {'iso_3166_1': None, 'name': None})

    # Loop-invariant: the movie's non-nested columns are identical for every combination,
    # so they are extracted once per movie instead of once per combination
    base = dict(fila.drop(NESTED_COLUMNS))

    # Emit one record per combination of all attributes
    for genre in genres_list:
        for company in companies_list:
            for language in languages_list:
                for country in countries_list:
                    datos.append({
                        'collection_id': collection_id,
                        'collection_name': collection_name,
                        'genre_id': genre['id'],
                        'genre_name': genre['name'],
                        'company_id': company['id'],
                        'company_name': company['name'],
                        'language_iso': language['iso_639_1'],
                        'language_name': language['name'],
                        'country_iso': country['iso_3166_1'],
                        'country_name': country['name'],
                        **base,
                    })

df_movies = pd.DataFrame(datos)

In [None]:
# Convert 'popularity' to a numeric dtype first (unparseable values become NaN), then
# replace every missing value with 0. Filling before the conversion would leave the
# NaN values introduced by errors='coerce' in place, and calling fillna(inplace=True)
# on a column selection is not guaranteed to modify df_movies itself.
df_movies['popularity'] = pd.to_numeric(df_movies['popularity'], errors='coerce').fillna(0)

In [None]:
# Display the unnested movies frame (the cell's last expression is rendered)
df_movies

In [None]:
# Write the processed movies frame to 'movies.parquet' without the index
df_movies.to_parquet("movies.parquet", index=False)

In [3]:
# Reload df_movies from the 'movies.parquet' file
df_movies = pd.read_parquet('movies.parquet')

**CREDITS**

In [None]:
# Summary statistics of the credits frame. describe must be called; without the
# parentheses the cell only shows the bound method's repr.
df_credits.describe()

In [None]:
# Flatten the 'cast' and 'crew' columns into one record per credited person
datos = []
for indice, fila in df_credits.iterrows():
    # Parse each stringified list; a null cell is treated as an empty list
    cast_list = ast.literal_eval(fila['cast']) if pd.notna(fila['cast']) else []
    crew_list = ast.literal_eval(fila['crew']) if pd.notna(fila['crew']) else []

    # One record per cast member, tagged with the movie it belongs to
    datos.extend(
        {
            'type': 'cast',
            'cast_id': persona['cast_id'],
            'character': persona['character'],
            'credit_id': persona['credit_id'],
            'gender': persona['gender'],
            'actor_id': persona['id'],
            'name': persona['name'],
            'order': persona['order'],
            'profile_path': persona['profile_path'],
            'movie_id': fila['id'],
        }
        for persona in cast_list
    )

    # One record per crew member, tagged with the movie it belongs to
    datos.extend(
        {
            'type': 'crew',
            'credit_id': persona['credit_id'],
            'department': persona['department'],
            'gender': persona['gender'],
            'crew_id': persona['id'],
            'job': persona['job'],
            'name': persona['name'],
            'profile_path': persona['profile_path'],
            'movie_id': fila['id'],
        }
        for persona in crew_list
    )

df_desanidado = pd.DataFrame(datos)

In [None]:
# Directors: crew records whose job is 'Director', one row per (name, movie)
es_director = df_desanidado['type'].eq('crew') & df_desanidado['job'].eq('Director')
directores = df_desanidado.loc[es_director, ['name', 'movie_id']].drop_duplicates()

# Actors: every cast record, with the character played
es_actor = df_desanidado['type'].eq('cast')
actores = df_desanidado.loc[es_actor, ['name', 'character', 'movie_id']]

# Pair each director with the actors of the same movie; the clashing 'name'
# columns are disambiguated by the suffixes
df_final = directores.merge(actores, on='movie_id', suffixes=('_director', '_actor'))

# Keep only the required columns
df_credits = df_final[['name_director', 'name_actor', 'character']]

In [None]:
# Write the current df_credits frame to 'creditsP.parquet' without the index
df_credits.to_parquet("creditsP.parquet", index=False)

In [None]:
# Reload df_credits from the 'creditsP.parquet' file
df_credits = pd.read_parquet('creditsP.parquet')

Se convertirá el archivo original 'credits.csv' a formato Parquet para poder subirlo a GitHub.

In [6]:
# Read the original credits CSV and write it back out as 'credits.parquet' without the index.
# NOTE(review): this rebinds df_credits to the raw CSV contents, and the loading cell near
# the top reads 'credits.parquet' — presumably this cell has to be run once before a
# fresh top-to-bottom run; confirm the intended order.
df_credits = pd.read_csv('credits.csv')
df_credits.to_parquet("credits.parquet", index=False)