In [1]:
# ══════════════════════════════════════════════════════════════════
# NOTEBOOK 03b: PROCESAMIENTO CON SPARK
# Disney Data Pipeline - Fase 3
# ══════════════════════════════════════════════════════════════════

import os
import sys
import pickle
import json
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np

import boto3
from botocore.exceptions import ClientError, NoCredentialsError
from dotenv import load_dotenv

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Confirm the import cell ran and stamp the run with the local wall-clock time.
_run_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("✅ Imports completados")
print(f"📅 {_run_started_at}")

✅ Imports completados
📅 2025-10-17 19:41:37


In [2]:
# ══════════════════════════════════════════════════════════════════
# CELDA 2: CARGAR VARIABLES DE AMBIENTE
# ══════════════════════════════════════════════════════════════════

# Load variables from a .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Required environment variables mapped to the label shown in the report.
required = {
    'AWS_ACCESS_KEY_ID': 'AWS Access Key',
    'AWS_SECRET_ACCESS_KEY': 'AWS Secret Key',
    'AWS_DEFAULT_REGION': 'AWS Region',
    'S3_BUCKET_NAME': 'S3 Bucket'
}

# Variables whose value is a secret and must never be echoed, not even in part:
# cell outputs are stored inside the notebook file and travel with it.
_FULLY_HIDDEN_VARS = frozenset({'AWS_SECRET_ACCESS_KEY'})


def _display_value(var, value):
    """Return a representation of ``value`` that is safe to print.

    Args:
        var: Name of the environment variable.
        value: Its non-empty string value.

    Returns:
        A fixed-width mask for secrets; a head/tail preview for any other
        variable whose name contains ``KEY`` (fully masked when the value is
        too short for the preview to hide anything); the value itself otherwise.
    """
    if var in _FULLY_HIDDEN_VARS:
        return '*' * 8
    if 'KEY' in var:
        # A value of 8 characters or fewer would be shown in full by the preview.
        if len(value) <= 8:
            return '*' * len(value)
        return f"{value[:4]}...{value[-4:]}"
    return value


print("🔍 Verificando configuración:\n")
missing = []

for var, desc in required.items():
    value = os.getenv(var)
    if value:
        print(f"✅ {desc:20} : {_display_value(var, value)}")
    else:
        # Unset and empty variables are both reported as missing.
        print(f"❌ {desc:20} : NO CONFIGURADA")
        missing.append(var)

# Stop the notebook early: every later cell depends on these settings.
if missing:
    raise EnvironmentError(f"❌ Faltan variables: {missing}")

print("\n✅ Variables configuradas correctamente")

🔍 Verificando configuración:

✅ AWS Access Key       : AKIA...63PU
✅ AWS Secret Key       : ********
✅ AWS Region           : us-west-1
✅ S3 Bucket            : xideralaws-curso-fernanda

✅ Variables configuradas correctamente


In [3]:
# ══════════════════════════════════════════════════════════════════
# CELDA 3: CREAR SESIÓN AWS SEGURA
# ══════════════════════════════════════════════════════════════════

def get_aws_session():
    """Create a boto3 session from environment credentials and validate it.

    Credentials and region are read from ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY`` and ``AWS_DEFAULT_REGION``. The session is
    checked with an STS ``get_caller_identity`` call before it is returned,
    and the account id and region are printed.

    Returns:
        The validated ``boto3.Session``.

    Raises:
        RuntimeError: If no credentials are found or the STS call is
            rejected. The underlying botocore exception is attached as
            ``__cause__`` so the original traceback is not lost.
    """
    try:
        session = boto3.Session(
            aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
            aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
            region_name=os.getenv('AWS_DEFAULT_REGION')
        )

        # Creating a Session does not contact AWS; this call is what actually
        # proves the credentials are usable.
        sts = session.client('sts')
        identity = sts.get_caller_identity()

        print("✅ Sesión AWS creada")
        print(f"   Account ID: {identity['Account']}")
        print(f"   Region: {session.region_name}")

        return session

    except NoCredentialsError as e:
        raise RuntimeError("❌ Credenciales AWS no encontradas") from e
    except ClientError as e:
        raise RuntimeError(f"❌ Error de autenticación: {e}") from e

# Build the session once and derive the S3 client used by the upload helper.
aws_session = get_aws_session()
s3_client = aws_session.client('s3')

# S3 layout: the bucket name comes from the environment, and each pipeline
# stage writes under its own key prefix.
S3_BUCKET = os.environ.get('S3_BUCKET_NAME')
S3_RAW_PREFIX, S3_CLEANED_PREFIX, S3_FINAL_PREFIX = (
    'disney-project/raw',
    'disney-project/cleaned',
    'disney-project/final',
)

print(f"\n✅ S3 configurado: {S3_BUCKET}")

✅ Sesión AWS creada
   Account ID: 020635523025
   Region: us-west-1

✅ S3 configurado: xideralaws-curso-fernanda


In [4]:
# ══════════════════════════════════════════════════════════════════
# CELDA 4: FUNCIÓN PARA SUBIR ARCHIVOS A S3
# ══════════════════════════════════════════════════════════════════

def upload_to_s3(local_file, s3_key):
    """Upload a local file to the configured bucket with AES256 server-side encryption.

    This helper never raises: any exception from the upload is converted into
    an error string, so callers must inspect the returned message.

    Args:
        local_file: Path of the file to upload.
        s3_key: Destination object key inside ``S3_BUCKET``.

    Returns:
        A message starting with "✅" and the ``s3://`` URI on success, or a
        message starting with "❌" and the exception text on failure.
    """
    encryption_args = {'ServerSideEncryption': 'AES256'}
    try:
        s3_client.upload_file(
            Filename=local_file,
            Bucket=S3_BUCKET,
            Key=s3_key,
            ExtraArgs=encryption_args,
        )
        destination = f"s3://{S3_BUCKET}/{s3_key}"
        return f"✅ Subido: {destination}"
    except Exception as upload_error:
        return f"❌ Error: {upload_error}"


print("✅ Función upload_to_s3 definida")

✅ Función upload_to_s3 definida


In [5]:
# ══════════════════════════════════════════════════════════════════
# CELDA 5: CREAR SPARK SESSION
# ══════════════════════════════════════════════════════════════════

print("🔥 Iniciando Spark Session...\n")

# Session settings, applied to the builder in this order.
_spark_settings = {
    "spark.driver.memory": "4g",
    "spark.executor.memory": "2g",
    "spark.sql.shuffle.partitions": "8",
    "spark.sql.adaptive.enabled": "true",
}

_builder = SparkSession.builder.appName("Disney Data Pipeline - Fase 3")
for _option, _setting in _spark_settings.items():
    _builder = _builder.config(_option, _setting)

# getOrCreate reuses an already-running session instead of starting a second one.
spark = _builder.getOrCreate()

_spark_context = spark.sparkContext
print("✅ Spark Session creada")
print(f"   Versión: {spark.version}")
print(f"   App Name: {_spark_context.appName}")
print(f"   Master: {_spark_context.master}")

🔥 Iniciando Spark Session...



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/17 19:44:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✅ Spark Session creada
   Versión: 4.0.1
   App Name: Disney Data Pipeline - Fase 3
   Master: local[*]


In [6]:
# ══════════════════════════════════════════════════════════════════
# CELDA 6: CARGAR DATOS DE FASE 2
# ══════════════════════════════════════════════════════════════════

print("📦 Cargando datos de Fase 2...\n")
print("=" * 80)

_phase2_path = Path('datos_fase2.pkl')
# Entries this notebook reads from the Phase 2 pickle.
_phase2_required_keys = ('df_movies_clean', 'df_characters_clean', 'df_relations')

# Fail early with instructions when the Phase 2 notebook has not been run.
if not _phase2_path.exists():
    raise FileNotFoundError(
        "❌ No se encontró 'datos_fase2.pkl'\n"
        "   Ejecuta primero: 02_limpieza_transformacion.ipynb"
    )

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load a datos_fase2.pkl produced by your own run of the Phase 2 notebook.
with _phase2_path.open('rb') as f:
    datos_fase2 = pickle.load(f)

print("✅ Pickle cargado")
print(f"\n📋 Contenido:")
for key in datos_fase2.keys():
    print(f"   - {key}")

# Report every missing entry at once instead of failing on the first lookup.
_phase2_missing = [name for name in _phase2_required_keys if name not in datos_fase2]
if _phase2_missing:
    raise KeyError(f"❌ Faltan claves en 'datos_fase2.pkl': {_phase2_missing}")

# Pull out the three pandas DataFrames used by the rest of the notebook.
df_movies_pandas = datos_fase2['df_movies_clean']
df_characters_pandas = datos_fase2['df_characters_clean']
df_relations_pandas = datos_fase2['df_relations']

print(f"\n✅ DataFrames extraídos:")
print(f"   🎬 Películas: {len(df_movies_pandas):,} registros, {len(df_movies_pandas.columns)} columnas")
print(f"   👥 Personajes: {len(df_characters_pandas):,} registros, {len(df_characters_pandas.columns)} columnas")
print(f"   🔗 Relaciones: {len(df_relations_pandas):,} registros, {len(df_relations_pandas.columns)} columnas")

📦 Cargando datos de Fase 2...

✅ Pickle cargado

📋 Contenido:
   - df_movies_clean
   - df_characters_clean
   - df_relations
   - metadata

✅ DataFrames extraídos:
   🎬 Películas: 119 registros, 19 columnas
   👥 Personajes: 1,419 registros, 16 columnas
   🔗 Relaciones: 1,068 registros, 3 columnas


In [7]:
# ══════════════════════════════════════════════════════════════════
# CELDA 7: CONVERTIR PANDAS → SPARK DATAFRAMES
# ══════════════════════════════════════════════════════════════════

print("⚡ Convirtiendo a Spark DataFrames...\n")


def _complex_columns(df):
    """Return the columns of ``df`` holding at least one list or dict value.

    Every value of each object-dtype column is inspected rather than only the
    first row, so a column whose first entry is missing is still detected and
    an empty DataFrame yields an empty list instead of raising.
    """
    return [
        name for name in df.columns
        if df[name].dtype == object
        and df[name].map(lambda value: isinstance(value, (list, dict))).any()
    ]


# ============================================
# 1. MOVIES
# ============================================
print("1️⃣ Convirtiendo películas...")
spark_movies = spark.createDataFrame(df_movies_pandas)
print(f"   ✅ {spark_movies.count():,} registros")
print(f"   Columnas: {len(spark_movies.columns)}")

# ============================================
# 2. CHARACTERS (list/dict columns are stringified first)
# ============================================
print("\n2️⃣ Convirtiendo personajes...")

problematic_cols = _complex_columns(df_characters_pandas)

if problematic_cols:
    print(f"   ⚠️  Columnas con tipos complejos: {problematic_cols}")
    print(f"   📝 Convirtiendo a string...")

    # astype returns a new frame, so df_characters_pandas keeps its original
    # list/dict values for the cells that use it later.
    df_chars_clean = df_characters_pandas.astype(
        {name: str for name in problematic_cols}
    )
else:
    # Nothing to convert: use the frame as it is.
    df_chars_clean = df_characters_pandas

spark_characters = spark.createDataFrame(df_chars_clean)

print(f"   ✅ {spark_characters.count():,} registros")
print(f"   Columnas: {len(spark_characters.columns)}")

# ============================================
# 3. RELATIONS
# ============================================
if not df_relations_pandas.empty:
    print("\n3️⃣ Convirtiendo relaciones...")
    spark_relations = spark.createDataFrame(df_relations_pandas)
    print(f"   ✅ {spark_relations.count():,} registros")
else:
    # Later cells check `spark_relations is not None` before using it.
    print("\n3️⃣ Sin relaciones para convertir")
    spark_relations = None

print("\n✅ Conversión completada exitosamente")

⚡ Convirtiendo a Spark DataFrames...

1️⃣ Convirtiendo películas...


                                                                                

   ✅ 119 registros
   Columnas: 19

2️⃣ Convirtiendo personajes...
   ⚠️  Columnas con tipos complejos: ['films', 'shortFilms', 'tvShows', 'videoGames', 'parkAttractions', 'allies', 'enemies']
   📝 Convirtiendo a string...
   ✅ 1,419 registros
   Columnas: 16

3️⃣ Convirtiendo relaciones...
   ✅ 1,068 registros

✅ Conversión completada exitosamente


In [8]:
# ══════════════════════════════════════════════════════════════════
# CELDA 8: CREAR VISTAS SQL TEMPORALES
# ══════════════════════════════════════════════════════════════════

print("📋 CREANDO VISTAS SQL TEMPORALES\n")
print("=" * 80)

# View name -> DataFrame; "relations" is only registered when it exists.
_sql_views = {"movies": spark_movies, "characters": spark_characters}
if spark_relations is not None:
    _sql_views["relations"] = spark_relations

for _view_name, _view_frame in _sql_views.items():
    _view_frame.createOrReplaceTempView(_view_name)

print("✅ Vistas SQL creadas:")
for _view_name in _sql_views:
    print(f"   - {_view_name}")

# Show the movies schema so the column types are visible before querying.
print("\n📋 Esquema de películas:")
spark_movies.printSchema()

📋 CREANDO VISTAS SQL TEMPORALES

✅ Vistas SQL creadas:
   - movies
   - characters
   - relations

📋 Esquema de películas:
root
 |-- film_title: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- box_office_revenue: string (nullable = true)
 |-- opening_revenue: string (nullable = true)
 |-- release_date: timestamp (nullable = true)
 |-- opening_revenue_over_total_revenue: long (nullable = true)
 |-- imdb_score: double (nullable = true)
 |-- rt_critics_score: long (nullable = true)
 |-- rt_audience_score: long (nullable = true)
 |-- release_year: long (nullable = true)
 |-- release_month: long (nullable = true)
 |-- release_quarter: long (nullable = true)
 |-- release_day_of_week: string (nullable = true)
 |-- box_office_revenue_clean: double (nullable = true)
 |-- decade: long (nullable = true)
 |-- decade_label: string (nullable = true)
 |-- rating_category: string (nullable = true)
 |-- segment: string (nullable = true)
 |-- film_title_clean: string (nullable = true)

In [9]:
# ══════════════════════════════════════════════════════════════════
# CELDA 9: ANÁLISIS CON SPARK SQL
# ══════════════════════════════════════════════════════════════════

print("⚡ ANÁLISIS DISTRIBUIDO CON SPARK SQL\n")
print("=" * 80)


def _query_heading(title):
    """Print a query title followed by an 80-character dashed rule."""
    print(title)
    print("-" * 80)


# Query 1: ten highest-grossing movies, revenue expressed in millions.
_query_heading("\n💰 TOP 10 PELÍCULAS POR REVENUE:")

top_revenue = spark.sql("""
    SELECT 
        film_title,
        release_year,
        ROUND(box_office_revenue_clean/1000000, 2) as revenue_millions,
        segment
    FROM movies
    WHERE box_office_revenue_clean IS NOT NULL
    ORDER BY box_office_revenue_clean DESC
    LIMIT 10
""")

top_revenue.show(10, truncate=False)

# Query 2: movie count plus average and total revenue per decade label.
_query_heading("\n📅 ANÁLISIS POR DÉCADA:")

decade_analysis = spark.sql("""
    SELECT 
        decade_label,
        COUNT(*) as num_movies,
        ROUND(AVG(box_office_revenue_clean)/1000000, 2) as avg_revenue_millions,
        ROUND(SUM(box_office_revenue_clean)/1000000, 2) as total_revenue_millions
    FROM movies
    WHERE decade_label IS NOT NULL
    GROUP BY decade_label
    ORDER BY decade_label
""")

decade_analysis.show(truncate=False)

# Query 3: ten characters with the most total appearances.
_query_heading("\n🌟 TOP 10 PERSONAJES MÁS POPULARES:")

top_characters = spark.sql("""
    SELECT 
        name,
        total_appearances,
        num_films,
        num_tv_shows,
        popularity_category
    FROM characters
    WHERE total_appearances > 0
    ORDER BY total_appearances DESC
    LIMIT 10
""")

top_characters.show(10, truncate=False)

print("\n✅ Análisis SQL completado")

⚡ ANÁLISIS DISTRIBUIDO CON SPARK SQL


💰 TOP 10 PELÍCULAS POR REVENUE:
--------------------------------------------------------------------------------
+------------------------------------------+------------+----------------+-------------------------+
|film_title                                |release_year|revenue_millions|segment                  |
+------------------------------------------+------------+----------------+-------------------------+
|Star Wars: Episode VII - The Force Awakens|2015        |936.66          |Éxito Crítico y Comercial|
|Avengers: Endgame                         |2019        |858.37          |Éxito Crítico y Comercial|
|Spider-Man: No Way Home                   |2021        |804.79          |Éxito Crítico y Comercial|
|Black Panther                             |2018        |700.06          |Éxito Crítico y Comercial|
|Avengers: Infinity War                    |2018        |678.82          |Éxito Crítico y Comercial|
|The Avengers                           

In [10]:
# ══════════════════════════════════════════════════════════════════
# CELDA 10: JOIN DISTRIBUIDO - PELÍCULAS CON PERSONAJES
# ══════════════════════════════════════════════════════════════════

print("🔗 REALIZANDO JOIN DISTRIBUIDO\n")
print("=" * 80)

if spark_relations is not None:
    # Step 1: distinct characters per movie, taken from the "relations" view.
    print("1️⃣ Contando personajes por película...")

    char_count = spark.sql("""
        SELECT 
            movie_title_clean,
            COUNT(DISTINCT character_name) as character_count
        FROM relations
        GROUP BY movie_title_clean
    """)

    print(f"   ✅ {char_count.count()} películas con personajes identificados\n")

    # Step 2: left join so every movie is kept; movies without a match get 0.
    print("2️⃣ Realizando JOIN...")

    movies_enriched = spark_movies.join(
        char_count,
        spark_movies['film_title_clean'] == char_count['movie_title_clean'],
        how='left'
    ).select(
        spark_movies['*'],
        F.coalesce(char_count['character_count'], F.lit(0)).alias('character_count')
    )

    print(f"   ✅ JOIN completado: {movies_enriched.count():,} registros\n")

    # Step 3: show the movies with the most characters.
    print("3️⃣ Top 15 películas con más personajes:")
    print("-" * 80)

    movies_enriched.select(
        'film_title',
        'release_year',
        F.col('box_office_revenue_clean').alias('revenue'),
        'character_count'
    ).orderBy(F.col('character_count').desc()).show(15, truncate=False)

else:
    print("⚠️  No hay relaciones disponibles")
    # Cast to bigint so character_count has the same type as the COUNT-based
    # column built in the branch above; a bare lit(0) would be a 32-bit int and
    # the exported schema would depend on which branch ran.
    movies_enriched = spark_movies.withColumn('character_count', F.lit(0).cast('long'))

print("\n✅ Dataset enriquecido creado")

🔗 REALIZANDO JOIN DISTRIBUIDO

1️⃣ Contando personajes por película...
   ✅ 366 películas con personajes identificados

2️⃣ Realizando JOIN...
   ✅ JOIN completado: 119 registros

3️⃣ Top 15 películas con más personajes:
--------------------------------------------------------------------------------
+-------------------------------+------------+------------+---------------+
|film_title                     |release_year|revenue     |character_count|
+-------------------------------+------------+------------+---------------+
|Ralph Breaks the Internet      |2018        |2.01091711E8|19             |
|Alice Through the Looking Glass|2016        |7.7041381E7 |7              |
|The Jungle Book                |2016        |3.64001123E8|7              |
|Fantasia 2000                  |2000        |6.065542E7  |7              |
|Tangled                        |2010        |2.00821936E8|7              |
|Zootopia                       |2016        |3.41268248E8|6              |
|Home on the R

In [11]:
# ══════════════════════════════════════════════════════════════════
# CELDA 11: AGREGACIONES POR SEGMENTO
# ══════════════════════════════════════════════════════════════════

print("📊 AGREGACIONES POR SEGMENTO\n")
print("=" * 80)

# Per-segment metrics: movie count, total and average revenue (2 decimals),
# and average character count (1 decimal).
_segment_metrics = [
    F.count('*').alias('num_movies'),
    F.round(F.sum('box_office_revenue_clean'), 2).alias('total_revenue'),
    F.round(F.avg('box_office_revenue_clean'), 2).alias('avg_revenue'),
    F.round(F.avg('character_count'), 1).alias('avg_characters'),
]

agg_by_segment = (
    movies_enriched
    .groupBy('segment')
    .agg(*_segment_metrics)
    .orderBy(F.desc('total_revenue'))
)

print("🎯 MÉTRICAS POR SEGMENTO:")
agg_by_segment.show(truncate=False)

# Collect the result as a pandas DataFrame for the export and dashboard cells.
df_segment_agg = agg_by_segment.toPandas()

print(f"\n✅ Agregación completada: {len(df_segment_agg)} segmentos")

📊 AGREGACIONES POR SEGMENTO

🎯 MÉTRICAS POR SEGMENTO:
+-------------------------+----------+---------------+--------------+--------------+
|segment                  |num_movies|total_revenue  |avg_revenue   |avg_characters|
+-------------------------+----------+---------------+--------------+--------------+
|Éxito Crítico y Comercial|30        |1.4382616427E10|4.7942054757E8|1.0           |
|Éxito Crítico            |43        |7.439824875E9  |1.7301918314E8|1.5           |
|Bajo Rendimiento         |38        |4.512640573E9  |1.1875369929E8|1.2           |
|Éxito Comercial          |8         |3.61623254E9   |4.520290675E8 |0.9           |
+-------------------------+----------+---------------+--------------+--------------+


✅ Agregación completada: 4 segmentos


In [14]:
# ══════════════════════════════════════════════════════════════════
# CELDA 12: AGREGACIONES TEMPORALES
# ══════════════════════════════════════════════════════════════════

print("📅 AGREGACIONES TEMPORALES\n")
print("=" * 80)

# Revenue metrics shared by the yearly and the per-decade summaries.
_revenue_metrics = (
    F.count('*').alias('num_movies'),
    F.round(F.avg('box_office_revenue_clean'), 2).alias('avg_revenue'),
    F.round(F.sum('box_office_revenue_clean'), 2).alias('total_revenue'),
)
_avg_characters = F.round(F.avg('character_count'), 1).alias('avg_characters')

# --------------------------------------------
# By YEAR
# --------------------------------------------
print("1️⃣ Agregando por año...")

agg_by_year = (
    movies_enriched
    .groupBy('release_year')
    .agg(*_revenue_metrics, _avg_characters)
    .orderBy('release_year')
)

print(f"✅ {agg_by_year.count()} años procesados")

# Only the years from 2000 onwards are displayed; the aggregate keeps them all.
print("\n📊 Datos por año (2000+):")
agg_by_year.where(F.col('release_year') >= 2000).show(25, truncate=False)

df_year_agg = agg_by_year.toPandas()
print(f"✅ Pandas: {len(df_year_agg)} registros\n")

# --------------------------------------------
# By DECADE
# --------------------------------------------
print("2️⃣ Agregando por década...")

# Group on decade_label when it exists; otherwise derive a numeric decade
# from release_year and group on that.
if 'decade_label' not in movies_enriched.columns:
    print("   ⚠️  Creando columna decade...")
    movies_enriched = movies_enriched.withColumn(
        'decade',
        F.floor(F.col('release_year') / 10) * 10
    )
    decade_col = 'decade'
else:
    decade_col = 'decade_label'

agg_by_decade = (
    movies_enriched
    .groupBy(decade_col)
    .agg(*_revenue_metrics)
    .orderBy(decade_col)
)

print(f"✅ {agg_by_decade.count()} décadas procesadas")

print("\n📊 Datos por década:")
agg_by_decade.show(truncate=False)

df_decade_agg = agg_by_decade.toPandas()
print(f"✅ Pandas: {len(df_decade_agg)} registros")

print("\n✅ CELDA 12 COMPLETADA")

📅 AGREGACIONES TEMPORALES

1️⃣ Agregando por año...
✅ 33 años procesados

📊 Datos por año (2000+):
+------------+----------+--------------+-------------+--------------+
|release_year|num_movies|avg_revenue   |total_revenue|avg_characters|
+------------+----------+--------------+-------------+--------------+
|2000        |3         |7.230504433E7 |2.16915133E8 |4.0           |
|2001        |2         |1.69964861E8  |3.39929722E8 |0.5           |
|2002        |2         |9.19855605E7  |1.83971121E8 |3.0           |
|2003        |2         |2.125256275E8 |4.25051255E8 |1.0           |
|2004        |2         |1.557357765E8 |3.11471553E8 |3.0           |
|2005        |2         |7.74323855E7  |1.54864771E8 |0.0           |
|2006        |1         |2.44082982E8  |2.44082982E8 |0.0           |
|2007        |2         |1.521339125E8 |3.04267825E8 |1.0           |
|2008        |2         |1.689308715E8 |3.37861743E8 |0.0           |
|2009        |3         |1.7842030867E8|5.35260926E8 |1.3    

In [15]:
# ══════════════════════════════════════════════════════════════════
# CELDA 13: GUARDAR EN FORMATO PARQUET
# ══════════════════════════════════════════════════════════════════

from pathlib import Path

print("💾 GUARDANDO ARCHIVOS PARQUET\n")
print("=" * 80)

Path('./spark_output').mkdir(exist_ok=True)

# (progress message, DataFrame, target path) for each dataset, in write order.
_parquet_jobs = (
    ("1️⃣ Guardando movies_enriched...", movies_enriched, './spark_output/movies_enriched.parquet'),
    ("\n2️⃣ Guardando agg_by_segment...", agg_by_segment, './spark_output/agg_segment.parquet'),
    ("\n3️⃣ Guardando agg_temporal...", agg_by_year, './spark_output/agg_temporal.parquet'),
    ("\n4️⃣ Guardando agg_decade...", agg_by_decade, './spark_output/agg_decade.parquet'),
)

for _message, _frame, _target in _parquet_jobs:
    print(_message)
    # mode('overwrite') replaces output left by a previous run.
    _frame.write.mode('overwrite').parquet(_target)
    print("   ✅ Guardado")

print("\n✅ Todos los archivos Parquet guardados en: ./spark_output/")

💾 GUARDANDO ARCHIVOS PARQUET

1️⃣ Guardando movies_enriched...


                                                                                

   ✅ Guardado

2️⃣ Guardando agg_by_segment...
   ✅ Guardado

3️⃣ Guardando agg_temporal...
   ✅ Guardado

4️⃣ Guardando agg_decade...
   ✅ Guardado

✅ Todos los archivos Parquet guardados en: ./spark_output/


In [16]:
# ══════════════════════════════════════════════════════════════════
# CELDA 14: CONVERTIR A CSV Y SUBIR A S3
# ══════════════════════════════════════════════════════════════════

print("☁️  EXPORTANDO A CSV Y SUBIENDO A S3\n")
print("=" * 80)

Path('./data/final').mkdir(parents=True, exist_ok=True)


def _export_and_upload(frame, filename):
    """Write ``frame`` as a UTF-8 CSV under ./data/final and upload it to S3.

    Args:
        frame: pandas DataFrame to export.
        filename: CSV file name, used both locally and under S3_FINAL_PREFIX.

    Returns:
        The status message returned by ``upload_to_s3``.
    """
    local_csv = f'./data/final/{filename}'
    frame.to_csv(local_csv, index=False, encoding='utf-8')
    return upload_to_s3(local_csv, f'{S3_FINAL_PREFIX}/{filename}')


# Collected once here; the dashboard cell reuses df_movies_final.
df_movies_final = movies_enriched.toPandas()

# (progress message, DataFrame, CSV file name) for each dataset, in export order.
_csv_exports = [
    ("1️⃣ Exportando movies_enriched...", df_movies_final, 'movies_spark.csv'),
    ("\n2️⃣ Exportando agg_segment...", df_segment_agg, 'agg_segment.csv'),
    ("\n3️⃣ Exportando agg_temporal...", df_year_agg, 'agg_temporal.csv'),
    ("\n4️⃣ Exportando agg_decade...", df_decade_agg, 'agg_decade.csv'),
]

_failed_uploads = []
for _message, _frame, _filename in _csv_exports:
    print(_message)
    result = _export_and_upload(_frame, _filename)
    print(f"   {result}")
    # upload_to_s3 reports failures through its return string instead of
    # raising, so the outcome has to be read from the message prefix.
    if not result.startswith("✅"):
        _failed_uploads.append(_filename)

# Only claim success when every upload actually succeeded.
if _failed_uploads:
    print(f"\n⚠️  No se pudieron subir {len(_failed_uploads)} archivo(s): {_failed_uploads}")
else:
    print(f"\n🎉 Todos los archivos subidos a S3")
print(f"   Ubicación: s3://{S3_BUCKET}/{S3_FINAL_PREFIX}/")

☁️  EXPORTANDO A CSV Y SUBIENDO A S3

1️⃣ Exportando movies_enriched...
   ✅ Subido: s3://xideralaws-curso-fernanda/disney-project/final/movies_spark.csv

2️⃣ Exportando agg_segment...
   ✅ Subido: s3://xideralaws-curso-fernanda/disney-project/final/agg_segment.csv

3️⃣ Exportando agg_temporal...
   ✅ Subido: s3://xideralaws-curso-fernanda/disney-project/final/agg_temporal.csv

4️⃣ Exportando agg_decade...
   ✅ Subido: s3://xideralaws-curso-fernanda/disney-project/final/agg_decade.csv

🎉 Todos los archivos subidos a S3
   Ubicación: s3://xideralaws-curso-fernanda/disney-project/final/


In [17]:
# ══════════════════════════════════════════════════════════════════
# CELDA 15: GUARDAR DATOS PARA DASHBOARD
# ══════════════════════════════════════════════════════════════════

import pickle
from datetime import datetime

print("💾 PREPARANDO DATOS PARA DASHBOARD\n")
print("=" * 80)

# Ten highest-grossing movies, taken from the collected pandas frame.
_top_revenue = df_movies_final.nlargest(10, 'box_office_revenue_clean')

# Run information stored next to the data.
_run_metadata = {
    'movies_count': len(df_movies_final),
    'characters_count': len(df_characters_pandas),
    'timestamp': datetime.now().isoformat(),
    'notebook': '03b_procesamiento_spark.ipynb',
    'spark_version': spark.version,
    'status': 'SUCCESS'
}

# Everything bundled into the pickle, all as pandas objects or plain Python values.
datos_fase3 = {
    'df_movies_dashboard': df_movies_final,
    'df_characters_final': df_characters_pandas,
    'segment_analysis': df_segment_agg,
    'temporal_analysis': df_year_agg,
    'decade_analysis': df_decade_agg,
    'rankings': {
        'top_revenue': _top_revenue,
    },
    'metadata': _run_metadata,
}

with open('datos_fase3.pkl', 'wb') as f:
    pickle.dump(datos_fase3, f)

_saved_meta = datos_fase3['metadata']
print("✅ Datos guardados en: datos_fase3.pkl")
print(f"   Películas: {_saved_meta['movies_count']}")
print(f"   Personajes: {_saved_meta['characters_count']}")

💾 PREPARANDO DATOS PARA DASHBOARD

✅ Datos guardados en: datos_fase3.pkl
   Películas: 119
   Personajes: 1419


In [18]:
# ══════════════════════════════════════════════════════════════════
# CELDA 16: RESUMEN FINAL Y CERRAR SPARK
# ══════════════════════════════════════════════════════════════════

print("\n" + "="*70)
print("🎉 PROCESAMIENTO SPARK COMPLETADO")
print("="*70)

# Row counts are computed here, while the Spark session is still open.
print(f"\n⚡ SPARK PROCESSING:")
print(f"   Versión: {spark.version}")
print(f"   Películas: {movies_enriched.count():,}")
print(f"   Personajes: {spark_characters.count():,}")
# Explicit None check, matching the other cells: spark_relations is either a
# DataFrame or None, and a DataFrame should not be used as a boolean.
if spark_relations is not None:
    print(f"   Relaciones: {spark_relations.count():,}")

print(f"\n📊 ARCHIVOS GENERADOS:")
print(f"   Parquet: ./spark_output/ (4 archivos)")
print(f"   CSV + S3: s3://{S3_BUCKET}/{S3_FINAL_PREFIX}/ (4 archivos)")
print(f"   Pickle: datos_fase3.pkl")

print(f"\n🚀 SIGUIENTE PASO:")
print(f"   Ejecutar: streamlit run dashboard_disney.py")
print("="*70)

# Stop the session last: no Spark DataFrame can be evaluated after this call.
spark.stop()
print("\n✅ Spark Session cerrada")
print("✅ Notebook 03b completado al 100%")


🎉 PROCESAMIENTO SPARK COMPLETADO

⚡ SPARK PROCESSING:
   Versión: 4.0.1
   Películas: 119
   Personajes: 1,419
   Relaciones: 1,068

📊 ARCHIVOS GENERADOS:
   Parquet: ./spark_output/ (4 archivos)
   CSV + S3: s3://xideralaws-curso-fernanda/disney-project/final/ (4 archivos)
   Pickle: datos_fase3.pkl

🚀 SIGUIENTE PASO:
   Ejecutar: streamlit run dashboard_disney.py

✅ Spark Session cerrada
✅ Notebook 03b completado al 100%
