In [2]:
# Fase 2: Procesamiento y Limpieza (Bronze ‚Üí Silver)
# Pipeline simplificado: Kafka ‚Üí Limpieza ‚Üí Delta Lake

# ============================================================================
# CONFIGURACI√ìN INICIAL
# ============================================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Crear SparkSession con soporte para Delta Lake y Kafka
spark = SparkSession.builder \
    .appName("Bronze_to_Silver_Pipeline") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,io.delta:delta-core_2.12:2.3.0") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

print("SparkSession creada con soporte para Delta Lake y Kafka")
print(f"Spark Version: {spark.version}")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d5c037f6-a9e6-4c35-9da2-61f43d739a2f;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.0 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#common

SparkSession creada con soporte para Delta Lake y Kafka
Spark Version: 3.5.1


25/12/04 23:21:57 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
# ============================================================================
# 1. LECTURA DE DATOS DESDE KAFKA (BRONZE LAYER)
# ============================================================================

print("\n" + "="*80)
print("PASO 1: LECTURA DE DATOS DESDE KAFKA")
print("="*80 + "\n")

# Configuraci√≥n de Kafka
KAFKA_BOOTSTRAP_SERVERS = "kafka:29092"
KAFKA_TOPIC = "contratos-publicos"

# Esquema del JSON de contratos
contract_schema = StructType([
    StructField("id_contrato", StringType(), True),
    StructField("objeto_contrato", StringType(), True),
    StructField("entidad", StringType(), True),
    StructField("departamento", StringType(), True),
    StructField("municipio", StringType(), True),
    StructField("region", StringType(), True),
    StructField("codigo_unspsc", StringType(), True),
    StructField("descripcion_categoria", StringType(), True),
    StructField("valor_contrato", DoubleType(), True),
    StructField("duracion_dias", IntegerType(), True),
    StructField("fecha_firma", StringType(), True),
    StructField("tipo_contrato", StringType(), True),
    StructField("estado_contrato", StringType(), True),
    StructField("modalidad", StringType(), True),
    StructField("anno", IntegerType(), True),
    StructField("id_interno_sistema", StringType(), True),
    StructField("campo_vacio", StringType(), True),
    StructField("constante_1", StringType(), True),
    StructField("constante_2", IntegerType(), True),
    StructField("duplicate_id", StringType(), True),
    StructField("timestamp_carga", StringType(), True)
])

# Leer desde Kafka (modo batch)
print(f" Leyendo datos desde Kafka: {KAFKA_TOPIC}")

df_kafka = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS) \
    .option("subscribe", KAFKA_TOPIC) \
    .option("startingOffsets", "earliest") \
    .load()

print(f" Mensajes le√≠dos desde Kafka: {df_kafka.count():,}")

# Extraer metadatos de Kafka y parsear JSON
df_bronze = df_kafka.select(
    col("key").cast("string").alias("kafka_key"),
    col("topic").alias("kafka_topic"),
    col("partition").alias("kafka_partition"),
    col("offset").alias("kafka_offset"),
    col("timestamp").alias("kafka_timestamp"),
    from_json(col("value").cast("string"), contract_schema).alias("data")
).select(
    "kafka_topic",
    "kafka_partition", 
    "kafka_offset",
    "kafka_timestamp",
    "data.*"
)

print("\n Esquema de datos Bronze:")
df_bronze.printSchema()

print("\n Muestra de datos:")
df_bronze.select("id_contrato", "entidad", "valor_contrato", "fecha_firma").show(5, truncate=False)



PASO 1: LECTURA DE DATOS DESDE KAFKA

 Leyendo datos desde Kafka: contratos-publicos


25/12/04 23:25:04 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/04 23:25:04 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/04 23:25:04 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/04 23:25:04 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/04 23:25:04 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
                                                                                

 Mensajes le√≠dos desde Kafka: 50,349

 Esquema de datos Bronze:
root
 |-- kafka_topic: string (nullable = true)
 |-- kafka_partition: integer (nullable = true)
 |-- kafka_offset: long (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)
 |-- id_contrato: string (nullable = true)
 |-- objeto_contrato: string (nullable = true)
 |-- entidad: string (nullable = true)
 |-- departamento: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- region: string (nullable = true)
 |-- codigo_unspsc: string (nullable = true)
 |-- descripcion_categoria: string (nullable = true)
 |-- valor_contrato: double (nullable = true)
 |-- duracion_dias: integer (nullable = true)
 |-- fecha_firma: string (nullable = true)
 |-- tipo_contrato: string (nullable = true)
 |-- estado_contrato: string (nullable = true)
 |-- modalidad: string (nullable = true)
 |-- anno: integer (nullable = true)
 |-- id_interno_sistema: string (nullable = true)
 |-- campo_vacio: string (nullable = true)

25/12/04 23:25:13 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/04 23:25:13 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/04 23:25:13 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/04 23:25:13 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/04 23:25:13 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.


+------------+----------------------------------------------------+--------------+-----------+
|id_contrato |entidad                                             |valor_contrato|fecha_firma|
+------------+----------------------------------------------------+--------------+-----------+
|CPS-045-2024|empresa social del estado regi√≥n de salud soacha.   |7.8624E7      |2024-01-01 |
|CPS 018-2024|E.S.E HOSPITAL NUESTRA SE√ëORA DEL CARMEN DEL COLEGIO|1.07844E7     |2024-01-01 |
|CPS 012-2024|E.S.E HOSPITAL NUESTRA SE√ëORA DEL CARMEN DEL COLEGIO|1.07844E7     |2024-01-01 |
|024-2024    |ESE HOSPITAL SALAZAR DE VILLETA                     |9363575.0     |2024-01-01 |
|CPS-060-2024|empresa social del estado regi√≥n de salud soacha.   |8.3279308E7   |2024-01-01 |
+------------+----------------------------------------------------+--------------+-----------+
only showing top 5 rows



In [5]:
# ============================================================================
# 2. IDENTIFICACI√ìN Y ELIMINACI√ìN DE COLUMNAS REDUNDANTES
# ============================================================================

print("\n" + "="*80)
print("PASO 2: ELIMINACI√ìN DE COLUMNAS REDUNDANTES")
print("="*80 + "\n")

print(" An√°lisis de columnas redundantes:\n")

# Verificar columnas constantes
print(" Valores √∫nicos en columnas candidatas a eliminar:")
df_bronze.select("campo_vacio", "constante_1", "constante_2").distinct().show()

print("\n Verificar duplicados:")
print(f"   - id_contrato √∫nicos: {df_bronze.select('id_contrato').distinct().count():,}")
print(f"   - duplicate_id √∫nicos: {df_bronze.select('duplicate_id').distinct().count():,}")

# Variables redundantes identificadas:
# 1. id_interno_sistema: ID generado internamente, no aporta valor al modelo
# 2. campo_vacio: Siempre NULL
# 3. constante_1: Siempre "VALOR_FIJO"
# 4. constante_2: Siempre 100
# 5. duplicate_id: Duplicado de id_contrato
# 6. timestamp_carga: Solo para auditor√≠a del proceso de carga
# 7. kafka_*: Metadatos del mensaje, no del contrato

redundant_columns = [
    "id_interno_sistema",
    "campo_vacio", 
    "constante_1",
    "constante_2",
    "duplicate_id",
    "timestamp_carga",
    "kafka_topic",
    "kafka_partition",
    "kafka_offset",
    "kafka_timestamp",
    "kafka_key"
]

print("\n Columnas a eliminar:")
for i, col_name in enumerate(redundant_columns, 1):
    print(f"   {i}. {col_name}")

# Eliminar columnas redundantes
df_cleaned = df_bronze.drop(*redundant_columns)

print(f"\n Columnas eliminadas: {len(redundant_columns)}")
print(f" Columnas originales: {len(df_bronze.columns)}")
print(f" Columnas restantes: {len(df_cleaned.columns)}")

print(f"\n Columnas finales:")
for i, col_name in enumerate(df_cleaned.columns, 1):
    print(f"   {i}. {col_name}")


PASO 2: ELIMINACI√ìN DE COLUMNAS REDUNDANTES

 An√°lisis de columnas redundantes:

 Valores √∫nicos en columnas candidatas a eliminar:


25/12/04 23:25:19 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/04 23:25:19 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/04 23:25:19 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/04 23:25:19 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/04 23:25:19 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
25/12/04 23:25:23 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/04 23:25:23 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/04 23:25:23 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/04 23:25:23 WARN AdminClientConfig: The c

+-----------+-----------+-----------+
|campo_vacio|constante_1|constante_2|
+-----------+-----------+-----------+
|       NULL| VALOR_FIJO|        100|
+-----------+-----------+-----------+


 Verificar duplicados:


25/12/04 23:25:25 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/04 23:25:25 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/04 23:25:25 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/04 23:25:25 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/04 23:25:25 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.


   - id_contrato √∫nicos: 31,509


[Stage 13:>                                                         (0 + 1) / 1]

   - duplicate_id √∫nicos: 31,509

 Columnas a eliminar:
   1. id_interno_sistema
   2. campo_vacio
   3. constante_1
   4. constante_2
   5. duplicate_id
   6. timestamp_carga
   7. kafka_topic
   8. kafka_partition
   9. kafka_offset
   10. kafka_timestamp
   11. kafka_key

 Columnas eliminadas: 11
 Columnas originales: 25
 Columnas restantes: 15

 Columnas finales:
   1. id_contrato
   2. objeto_contrato
   3. entidad
   4. departamento
   5. municipio
   6. region
   7. codigo_unspsc
   8. descripcion_categoria
   9. valor_contrato
   10. duracion_dias
   11. fecha_firma
   12. tipo_contrato
   13. estado_contrato
   14. modalidad
   15. anno


                                                                                

In [7]:
# ============================================================================
# 3. LIMPIEZA Y VALIDACI√ìN DE DATOS
# ============================================================================

print("\n" + "="*80)
print("PASO 3: LIMPIEZA Y VALIDACI√ìN")
print("="*80 + "\n")

# Analizar valores nulos antes de limpiar
print(" An√°lisis de valores nulos:\n")
null_analysis = []
for col_name in df_cleaned.columns:
    null_count = df_cleaned.filter(col(col_name).isNull()).count()
    null_pct = (null_count / df_cleaned.count()) * 100
    null_analysis.append((col_name, null_count, null_pct))

null_df = spark.createDataFrame(null_analysis, ["columna", "nulos", "porcentaje"])
null_df.orderBy(desc("nulos")).show(20, truncate=False)

# Aplicar filtros de limpieza
print("\n Aplicando reglas de limpieza:")
print("   1. id_contrato no nulo")
print("   2. objeto_contrato no nulo")
print("   3. valor_contrato no nulo y mayor a 0")
print("   4. fecha_firma no nula")

df_silver = df_cleaned \
    .filter(col("id_contrato").isNotNull()) \
    .filter(col("objeto_contrato").isNotNull()) \
    .filter(col("valor_contrato").isNotNull()) \
    .filter(col("valor_contrato") > 0) \
    .filter(col("fecha_firma").isNotNull())

# Convertir fecha_firma a tipo Date
df_silver = df_silver.withColumn(
    "fecha_firma",
    to_date(col("fecha_firma"), "yyyy-MM-dd")
)

print(f"\n Resultado de limpieza:")
print(f"   Registros Bronze: {df_cleaned.count():,}")
print(f"   Registros Silver: {df_silver.count():,}")
print(f"   Descartados: {df_cleaned.count() - df_silver.count():,}")


PASO 3: LIMPIEZA Y VALIDACI√ìN

 An√°lisis de valores nulos:



25/12/05 00:30:56 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/05 00:30:56 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/05 00:30:56 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/05 00:30:56 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/05 00:30:56 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
25/12/05 00:31:03 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/05 00:31:03 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/05 00:31:03 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/05 00:31:03 WARN AdminClientConfig: The c

+---------------------+-----+----------+
|columna              |nulos|porcentaje|
+---------------------+-----+----------+
|duracion_dias        |50349|100.0     |
|tipo_contrato        |0    |0.0       |
|modalidad            |0    |0.0       |
|codigo_unspsc        |0    |0.0       |
|anno                 |0    |0.0       |
|estado_contrato      |0    |0.0       |
|objeto_contrato      |0    |0.0       |
|descripcion_categoria|0    |0.0       |
|region               |0    |0.0       |
|entidad              |0    |0.0       |
|valor_contrato       |0    |0.0       |
|fecha_firma          |0    |0.0       |
|id_contrato          |0    |0.0       |
|departamento         |0    |0.0       |
|municipio            |0    |0.0       |
+---------------------+-----+----------+


 Aplicando reglas de limpieza:
   1. id_contrato no nulo
   2. objeto_contrato no nulo
   3. valor_contrato no nulo y mayor a 0
   4. fecha_firma no nula

 Resultado de limpieza:


25/12/05 00:34:57 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/05 00:34:57 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/05 00:34:57 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/05 00:34:57 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/05 00:34:57 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
                                                                                

   Registros Bronze: 50,349


25/12/05 00:36:16 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/05 00:36:16 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/05 00:36:16 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/05 00:36:16 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/05 00:36:16 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
25/12/05 00:36:24 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/05 00:36:24 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/05 00:36:24 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/05 00:36:24 WARN AdminClientConfig: The c

   Registros Silver: 50,058


25/12/05 00:36:25 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/12/05 00:36:25 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/12/05 00:36:25 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/12/05 00:36:25 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/12/05 00:36:25 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
[Stage 210:>                                                        (0 + 1) / 1]

   Descartados: 291


                                                                                

In [None]:
# ============================================================================
# 4. ESTAD√çSTICAS DESCRIPTIVAS
# ============================================================================

print("\n" + "="*80)
print("PASO 4: ESTAD√çSTICAS DESCRIPTIVAS")
print("="*80 + "\n")

print(" Distribuci√≥n por Regi√≥n:")
df_silver.groupBy("region") \
    .agg(
        count("*").alias("total_contratos"),
        round(sum("valor_contrato"), 2).alias("valor_total"),
        round(avg("valor_contrato"), 2).alias("valor_promedio")
    ) \
    .orderBy(desc("total_contratos")) \
    .show()

print("\n Top 10 Entidades por N√∫mero de Contratos:")
df_silver.groupBy("entidad") \
    .count() \
    .orderBy(desc("count")) \
    .limit(10) \
    .show(truncate=False)

print("\n Top 10 Categor√≠as (UNSPSC) m√°s frecuentes:")
df_silver.groupBy("codigo_unspsc", "descripcion_categoria") \
    .count() \
    .orderBy(desc("count")) \
    .limit(10) \
    .show(truncate=False)

print("\n Estad√≠sticas de Valor de Contratos:")
df_silver.select("valor_contrato").describe().show()

print("\nEstad√≠sticas de Duraci√≥n (d√≠as):")
df_silver.select("duracion_dias") \
    .filter(col("duracion_dias").isNotNull()) \
    .describe() \
    .show()

print("\n Distribuci√≥n por Modalidad de Contrataci√≥n:")
df_silver.groupBy("modalidad") \
    .count() \
    .orderBy(desc("count")) \
    .show(truncate=False)


In [None]:
# ============================================================================
# 5. PERSISTENCIA EN DELTA LAKE (SILVER LAYER)
# ============================================================================

print("\n" + "="*80)
print("PASO 5: PERSISTENCIA EN DELTA LAKE")
print("="*80 + "\n")

# Ruta para guardar la tabla Delta
DELTA_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f" Guardando datos en Delta Lake: {DELTA_PATH}")

# Escribir en formato Delta
df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(DELTA_PATH)

print(" Datos guardados en Delta Lake (Silver Layer)")

# Verificar la tabla Delta
print("\n Verificando tabla Delta...")
df_verify = spark.read.format("delta").load(DELTA_PATH)
print(f"   Registros verificados: {df_verify.count():,}")
print(f"   Particiones: {df_verify.rdd.getNumPartitions()}")

In [None]:
# ============================================================================
# 6. CREAR VISTA SQL Y CONSULTAS DE EJEMPLO
# ============================================================================

print("\n" + "="*80)
print("PASO 6: CREAR VISTA SQL")
print("="*80 + "\n")

# Crear vista temporal
df_silver.createOrReplaceTempView("silver_contracts")
print(" Vista SQL 'silver_contracts' creada")

# Consultas SQL de ejemplo
print("\n Consulta SQL - Top 5 contratos m√°s costosos:\n")
spark.sql("""
    SELECT 
        id_contrato,
        entidad,
        valor_contrato,
        FORMAT_NUMBER(valor_contrato, 0) as valor_formateado,
        fecha_firma,
        LEFT(objeto_contrato, 80) as objeto_resumen
    FROM silver_contracts
    ORDER BY valor_contrato DESC
    LIMIT 5
""").show(truncate=False)

print("\n Consulta SQL - Resumen por mes:\n")
spark.sql("""
    SELECT 
        YEAR(fecha_firma) as a√±o,
        MONTH(fecha_firma) as mes,
        COUNT(*) as total_contratos,
        FORMAT_NUMBER(SUM(valor_contrato), 0) as valor_total,
        FORMAT_NUMBER(AVG(valor_contrato), 0) as valor_promedio
    FROM silver_contracts
    GROUP BY YEAR(fecha_firma), MONTH(fecha_firma)
    ORDER BY a√±o, mes
""").show()

In [None]:
# ============================================================================
# 7. REPORTE FINAL
# ============================================================================

print("\n" + "="*80)
print("‚úÖ FASE 2 COMPLETADA - REPORTE FINAL")
print("="*80 + "\n")

# Obtener estad√≠sticas finales
total_contratos = df_silver.count()
total_valor = df_silver.agg(sum("valor_contrato")).collect()[0][0]
valor_promedio = df_silver.agg(avg("valor_contrato")).collect()[0][0]
fecha_min, fecha_max = df_silver.agg(min("fecha_firma"), max("fecha_firma")).collect()[0]

print("üìä RESUMEN DEL PIPELINE:\n")
print(f"   ‚úÖ Total de contratos procesados: {total_contratos:,}")
print(f"   ‚úÖ Columnas redundantes eliminadas: {len(redundant_columns)}")
print(f"   ‚úÖ Valor total de contratos: ${total_valor:,.2f}")
print(f"   ‚úÖ Valor promedio por contrato: ${valor_promedio:,.2f}")
print(f"   ‚úÖ Rango de fechas: {fecha_min} a {fecha_max}")
print(f"   ‚úÖ Departamento: {df_silver.select('departamento').distinct().first()[0]}")
print(f"   ‚úÖ Regi√≥n: {df_silver.select('region').distinct().first()[0]}")

print("\nüìÅ UBICACI√ìN DE DATOS:")
print(f"   Delta Lake: {DELTA_PATH}")
print(f"   Vista SQL: silver_contracts")

print("\nüéØ SIGUIENTE PASO:")
print("   Fase 3: Feature Engineering y Embeddings")
print("   - Tokenizaci√≥n y limpieza de texto")
print("   - Generaci√≥n de embeddings con Word2Vec")
print("   - Codificaci√≥n de variables categ√≥ricas")
print("   - Reducci√≥n de dimensionalidad con PCA")

print("\n" + "="*80 + "\n")