In [1]:
# ============================================================================
# FASE 2 - OPTIMIZADO PARA SPARK 3.5.1 + DELTA LAKE 3.0
# ============================================================================

# PASO 0: REINICIAR SPARK CON VERSIONES CORRECTAS
try:
    spark.stop()
except:
    pass

import time
time.sleep(3)

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, lower, regexp_replace, translate, length, trim

from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, Word2Vec, 
    StringIndexer, OneHotEncoder, VectorAssembler,
    StandardScaler, PCA
)
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
import numpy as np

spark = (
    SparkSession.builder
    .appName("Bronze_to_Silver_Optimized")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
            "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
print(f" Spark {spark.version} iniciado\n")


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9f7ab7fa-b8ae-4a61-84d1-b919d69880a2;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found io.delta#delta-spark_

 Spark 3.5.1 iniciado



In [2]:
# ============================================================================
# 1. LECTURA DE KAFKA
# ============================================================================
print("="*80)
print("PASO 1: LECTURA DE KAFKA")
print("="*80 + "\n")

# Schema actualizado - SIN municipio, coincide con el producer
contract_schema = StructType([
    StructField("id_contrato", StringType()),
    StructField("objeto_contrato", StringType()),
    StructField("entidad", StringType()),
    StructField("departamento", StringType()),
    StructField("region", StringType()),  # SIN municipio
    StructField("codigo_unspsc", StringType()),
    StructField("descripcion_categoria", StringType()),
    StructField("valor_contrato", DoubleType()),
    StructField("duracion_dias", IntegerType()),
    StructField("fecha_firma", StringType()),
    StructField("tipo_contrato", StringType()),
    StructField("estado_contrato", StringType()),
    StructField("modalidad", StringType()),
    StructField("anno", IntegerType()),
    StructField("id_interno_sistema", StringType()),
    StructField("campo_vacio", StringType()),
    StructField("constante_1", StringType()),
    StructField("constante_2", IntegerType()),
    StructField("duplicate_id", StringType()),
    StructField("timestamp_carga", StringType())
])

print("Conectando a Kafka...")
print("Broker: kafka:29092")
print("T√≥pico: contratos-publicos")
print("Modo: Batch (startingOffsets=earliest)\n")

try:
    df_kafka = spark.read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:29092") \
        .option("subscribe", "contratos-publicos") \
        .option("startingOffsets", "earliest") \
        .option("failOnDataLoss", "false") \
        .load()
    
    print("‚úÖ Conexi√≥n exitosa a Kafka\n")
    
    # Parsear JSON desde Kafka
    print("Parseando mensajes JSON...")
    df_bronze = df_kafka.select(
        from_json(col("value").cast("string"), contract_schema).alias("data")
    ).select("data.*")
    
    # Cache para evitar m√∫ltiples lecturas
    df_bronze = df_bronze.cache()
    
    # Contar mensajes
    print("Contando mensajes...")
    total_kafka = df_bronze.count()
    
    print("\n" + "="*80)
    print(f" LECTURA COMPLETADA")
    print("="*80)
    print(f" Total de mensajes le√≠dos: {total_kafka:,}")
    print(f" Campos en el schema: {len(contract_schema.fields)}")
    print("="*80 + "\n")
    
    # Mostrar muestra de datos
    if total_kafka > 0:
        print("Muestra de los primeros 3 registros:")
        df_bronze.show(3, truncate=True)
        print()
    else:
        print(" ADVERTENCIA: No se encontraron mensajes en el t√≥pico")
        print("Verifica que el producer haya enviado datos correctamente\n")

except Exception as e:
    print("\n" + "="*80)
    print(" ERROR EN LECTURA DE KAFKA")
    print("="*80)
    print(f"Error: {str(e)}\n")
    raise

PASO 1: LECTURA DE KAFKA

Conectando a Kafka...
Broker: kafka:29092
T√≥pico: contratos-publicos
Modo: Batch (startingOffsets=earliest)

‚úÖ Conexi√≥n exitosa a Kafka

Parseando mensajes JSON...
Contando mensajes...


                                                                                


 LECTURA COMPLETADA
 Total de mensajes le√≠dos: 100,946
 Campos en el schema: 20

Muestra de los primeros 3 registros:
+------------+--------------------+--------------------+------------+--------------+-------------+---------------------+--------------+-------------+-----------+--------------------+---------------+--------------------+----+------------------+-----------+-----------+-----------+------------+--------------------+
| id_contrato|     objeto_contrato|             entidad|departamento|        region|codigo_unspsc|descripcion_categoria|valor_contrato|duracion_dias|fecha_firma|       tipo_contrato|estado_contrato|           modalidad|anno|id_interno_sistema|campo_vacio|constante_1|constante_2|duplicate_id|     timestamp_carga|
+------------+--------------------+--------------------+------------+--------------+-------------+---------------------+--------------+-------------+-----------+--------------------+---------------+--------------------+----+------------------+---------

In [3]:
# ============================================================================
# 2. ELIMINAR REDUNDANTES Y PREPARAR DATOS
# ============================================================================
print("="*80)
print("PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS")
print("="*80 + "\n")

# Columnas redundantes a eliminar
redundant_columns = [
    "id_interno_sistema", 
    "campo_vacio", 
    "constante_1",
    "constante_2", 
    "duplicate_id", 
    "timestamp_carga"
]

print(f"üìå Eliminando {len(redundant_columns)} columnas redundantes...")
df_cleaned = df_bronze.drop(*redundant_columns)

print(f"‚úÖ Columnas restantes: {len(df_cleaned.columns)}")
print()

# CR√çTICO: Convertir fecha_firma ANTES de hacer cualquier operaci√≥n
print("üîÑ Preparando campo fecha_firma...")
print("   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)")
print("   Convirtiendo a: date (2024-01-04)")

df_cleaned = df_cleaned.withColumn(
    "fecha_firma_temp",
    to_timestamp(col("fecha_firma"))  # Parsea el timestamp ISO
).withColumn(
    "fecha_firma",
    to_date(col("fecha_firma_temp"))  # Extrae solo la fecha
).drop("fecha_firma_temp")

print("‚úÖ Fecha convertida correctamente\n")

# Liberar bronze ahora que ya no lo necesitamos
print("üßπ Liberando memoria de df_bronze...")
df_bronze.unpersist()
print("‚úÖ Memoria liberada\n")

print("="*80)
print(f"üìä Dataset preparado: {len(df_cleaned.columns)} columnas")
print("="*80 + "\n")

PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS

üìå Eliminando 6 columnas redundantes...
‚úÖ Columnas restantes: 14

üîÑ Preparando campo fecha_firma...
   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)
   Convirtiendo a: date (2024-01-04)
‚úÖ Fecha convertida correctamente

üßπ Liberando memoria de df_bronze...
‚úÖ Memoria liberada

üìä Dataset preparado: 14 columnas



In [4]:
# ============================================================================
# CELDA 1: PREPARACI√ìN Y CONTEO INICIAL
# ============================================================================
print("="*80)
print("PASO 3: LIMPIEZA - PREPARACI√ìN")
print("="*80 + "\n")

print("Cacheando datos para an√°lisis...")
df_cleaned = df_cleaned.cache()
total_cleaned = df_cleaned.count()

print(f"‚úÖ Registros totales: {total_cleaned:,}\n")
print(f"üìã Columnas: {len(df_cleaned.columns)}")
print(f"üíæ Datos cacheados en memoria\n")

PASO 3: LIMPIEZA - PREPARACI√ìN

Cacheando datos para an√°lisis...


[Stage 5:>                                                          (0 + 1) / 1]

‚úÖ Registros totales: 100,946

üìã Columnas: 14
üíæ Datos cacheados en memoria



                                                                                

In [5]:
# ============================================================================
# CELDA 2: AN√ÅLISIS DE NULOS (OPTIMIZADO)
# ============================================================================
print("="*80)
print("AN√ÅLISIS DE CALIDAD DE DATOS")
print("="*80 + "\n")

print("Analizando valores nulos en columnas cr√≠ticas...")

# Solo analizar columnas cr√≠ticas para ahorrar memoria
critical_columns = [
    "id_contrato",
    "objeto_contrato", 
    "valor_contrato",
    "fecha_firma",
    "entidad",
    "departamento",
    "duracion_dias"
]

# An√°lisis optimizado solo de columnas cr√≠ticas
null_analysis = df_cleaned.select([
    count(when(col(c).isNull(), c)).alias(c) 
    for c in critical_columns if c in df_cleaned.columns
]).collect()[0].asDict()

print("üìä Valores nulos en columnas cr√≠ticas:\n")
has_nulls = False
for col_name in critical_columns:
    if col_name in null_analysis:
        null_count = null_analysis[col_name]
        if null_count > 0:
            has_nulls = True
            pct = (null_count / total_cleaned) * 100
            print(f"   ‚ö†Ô∏è  {col_name}: {null_count:,} ({pct:.1f}%)")

if not has_nulls:
    print("   ‚úÖ No hay valores nulos en columnas cr√≠ticas")

print()

AN√ÅLISIS DE CALIDAD DE DATOS

Analizando valores nulos en columnas cr√≠ticas...
üìä Valores nulos en columnas cr√≠ticas:

   ‚ö†Ô∏è  fecha_firma: 695 (0.7%)
   ‚ö†Ô∏è  duracion_dias: 50,350 (49.9%)



In [6]:
# ============================================================================
# CELDA 3: APLICAR FILTROS DE LIMPIEZA
# ============================================================================
print("="*80)
print("APLICANDO FILTROS DE CALIDAD")
print("="*80 + "\n")

print("Aplicando reglas de limpieza:")
print("  ‚úì id_contrato no nulo")
print("  ‚úì objeto_contrato no nulo")
print("  ‚úì valor_contrato no nulo y > 0")
print("  ‚úì fecha_firma no nula")
print()

# Aplicar filtros paso a paso
# NOTA: fecha_firma ya fue convertida a date en el Paso 2
df_silver = df_cleaned \
    .filter(col("id_contrato").isNotNull()) \
    .filter(col("objeto_contrato").isNotNull()) \
    .filter(col("valor_contrato").isNotNull()) \
    .filter(col("valor_contrato") > 0) \
    .filter(col("fecha_firma").isNotNull())

print("‚úÖ Filtros aplicados correctamente\n")

APLICANDO FILTROS DE CALIDAD

Aplicando reglas de limpieza:
  ‚úì id_contrato no nulo
  ‚úì objeto_contrato no nulo
  ‚úì valor_contrato no nulo y > 0
  ‚úì fecha_firma no nula

‚úÖ Filtros aplicados correctamente



In [7]:
# ============================================================================
# CELDA 4: CACHEAR RESULTADOS Y GENERAR REPORTE
# ============================================================================
print("="*80)
print("FINALIZANDO LIMPIEZA")
print("="*80 + "\n")

print("Cacheando datos limpios...")
df_silver = df_silver.cache()
total_silver = df_silver.count()

# Calcular estad√≠sticas
registros_descartados = total_cleaned - total_silver
pct_retenido = (total_silver / total_cleaned) * 100 if total_cleaned > 0 else 0
pct_descartado = (registros_descartados / total_cleaned) * 100 if total_cleaned > 0 else 0

print("\n" + "="*80)
print("üìä RESUMEN DE LIMPIEZA")
print("="*80)
print(f"  Registros iniciales:    {total_cleaned:,}")
print(f"  Registros finales:      {total_silver:,} ({pct_retenido:.1f}%)")
print(f"  Registros descartados:  {registros_descartados:,} ({pct_descartado:.1f}%)")
print("="*80 + "\n")

# Liberar memoria del DataFrame anterior
print("Liberando memoria del cache anterior...")
df_cleaned.unpersist()
print("‚úÖ Limpieza completada\n")

FINALIZANDO LIMPIEZA

Cacheando datos limpios...


[Stage 12:>                                                         (0 + 1) / 1]


üìä RESUMEN DE LIMPIEZA
  Registros iniciales:    100,946
  Registros finales:      99,706 (98.8%)
  Registros descartados:  1,240 (1.2%)

Liberando memoria del cache anterior...
‚úÖ Limpieza completada



                                                                                

In [8]:
# ============================================================================
# 4. ESTAD√çSTICAS
# ============================================================================

print("="*80)
print("PASO 4: ESTAD√çSTICAS")
print("="*80 + "\n")

print("üìä Por regi√≥n:")
df_silver.groupBy("codigo_unspsc").count().orderBy(desc("count")).show(5)

print("\nüìä Top 5 entidades:")
df_silver.groupBy("entidad").count().orderBy(desc("count")).show(5, truncate=False)


PASO 4: ESTAD√çSTICAS

üìä Por regi√≥n:
+-------------+-----+
|codigo_unspsc|count|
+-------------+-----+
|             |50058|
|  V1.80111600|11418|
|  V1.80111701| 4341|
|  V1.85101600| 2605|
|  V1.80111620| 2202|
+-------------+-----+
only showing top 5 rows


üìä Top 5 entidades:
+-------------------------------------------------+-----+
|entidad                                          |count|
+-------------------------------------------------+-----+
|MUNICIPIO DE SOACHA.                             |6354 |
|ALCALD√çA MUNICIPAL COTA                          |3987 |
|ESE MUNICIPAL DE SOACHA JULIO CESAR PE√ëALOZA*    |3822 |
|CUNDINAMARCA-ALCALDIA MUNICIPIO MOSQUERA         |3764 |
|empresa social del estado regi√≥n de salud soacha.|3152 |
+-------------------------------------------------+-----+
only showing top 5 rows



In [10]:
# 1. Entidades
print("üìä Top 5 entidades:")
df_silver.groupBy("entidad").count().orderBy(desc("count")).show(5, truncate=False)

# 2. Departamentos
print("\nüìä Top 5 departamentos:")
df_silver.groupBy("departamento").count().orderBy(desc("count")).show(5, truncate=False)

# 4. Regi√≥n
print("\nüìä Distribuci√≥n por regi√≥n:")
df_silver.groupBy("region").count().orderBy(desc("count")).show(truncate=False)

# 5. C√≥digo UNSPSC
print("\nüìä Top 10 c√≥digos UNSPSC:")
df_silver.groupBy("codigo_unspsc").count().orderBy(desc("count")).show(10, truncate=False)

# 6. Categor√≠a UNSPSC
print("\nüìä Top 10 categor√≠as UNSPSC:")
df_silver.groupBy("descripcion_categoria").count().orderBy(desc("count")).show(10, truncate=False)

# 7. Tipo de contrato
print("\nüìä Distribuci√≥n por tipo de contrato:")
df_silver.groupBy("tipo_contrato").count().orderBy(desc("count")).show(truncate=False)

# 8. Estado del contrato
print("\nüìä Distribuci√≥n de estado del contrato:")
df_silver.groupBy("estado_contrato").count().orderBy(desc("count")).show(truncate=False)

# 9. Modalidad de contrataci√≥n
print("\nüìä Top 10 modalidades:")
df_silver.groupBy("modalidad").count().orderBy(desc("count")).show(10, truncate=False)


üìä Top 5 entidades:
+-------------------------------------------------+-----+
|entidad                                          |count|
+-------------------------------------------------+-----+
|MUNICIPIO DE SOACHA.                             |6354 |
|ALCALD√çA MUNICIPAL COTA                          |3987 |
|ESE MUNICIPAL DE SOACHA JULIO CESAR PE√ëALOZA*    |3822 |
|CUNDINAMARCA-ALCALDIA MUNICIPIO MOSQUERA         |3764 |
|empresa social del estado regi√≥n de salud soacha.|3152 |
+-------------------------------------------------+-----+
only showing top 5 rows


üìä Top 5 departamentos:
+------------+-----+
|departamento|count|
+------------+-----+
|Cundinamarca|99706|
+------------+-----+


üìä Distribuci√≥n por regi√≥n:
+--------------+-----+
|region        |count|
+--------------+-----+
|Centro-Oriente|99706|
+--------------+-----+


üìä Top 10 c√≥digos UNSPSC:
+-------------+-----+
|codigo_unspsc|count|
+-------------+-----+
|             |50058|
|V1.80111600  |11418|
|V1.80

In [11]:
from pyspark.sql.functions import min, max, avg, stddev, expr

# 10. Valor del contrato
print("\nüí∞ Estad√≠sticas de valor_contrato:")
df_silver.select(
    min("valor_contrato").alias("min"),
    max("valor_contrato").alias("max"),
    avg("valor_contrato").alias("mean"),
    stddev("valor_contrato").alias("std")
).show()

# Percentiles
print("\nüí∞ Percentiles de valor_contrato:")
df_silver.approxQuantile("valor_contrato", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)

# 11. Duraci√≥n en d√≠as
print("\n‚è±Ô∏è Estad√≠sticas de duracion_dias:")
df_silver.select(
    min("duracion_dias").alias("min"),
    max("duracion_dias").alias("max"),
    avg("duracion_dias").alias("mean"),
    stddev("duracion_dias").alias("std")
).show()

print("\n‚è±Ô∏è Percentiles de duracion_dias:")
df_silver.approxQuantile("duracion_dias", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)



üí∞ Estad√≠sticas de valor_contrato:
+---+----------------+------------------+--------------------+
|min|             max|              mean|                 std|
+---+----------------+------------------+--------------------+
|1.0|1.50838540149E11|9.96561478195896E7|1.1539535981771367E9|
+---+----------------+------------------+--------------------+


üí∞ Percentiles de valor_contrato:

‚è±Ô∏è Estad√≠sticas de duracion_dias:
+---+----+-----------------+------------------+
|min| max|             mean|               std|
+---+----+-----------------+------------------+
|  0|4297|83.20190545249461|101.88598302532633|
+---+----+-----------------+------------------+


‚è±Ô∏è Percentiles de duracion_dias:


[0.0, 6.0, 40.0, 126.0, 4297.0]

In [10]:
print("\nüìÖ Top a√±os:")
df_silver.groupBy("anno").count().orderBy(desc("anno")).show(10, truncate=False)

print("\nüìÖ Contratos por a√±o:")
df_silver.groupBy("anno").count().orderBy(desc("count")).show(10, truncate=False)

print("\nüìÖ Top fechas de firma:")
df_silver.groupBy("fecha_firma").count().orderBy(desc("count")).show(10, truncate=False)



üìÖ Top a√±os:
+----+-----+
|anno|count|
+----+-----+
|2024|50058|
+----+-----+


üìÖ Contratos por a√±o:
+----+-----+
|anno|count|
+----+-----+
|2024|50058|
+----+-----+


üìÖ Top fechas de firma:
+-----------+-----+
|fecha_firma|count|
+-----------+-----+
|2024-03-01 |457  |
|2024-02-01 |450  |
|2024-03-22 |436  |
|2024-02-02 |425  |
|2024-01-31 |415  |
|2024-02-16 |398  |
|2024-02-09 |398  |
|2024-02-06 |363  |
|2024-09-16 |343  |
|2024-02-08 |343  |
+-----------+-----+
only showing top 10 rows



In [12]:

# --------------------------------------------
# TEXTO / OBJETO DEL CONTRATO
# --------------------------------------------

print("\nüìù Longitud promedio del objeto del contrato:")
df_silver.select(avg(expr("length(objeto_contrato)"))).show()


üìù Longitud promedio del objeto del contrato:
+----------------------------+
|avg(length(objeto_contrato))|
+----------------------------+
|          224.29975126873006|
+----------------------------+



In [13]:
# ============================================================================
# 5. GUARDAR EN DELTA LAKE
# ============================================================================

print("="*80)
print("PASO 5: GUARDAR EN DELTA LAKE")
print("="*80 + "\n")

DELTA_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"üíæ Guardando en: {DELTA_PATH}")

df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(DELTA_PATH)

print("‚úÖ Guardado exitosamente\n")

# ‚ö†Ô∏è LIBERAR todo
df_silver.unpersist()
spark.catalog.clearCache()


PASO 5: GUARDAR EN DELTA LAKE

üíæ Guardando en: /app/notebooks/delta_lake/silver_contracts


                                                                                

‚úÖ Guardado exitosamente



In [10]:
print(f"Spark version: {spark.version}")

Spark version: 3.5.1


In [14]:
# ============================================================================
# 6. VERIFICACI√ìN
# ============================================================================

print("="*80)
print("VERIFICACI√ìN FINAL")
print("="*80 + "\n")

df_verify = spark.read.format("delta").load(DELTA_PATH)
print(f"‚úÖ Registros verificados: {df_verify.count():,}")

df_verify.select("id_contrato", "entidad", "valor_contrato", "fecha_firma") \
    .show(5, truncate=False)

print("\nüéØ Fase 2 completada. Siguiente: Fase 3 - Embeddings\n")

VERIFICACI√ìN FINAL

‚úÖ Registros verificados: 99,706
+------------+----------------------------------------------------+--------------+-----------+
|id_contrato |entidad                                             |valor_contrato|fecha_firma|
+------------+----------------------------------------------------+--------------+-----------+
|CPS-045-2024|empresa social del estado regi√≥n de salud soacha.   |7.8624E7      |2024-01-01 |
|CPS 018-2024|E.S.E HOSPITAL NUESTRA SE√ëORA DEL CARMEN DEL COLEGIO|1.07844E7     |2024-01-01 |
|CPS 012-2024|E.S.E HOSPITAL NUESTRA SE√ëORA DEL CARMEN DEL COLEGIO|1.07844E7     |2024-01-01 |
|024-2024    |ESE HOSPITAL SALAZAR DE VILLETA                     |9363575.0     |2024-01-01 |
|CPS-060-2024|empresa social del estado regi√≥n de salud soacha.   |8.3279308E7   |2024-01-01 |
+------------+----------------------------------------------------+--------------+-----------+
only showing top 5 rows


üéØ Fase 2 completada. Siguiente: Fase 3 - Embeddings



In [2]:
# ============================================================================
# 1. CARGAR DATOS DESDE SILVER
# ============================================================================

print("\n" + "="*80)
print("PASO 1: CARGAR DATOS DESDE SILVER")
print("="*80 + "\n")

SILVER_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"üìñ Cargando datos desde: {SILVER_PATH}")

df_silver = spark.read.format("delta").load(SILVER_PATH)
df_silver = df_silver.cache()

total_records = df_silver.count()
print(f"‚úÖ Registros cargados: {total_records:,}\n")

print("üìã Esquema de datos:")
df_silver.printSchema()

print("\nüìä Muestra de datos:")
df_silver.select(
    "id_contrato", "objeto_contrato", "entidad", 
    "codigo_unspsc", "valor_contrato", "duracion_dias"
).show(3, truncate=False)





PASO 1: CARGAR DATOS DESDE SILVER

üìñ Cargando datos desde: /app/notebooks/delta_lake/silver_contracts




‚úÖ Registros cargados: 50,058

üìã Esquema de datos:
root
 |-- id_contrato: string (nullable = true)
 |-- objeto_contrato: string (nullable = true)
 |-- entidad: string (nullable = true)
 |-- departamento: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- region: string (nullable = true)
 |-- codigo_unspsc: string (nullable = true)
 |-- descripcion_categoria: string (nullable = true)
 |-- valor_contrato: double (nullable = true)
 |-- duracion_dias: integer (nullable = true)
 |-- fecha_firma: date (nullable = true)
 |-- tipo_contrato: string (nullable = true)
 |-- estado_contrato: string (nullable = true)
 |-- modalidad: string (nullable = true)
 |-- anno: integer (nullable = true)


üìä Muestra de datos:
+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [3]:
print("\n" + "="*80)
print("PASO 2: LIMPIEZA Y PREPARACI√ìN DE TEXTO (NORMALIZADO)")
print("="*80 + "\n")

print("üßπ Limpiando columna 'objeto_contrato' y eliminando tildes...")

# Definimos los caracteres a reemplazar
# Nota: Inclu√≠ la '√±' -> 'n' y la '√º' -> 'u'. 
# Si quieres CONSERVAR la √±, qu√≠tala de estas cadenas.
src_chars = "√°√©√≠√≥√∫√º√±"
dst_chars = "aeiouun"

df_prepared = df_silver.withColumn(
    "objeto_limpio",
    trim(                                           # 4. Quitar espacios al inicio/final
        regexp_replace(                             # 3. Colapsar espacios m√∫ltiples
            regexp_replace(                         # 2. Eliminar caracteres especiales
                translate(                          # 1. Reemplazar tildes
                    lower(col("objeto_contrato")),  # 0. Convertir a min√∫sculas
                    src_chars, 
                    dst_chars
                ),
                "[^a-z0-9\\s]", " " # Solo deja letras a-z (sin tildes), n√∫meros y espacios
            ),
            "\\s+", " "
        )
    )
)

# Filtrar textos muy cortos
df_prepared = df_prepared.filter(length(col("objeto_limpio")) >= 10)

print(f" Registros despu√©s de limpieza: {df_prepared.count():,}")

print("\n Ejemplo de texto limpio (Sin tildes):")
df_prepared.select("objeto_contrato", "objeto_limpio").show(5, truncate=50)


PASO 2: LIMPIEZA Y PREPARACI√ìN DE TEXTO (NORMALIZADO)

üßπ Limpiando columna 'objeto_contrato' y eliminando tildes...


ERROR:root:Exception while sending command.                         (1 + 1) / 2]
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_comman

Py4JError: An error occurred while calling o72.count

In [4]:
# ============================================================================
# PASO 3.1: DEFINIR STOPWORDS Y TOKENIZAR
# ============================================================================

print("üî§ PASO 3.1: Tokenizaci√≥n y Stopwords")
print("-" * 80 + "\n")

# Stopwords especializados para contratos
stopwords_es = [
    # B√°sicos espa√±ol
    "el", "la", "de", "que", "y", "a", "en", "un", "ser", "se", "no",
    "por", "con", "su", "para", "como", "estar", "tener", "le", "lo",
    "pero", "hacer", "o", "este", "otro", "ese", "si", "ya", "ver",
    "dar", "muy", "sin", "sobre", "tambi√©n", "hasta", "a√±o", "entre",
    "del", "al", "los", "las", "uno", "una", "unos", "unas",
    # Espec√≠ficos de contratos p√∫blicos
    "contrato", "contratos", "objeto", "prestacion", "prestaci√≥n",
    "servicio", "servicios", "suministro", "ejecucion", "ejecuci√≥n",
    "acuerdo", "establecido", "pliego", "condiciones", "especificaciones",
    "entidad", "contratante", "contratista", "plazo", "termino"
]

print(f"   ‚Ä¢ Stopwords definidos: {len(stopwords_es)} (b√°sicos + contratos)")

# Tokenizaci√≥n
tokenizer = Tokenizer(inputCol="objeto_limpio", outputCol="palabras")
df_tokenized = tokenizer.transform(df_prepared)

# Remover stopwords
remover = StopWordsRemover(
    inputCol="palabras", 
    outputCol="palabras_sin_stopwords",
    stopWords=stopwords_es
)
df_filtered = remover.transform(df_tokenized)

print("   ‚úÖ Tokenizaci√≥n y stopwords completados\n")

print("üìã Ejemplo:")
df_filtered.select("objeto_limpio", "palabras_sin_stopwords").show(2, truncate=70)


üî§ PASO 3.1: Tokenizaci√≥n y Stopwords
--------------------------------------------------------------------------------

   ‚Ä¢ Stopwords definidos: 65 (b√°sicos + contratos)
   ‚úÖ Tokenizaci√≥n y stopwords completados

üìã Ejemplo:
+----------------------------------------------------------------------+----------------------------------------------------------------------+
|                                                         objeto_limpio|                                                palabras_sin_stopwords|
+----------------------------------------------------------------------+----------------------------------------------------------------------+
|prestacion de servicios como medico general con experiencia en coor...|[medico, general, experiencia, coordinacion, hospitalizacion, urgen...|
|prestar apoyo al proceso asistencial en el area de farmacia tecnica...|[prestar, apoyo, proceso, asistencial, area, farmacia, tecnica, e, ...|
+------------------------------------------

In [5]:
# ============================================================================
# PASO 3.2: LIMPIEZA Y FILTRADO
# ============================================================================

print("\nüîç PASO 3.2: Limpieza y Filtrado")
print("-" * 80 + "\n")

# Filtrar palabras muy cortas (ruido)
def clean_words(words):
    """Mantener solo palabras >= 3 caracteres"""
    if not words:
        return []
    return [w for w in words if len(w) >= 3]

clean_udf = udf(clean_words, ArrayType(StringType()))

df_filtered = df_filtered.withColumn(
    "palabras_filtradas",
    clean_udf(col("palabras_sin_stopwords"))
)

# Eliminar documentos vac√≠os
df_filtered = df_filtered.filter(size(col("palabras_filtradas")) > 0)

print("   ‚Ä¢ Filtro aplicado: palabras >= 3 caracteres")
print("   ‚Ä¢ Documentos vac√≠os eliminados")
print("   ‚úÖ Limpieza completada\n")




üîç PASO 3.2: Limpieza y Filtrado
--------------------------------------------------------------------------------

   ‚Ä¢ Filtro aplicado: palabras >= 3 caracteres
   ‚Ä¢ Documentos vac√≠os eliminados
   ‚úÖ Limpieza completada



In [7]:
# ============================================================================
# 4. GENERACI√ìN DE EMBEDDINGS CON WORD2VEC
# ============================================================================

print("\n" + "="*80)
print("PASO 4: GENERACI√ìN DE EMBEDDINGS CON WORD2VEC")
print("="*80 + "\n")

print("üî¢ Entrenando modelo Word2Vec...")
print("   - Vector size: 100 dimensiones")
print("   - Min word count: 2 (palabras que aparecen al menos 2 veces)")
print("   - Iterations: 10\n")

# Configurar Word2Vec
word2vec = Word2Vec(
    vectorSize=100,
    minCount=2,
    maxIter=10,
    seed=42,
    inputCol="palabras_filtradas",
    outputCol="embedding_raw"
)

# Entrenar Word2Vec
print("‚è≥ Entrenando (esto puede tardar 1-2 minutos)...")
word2vec_model = word2vec.fit(df_filtered)

# Aplicar el modelo
df_embeddings = word2vec_model.transform(df_filtered)

print("‚úÖ Word2Vec entrenado y aplicado")

# Estad√≠sticas del vocabulario
vocab_size = len(word2vec_model.getVectors().collect())
print(f"\nüìä Tama√±o del vocabulario: {vocab_size:,} palabras √∫nicas")

print("\nüìã Ejemplo de embeddings (primeros 10 valores):")

# SOLUCI√ìN: Crear UDF para convertir Vector a Array
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.linalg import VectorUDT, Vectors

# UDF para convertir Vector de Spark ML a Array
def vector_to_array(vector):
    return vector.toArray().tolist() if vector is not None else None

vector_to_array_udf = udf(vector_to_array, ArrayType(DoubleType()))

# Convertir el embedding a array y mostrar primeros 10 valores
df_embeddings.select(
    "id_contrato",
    slice(vector_to_array_udf(col("embedding_raw")), 1, 10).alias("embedding_muestra")
).show(2, truncate=False)

# Opcional: Ver el tama√±o completo del embedding
print("\nüìè Verificando dimensi√≥n del embedding:")
df_embeddings.select(
    "id_contrato",
    size(vector_to_array_udf(col("embedding_raw"))).alias("dimension_embedding")
).show(5)


PASO 4: GENERACI√ìN DE EMBEDDINGS CON WORD2VEC



ConnectionRefusedError: [Errno 111] Connection refused