## Fase 2

In [3]:
# ============================================================================
# FASE 2 - OPTIMIZADO PARA SPARK 3.5.1 + DELTA LAKE 3.0
# ============================================================================

# PASO 0: REINICIAR SPARK CON VERSIONES CORRECTAS
try:
    spark.stop()
except:
    pass

import time
time.sleep(3)

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, lower, regexp_replace, translate, length, trim

from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, Word2Vec, 
    StringIndexer, OneHotEncoder, VectorAssembler,
    StandardScaler, PCA
)
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
import numpy as np

spark = (
    SparkSession.builder
    .appName("Bronze_to_Silver_Optimized")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
            "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
print(f" Spark {spark.version} iniciado\n")


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c3c061e6-e049-4b5e-a5d8-7d2b37e59976;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found io.delta#delta-spark_

 Spark 3.5.1 iniciado



In [13]:
# ============================================================================
# 1. LECTURA DE KAFKA
# ============================================================================

print("="*80)
print("PASO 1: LECTURA DE KAFKA")
print("="*80 + "\n")

contract_schema = StructType([
    StructField("id_contrato", StringType()),
    StructField("objeto_contrato", StringType()),
    StructField("entidad", StringType()),
    StructField("departamento", StringType()),
    StructField("municipio", StringType()),
    StructField("region", StringType()),
    StructField("codigo_unspsc", StringType()),
    StructField("descripcion_categoria", StringType()),
    StructField("valor_contrato", DoubleType()),
    StructField("duracion_dias", IntegerType()),
    StructField("fecha_firma", StringType()),
    StructField("tipo_contrato", StringType()),
    StructField("estado_contrato", StringType()),
    StructField("modalidad", StringType()),
    StructField("anno", IntegerType()),
    StructField("id_interno_sistema", StringType()),
    StructField("campo_vacio", StringType()),
    StructField("constante_1", StringType()),
    StructField("constante_2", IntegerType()),
    StructField("duplicate_id", StringType()),
    StructField("timestamp_carga", StringType())
])

print("Leyendo Kafka...")

df_kafka = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "contratos-publicos") \
    .option("startingOffsets", "earliest") \
    .load()

df_bronze = df_kafka.select(
    from_json(col("value").cast("string"), contract_schema).alias("data")
).select("data.*")

df_bronze = df_bronze.cache()
total_kafka = df_bronze.count()

print(f" Mensajes: {total_kafka:,}\n")



PASO 1: LECTURA DE KAFKA

Leyendo Kafka...


[Stage 52:>                                                         (0 + 1) / 1]

 Mensajes: 100,698



                                                                                

In [14]:
# ============================================================================
# 2. ELIMINAR REDUNDANTES Y PREPARAR DATOS
# ============================================================================
print("="*80)
print("PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS")
print("="*80 + "\n")

# Columnas redundantes a eliminar
redundant_columns = [
    "id_interno_sistema",
    "campo_vacio",
    "constante_1",
    "constante_2",
    "duplicate_id",
    "timestamp_carga"
]

print(f" Eliminando {len(redundant_columns)} columnas redundantes...")
df_cleaned = df_bronze.drop(*redundant_columns)

print(f"Columnas restantes: {len(df_cleaned.columns)}")
print()

print(" Preparando campo fecha_firma...")
print("   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)")
print("   Convirtiendo a: date (2024-01-04)")

df_cleaned = (
    df_cleaned
    .withColumn("fecha_firma_temp", to_timestamp(col("fecha_firma")))
    .withColumn("fecha_firma", to_date(col("fecha_firma_temp")))
    .drop("fecha_firma_temp")
)

print(" Fecha convertida correctamente\n")

# Liberar bronze ahora que ya no lo necesitamos
print(" Liberando memoria de df_bronze...")
df_bronze.unpersist()
print(" Memoria liberada\n")

print("="*80)
print(f" Dataset preparado: {len(df_cleaned.columns)} columnas")
print("="*80 + "\n")


PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS

 Eliminando 6 columnas redundantes...
Columnas restantes: 15

 Preparando campo fecha_firma...
   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)
   Convirtiendo a: date (2024-01-04)
 Fecha convertida correctamente

 Liberando memoria de df_bronze...
 Memoria liberada

 Dataset preparado: 15 columnas



In [15]:
# ============================================================================
# CELDA 1: PREPARACI√ìN Y CONTEO INICIAL
# ============================================================================
print("="*80)
print("PASO 3: LIMPIEZA - PREPARACI√ìN")
print("="*80 + "\n")

print("Cacheando datos para an√°lisis...")
df_cleaned = df_cleaned.cache()
total_cleaned = df_cleaned.count()

print(f" Registros totales: {total_cleaned:,}\n")
print(f" Columnas: {len(df_cleaned.columns)}")
print(f" Datos cacheados¬†en¬†memoria\n")

PASO 3: LIMPIEZA - PREPARACI√ìN

Cacheando datos para an√°lisis...


[Stage 56:>                                                         (0 + 1) / 1]

 Registros totales: 100,698

 Columnas: 15
 Datos cacheados¬†en¬†memoria



                                                                                

In [16]:
# ============================================================================
# CELDA 2: AN√ÅLISIS DE NULOS (OPTIMIZADO)
# ============================================================================
print("="*80)
print("AN√ÅLISIS DE CALIDAD DE DATOS")
print("="*80 + "\n")

print("Analizando valores nulos en columnas cr√≠ticas...")

# Solo analizar columnas cr√≠ticas para ahorrar memoria
critical_columns = [
    "id_contrato",
    "objeto_contrato", 
    "valor_contrato",
    "fecha_firma",
    "entidad",
    "departamento",
    "duracion_dias"
]

# An√°lisis optimizado solo de columnas cr√≠ticas
null_analysis = df_cleaned.select([
    count(when(col(c).isNull(), c)).alias(c) 
    for c in critical_columns if c in df_cleaned.columns
]).collect()[0].asDict()

print("üìä Valores nulos en columnas cr√≠ticas:\n")
has_nulls = False
for col_name in critical_columns:
    if col_name in null_analysis:
        null_count = null_analysis[col_name]
        if null_count > 0:
            has_nulls = True
            pct = (null_count / total_cleaned) * 100
            print(f"   ‚ö†  {col_name}: {null_count:,} ({pct:.1f}%)")

if not has_nulls:
    print("   ‚úÖ No hay valores nulos en columnas cr√≠ticas")

print()


AN√ÅLISIS DE CALIDAD DE DATOS

Analizando valores nulos en columnas cr√≠ticas...
üìä Valores nulos en columnas cr√≠ticas:

   ‚ö†  fecha_firma: 695 (0.7%)
   ‚ö†  duracion_dias: 50,350 (50.0%)



In [17]:
# ============================================================================
# CELDA 3: APLICAR FILTROS DE LIMPIEZA
# ============================================================================
print("="*80)
print("APLICANDO FILTROS DE CALIDAD")
print("="*80 + "\n")

print("Aplicando reglas de limpieza:")
print("  ‚úì id_contrato no nulo")
print("  ‚úì objeto_contrato no nulo")
print("  ‚úì valor_contrato no nulo y > 0")
print("  ‚úì fecha_firma no nula")
print()

# Aplicar filtros paso a paso
# NOTA: fecha_firma ya fue convertida a date en el Paso 2
df_silver = df_cleaned \
    .filter(col("id_contrato").isNotNull()) \
    .filter(col("objeto_contrato").isNotNull()) \
    .filter(col("valor_contrato").isNotNull()) \
    .filter(col("valor_contrato") > 0) \
    .filter(col("fecha_firma").isNotNull())

print("‚úÖ Filtros aplicados correctamente\n")


APLICANDO FILTROS DE CALIDAD

Aplicando reglas de limpieza:
  ‚úì id_contrato no nulo
  ‚úì objeto_contrato no nulo
  ‚úì valor_contrato no nulo y > 0
  ‚úì fecha_firma no nula

‚úÖ Filtros aplicados correctamente



In [18]:
print(f"Spark version: {spark.version}")

Spark version: 3.5.1


In [19]:
# ============================================================================
# CELDA 4: CACHEAR RESULTADOS Y GENERAR REPORTE
# ============================================================================
print("="*80)
print("FINALIZANDO LIMPIEZA")
print("="*80 + "\n")

print("Cacheando datos limpios...")
df_silver = df_silver.cache()
total_silver = df_silver.count()

# Calcular estad√≠sticas
registros_descartados = total_cleaned - total_silver
pct_retenido = (total_silver / total_cleaned) * 100 if total_cleaned > 0 else 0
pct_descartado = (registros_descartados / total_cleaned) * 100 if total_cleaned > 0 else 0

print("\n" + "="*80)
print("üìä RESUMEN DE LIMPIEZA")
print("="*80)
print(f"  Registros iniciales:    {total_cleaned:,}")
print(f"  Registros finales:      {total_silver:,} ({pct_retenido:.1f}%)")
print(f"  Registros descartados:  {registros_descartados:,} ({pct_descartado:.1f}%)")
print("="*80 + "\n")

# Liberar memoria del DataFrame anterior
print("Liberando memoria del cache anterior...")
df_cleaned.unpersist()
print("‚úÖ Limpieza¬†completada\n")


FINALIZANDO LIMPIEZA

Cacheando datos limpios...

üìä RESUMEN DE LIMPIEZA
  Registros iniciales:    100,698
  Registros finales:      99,458 (98.8%)
  Registros descartados:  1,240 (1.2%)

Liberando memoria del cache anterior...
‚úÖ Limpieza¬†completada



In [20]:
# 1. Entidades
print("Top 5 entidades:")
df_silver.groupBy("entidad").count().orderBy(desc("count")).show(5, truncate=False)

# 2. Departamentos
print("\nTop 5 departamentos:")
df_silver.groupBy("departamento").count().orderBy(desc("count")).show(5, truncate=False)

# 3. Regi√≥n
print("\nDistribuci√≥n por regi√≥n:")
df_silver.groupBy("region").count().orderBy(desc("count")).show(truncate=False)

# 4. C√≥digo UNSPSC
print("\nTop 10 c√≥digos UNSPSC:")
df_silver.groupBy("codigo_unspsc").count().orderBy(desc("count")).show(10, truncate=False)

# 5. Categor√≠a UNSPSC
print("\nTop 10 categor√≠as UNSPSC:")
df_silver.groupBy("descripcion_categoria").count().orderBy(desc("count")).show(10, truncate=False)

# 6. Tipo de contrato
print("\nDistribuci√≥n por tipo de contrato:")
df_silver.groupBy("tipo_contrato").count().orderBy(desc("count")).show(truncate=False)

# 7. Estado del contrato
print("\nDistribuci√≥n del estado del contrato:")
df_silver.groupBy("estado_contrato").count().orderBy(desc("count")).show(truncate=False)

# 8. Modalidad de contrataci√≥n
print("\nTop 10 modalidades de contrataci√≥n:")
df_silver.groupBy("modalidad").count().orderBy(desc("count")).show(10, truncate=False)


Top 5 entidades:
+-------------------------------------------------+-----+
|entidad                                          |count|
+-------------------------------------------------+-----+
|MUNICIPIO DE SOACHA.                             |6356 |
|ALCALD√çA MUNICIPAL COTA                          |3988 |
|ESE MUNICIPAL DE SOACHA JULIO CESAR PE√ëALOZA*    |3822 |
|CUNDINAMARCA-ALCALDIA MUNICIPIO MOSQUERA         |3759 |
|empresa social del estado regi√≥n de salud soacha.|3152 |
+-------------------------------------------------+-----+
only showing top 5 rows


Top 5 departamentos:
+------------+-----+
|departamento|count|
+------------+-----+
|Cundinamarca|99458|
+------------+-----+


Distribuci√≥n por regi√≥n:
+--------------+-----+
|region        |count|
+--------------+-----+
|Centro-Oriente|99458|
+--------------+-----+


Top 10 c√≥digos UNSPSC:
+-------------+-----+
|codigo_unspsc|count|
+-------------+-----+
|             |50058|
|V1.80111600  |11391|
|V1.80111701  |4329 |
|V1.

In [22]:
from pyspark.sql.functions import min, max, avg, stddev, expr

# 10. Valor del contrato
print("\nEstad√≠sticas de valor_contrato:")
df_silver.select(
    min("valor_contrato").alias("min"),
    max("valor_contrato").alias("max"),
    avg("valor_contrato").alias("mean"),
    stddev("valor_contrato").alias("std")
).show()

# Percentiles
print("\nPercentiles de valor_contrato:")
df_silver.approxQuantile("valor_contrato", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)

# 11. Duraci√≥n en d√≠as
print("\nEstad√≠sticas de duracion_dias:")
df_silver.select(
    min("duracion_dias").alias("min"),
    max("duracion_dias").alias("max"),
    avg("duracion_dias").alias("mean"),
    stddev("duracion_dias").alias("std")
).show()

print("\nPercentiles de duracion_dias:")
df_silver.approxQuantile("duracion_dias", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)



Estad√≠sticas de valor_contrato:
+---+----------------+-------------------+--------------------+
|min|             max|               mean|                 std|
+---+----------------+-------------------+--------------------+
|1.0|1.50838540149E11|9.941466321590018E7|1.1521186504414532E9|
+---+----------------+-------------------+--------------------+


Percentiles de valor_contrato:

Estad√≠sticas de duracion_dias:
+---+----+-----------------+------------------+
|min| max|             mean|               std|
+---+----+-----------------+------------------+
|  0|4297|82.47422012591348|101.20091534465666|
+---+----+-----------------+------------------+


Percentiles de duracion_dias:


[0.0, 6.0, 40.0, 125.0, 4297.0]

In [24]:
print("\nTop a√±os:")
df_silver.groupBy("anno").count().orderBy(desc("anno")).show(10, truncate=False)

print("\nContratos por a√±o:")
df_silver.groupBy("anno").count().orderBy(desc("count")).show(10, truncate=False)

print("\nTop fechas de firma:")
df_silver.groupBy("fecha_firma").count().orderBy(desc("count")).show(10, truncate=False)



Top a√±os:
+----+-----+
|anno|count|
+----+-----+
|2025|68   |
|2024|95797|
|2023|3029 |
|2022|564  |
+----+-----+


Contratos por a√±o:
+----+-----+
|anno|count|
+----+-----+
|2024|95797|
|2023|3029 |
|2022|564  |
|2025|68   |
+----+-----+


Top fechas de firma:
+-----------+-----+
|fecha_firma|count|
+-----------+-----+
|2024-02-01 |1230 |
|2024-03-01 |1119 |
|2024-02-02 |860  |
|2024-02-05 |815  |
|2024-03-22 |794  |
|2024-02-09 |789  |
|2024-02-16 |744  |
|2024-02-06 |723  |
|2024-02-12 |715  |
|2024-09-02 |686  |
+-----------+-----+
only showing top 10 rows



In [25]:
# ============================================================================
# 5. GUARDAR EN DELTA LAKE
# ============================================================================

print("="*80)
print("PASO 5: GUARDAR EN DELTA LAKE")
print("="*80 + "\n")

DELTA_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"üíæ Guardando en: {DELTA_PATH}")

df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(DELTA_PATH)

print("‚úÖ Guardado exitosamente\n")

# ‚ö†Ô∏è LIBERAR todo
df_silver.unpersist()
spark.catalog.clearCache()
 

PASO 5: GUARDAR EN DELTA LAKE

üíæ Guardando en: /app/notebooks/delta_lake/silver_contracts


                                                                                

‚úÖ Guardado exitosamente



## Fase 3

In [26]:
print("\n" + "="*80)
print("PASO 1: CARGAR DATOS DESDE SILVER")
print("="*80 + "\n")

SILVER_PATH = "/app/notebooks/delta_lake/silver_contracts"
print(f"üìä Cargando: {SILVER_PATH}")

df_silver = spark.read.format("delta").load(SILVER_PATH)
df_silver = df_silver.cache()
total_records = df_silver.count()

print(f"‚úì Registros: {total_records:,}\n")


PASO 1: CARGAR DATOS DESDE SILVER

üìä Cargando: /app/notebooks/delta_lake/silver_contracts




‚úì Registros: 99,458



                                                                                

In [27]:
# ============================================================================
# PASO 2: LIMPIEZA DE TEXTO
# ============================================================================

print("="*80)
print("PASO 2: LIMPIEZA DE TEXTO")
print("="*80 + "\n")

src_chars = "√°√©√≠√≥√∫√º√±"
dst_chars = "aeiouun"

df_prepared = df_silver.withColumn(
    "objeto_limpio",
    trim(
        regexp_replace(
            regexp_replace(
                translate(
                    lower(col("objeto_contrato")),
                    src_chars,
                    dst_chars
                ),
                "[^a-z0-9\\s]", " "
            ),
            "\\s+", " "
        )
    )
).filter(length(col("objeto_limpio")) >= 10)

print(f"‚úì Registros despu√©s de limpieza: {df_prepared.count():,}\n")


PASO 2: LIMPIEZA DE TEXTO





‚úì Registros despu√©s de limpieza: 99,458



                                                                                

In [28]:
# ============================================================================
# PASO 3: TOKENIZACI√ìN Y STOPWORDS
# ============================================================================

print("="*80)
print("PASO 3: TOKENIZACI√ìN")
print("="*80 + "\n")

stopwords_es = [
    "el", "la", "de", "que", "y", "a", "en", "un", "ser", "se", "no",
    "por", "con", "su", "para", "como", "estar", "tener", "le", "lo",
    "pero", "hacer", "o", "este", "otro", "ese", "si", "ya", "ver",
    "dar", "muy", "sin", "sobre", "tambi√©n", "hasta", "a√±o", "entre",
    "del", "al", "los", "las", "uno", "una", "unos", "unas",
    "contrato", "contratos", "objeto", "prestacion", "prestaci√≥n",
    "servicio", "servicios", "suministro", "ejecucion", "ejecuci√≥n"
]

tokenizer = Tokenizer(inputCol="objeto_limpio", outputCol="palabras")
df_tokenized = tokenizer.transform(df_prepared)

remover = StopWordsRemover(
    inputCol="palabras",
    outputCol="palabras_sin_stopwords",
    stopWords=stopwords_es
)
df_filtered_words = remover.transform(df_tokenized)

# Filtrar palabras cortas
def clean_words(words):
    if not words:
        return []
    return [w for w in words if len(w) >= 3]

clean_udf = udf(clean_words, ArrayType(StringType()))

df_filtered = df_filtered_words.withColumn(
    "palabras_filtradas",
    clean_udf(col("palabras_sin_stopwords"))
).filter(size(col("palabras_filtradas")) > 0)

print(f"‚úì Registros despu√©s de filtrado: {df_filtered.count():,}\n")


PASO 3: TOKENIZACI√ìN





‚úì Registros despu√©s de filtrado: 99,458



                                                                                

In [29]:
# ============================================================================
# PASO 4: WORD2VEC
# ============================================================================

print("="*80)
print("PASO 4: WORD2VEC")
print("="*80 + "\n")

word2vec = Word2Vec(
    vectorSize=100,
    minCount=2,
    maxIter=10,
    seed=42,
    inputCol="palabras_filtradas",
    outputCol="embedding_raw"
)

print("‚è≥ Entrenando Word2Vec...")
word2vec_model = word2vec.fit(df_filtered)
df_embeddings = word2vec_model.transform(df_filtered)

vocab_size = len(word2vec_model.getVectors().collect())
print(f"‚úì Vocabulario: {vocab_size:,} palabras")
print(f"‚úì Embeddings¬†generados\n")


PASO 4: WORD2VEC

‚è≥ Entrenando Word2Vec...


                                                                                

‚úì Vocabulario: 14,465 palabras
‚úì Embeddings¬†generados



In [31]:
# ============================================================================
# PASO 5: TARGET ENCODING (SOBRE df_embeddings)
# ============================================================================
import pyspark.sql.functions as F

print("="*80)
print("PASO 5: TARGET ENCODING")
print("="*80 + "\n")

def target_encode_smooth(df, cat_col, target_col, m=50):
    """Target Encoding suavizado"""
    global_mean = df.agg(F.mean(target_col)).first()[0]
    
    stats = (
        df.groupBy(cat_col)
        .agg(
            F.mean(target_col).alias("cat_mean"),
            F.count(target_col).alias("cat_count")
        )
        .withColumn(
            f"{cat_col}_te",
            (F.col("cat_count") * F.col("cat_mean") + m * F.lit(global_mean))
            / (F.col("cat_count") + m)
        )
        .select(cat_col, f"{cat_col}_te")
    )
    
    return df.join(stats, on=cat_col, how="left")

# IMPORTANTE: Aplicar sobre df_embeddings (que tiene embedding_raw)
df_te = df_embeddings

categorical_cols = ["entidad", "codigo_unspsc", "tipo_contrato", "estado_contrato", "modalidad"]

for col_name in categorical_cols:
    print(f"üìä Codificando {col_name}...")
    df_te = target_encode_smooth(df_te, col_name, target_col="valor_contrato", m=50)
    print(f"   ‚Üí {col_name}_te creado")

print("\n‚úì Target Encoding¬†completado\n")


PASO 5: TARGET ENCODING

üìä Codificando entidad...


                                                                                

   ‚Üí entidad_te creado
üìä Codificando codigo_unspsc...


                                                                                

   ‚Üí codigo_unspsc_te creado
üìä Codificando tipo_contrato...


                                                                                

   ‚Üí tipo_contrato_te creado
üìä Codificando estado_contrato...


                                                                                

   ‚Üí estado_contrato_te creado
üìä Codificando modalidad...




   ‚Üí modalidad_te creado

‚úì Target Encoding¬†completado



                                                                                

In [33]:
# ============================================================================
# VERIFICAR VARIANZA DE VARIABLES TARGET ENCODED
# ============================================================================

print("="*80)
print("VERIFICACI√ìN: Varianza de variables Target Encoded")
print("="*80 + "\n")

# Columnas num√©ricas generadas por Target Encoding
te_columns = ["entidad_te", "codigo_unspsc_te", "tipo_contrato_te", "estado_contrato_te", "modalidad_te"]
valid_te_columns = []

print("Verificando columnas con Target Encoding (_te):\n")

for col in te_columns:
    # Verificar existencia
    if col not in df_te.columns:
        print(f"{col}:")
        print("  COLUMNA NO EXISTE (Target Encoding fall√≥)\n")
        continue
    
    # Calcular estad√≠sticas
    stats = (
        df_te
        .select(
            F.variance(col).alias("variance"),
            F.stddev(col).alias("std"),
            F.min(col).alias("min"),
            F.max(col).alias("max"),
            F.count(col).alias("count")
        )
        .first()
    )
    
    variance = stats["variance"]
    std = stats["std"]
    min_val = stats["min"]
    max_val = stats["max"]
    count = stats["count"]
    
    print(f"{col}:")
    print(f"  Count:    {count}")
    print(f"  Min:      {min_val if min_val is not None else 'None'}")
    print(f"  Max:      {max_val if max_val is not None else 'None'}")
    print(f"  Variance: {variance if variance is not None else 'None'}")
    print(f"  Std:      {std if std is not None else 'None'}")
    
    # Criterio de validez
    if variance is None or variance == 0 or std is None or std == 0:
        print("  ELIMINAR (sin varianza)\n")
    else:
        print("  CONSERVAR (tiene varianza)\n")
        valid_te_columns.append(col)


print(f"Resultado: {len(valid_te_columns)}/{len(te_columns)} variables v√°lidas\n")

# Si ninguna variable target-encoded tiene varianza
if len(valid_te_columns) == 0:
    print("ADVERTENCIA: Ninguna variable tiene varianza.")
    print("Posibles causas:")
    print("1. Target Encoding fall√≥ (todas las categor√≠as tienen el mismo promedio)")
    print("2. Solo hay una categor√≠a √∫nica en la variable")
    print("3. Los datos est√°n muy balanceados o homog√©neos\n")
    
    print("Mostrando muestra de datos para diagn√≥stico:")
    df_te.select(
        "entidad", "entidad_te",
        "modalidad", "modalidad_te",
        "valor_contrato"
    ).show(10, truncate=False)
    
    print("\nValores √∫nicos por variable original:")
    for col_orig in ["entidad", "codigo_unspsc", "departamento", "modalidad"]:
        unique_count = df_te.select(col_orig).distinct().count()
        print(f"- {col_orig}: {unique_count} valores √∫nicos\n")

else:
    print(f"Variables v√°lidas: {valid_te_columns}\n")


VERIFICACI√ìN: Varianza de variables Target Encoded

Verificando columnas con Target Encoding (_te):



                                                                                

entidad_te:
  Count:    99458
  Min:      14406872.390959457
  Max:      1105320127.1465836
  Variance: 9594105497661858.0
  Std:      97949504.83622599
  CONSERVAR (tiene varianza)



                                                                                

codigo_unspsc_te:
  Count:    99458
  Min:      17057530.61269573
  Max:      4494832645.981172
  Variance: 1.0081318874149584e+16
  Std:      100405771.11973985
  CONSERVAR (tiene varianza)



                                                                                

tipo_contrato_te:
  Count:    99458
  Min:      60305043.46804067
  Max:      6341257341.517739
  Variance: 2.160509100008716e+16
  Std:      146986703.48057732
  CONSERVAR (tiene varianza)



                                                                                

estado_contrato_te:
  Count:    99458
  Min:      28069428.3107082
  Max:      2146138087.3567228
  Variance: 1.1824545100141602e+16
  Std:      108740724.20276408
  CONSERVAR (tiene varianza)





modalidad_te:
  Count:    99458
  Min:      31966280.59774187
  Max:      3863781158.8434043
  Variance: 1.201264388683272e+17
  Std:      346592612.25295496
  CONSERVAR (tiene varianza)

Resultado: 5/5 variables v√°lidas

Variables v√°lidas: ['entidad_te', 'codigo_unspsc_te', 'tipo_contrato_te', 'estado_contrato_te', 'modalidad_te']



                                                                                

In [34]:
# ============================================================================
# PASO 6: ENSAMBLAR FEATURES
# ============================================================================

print("="*80)
print("PASO 6: ENSAMBLAR FEATURES")
print("="*80 + "\n")

# Imputar duracion_dias
df_te = df_te.fillna({"duracion_dias": 0})

# Verificar duracion_dias
duracion_variance = df_te.select(F.variance("duracion_dias")).first()[0]
if duracion_variance and duracion_variance > 0:
    valid_te_columns.append("duracion_dias")
    print("‚úÖ duracion_dias incluida\n")

input_cols = ["embedding_raw"] + valid_te_columns

print("üìä Features a ensamblar:")
for col in input_cols:
    print(f"  ‚úì {col}")

assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features_raw",
    handleInvalid="skip"
)

df_assembled = assembler.transform(df_te)
feature_dim = len(df_assembled.select("features_raw").first()[0])

print(f"\n‚úì Dimensi√≥n: {feature_dim} dims\n")
 

PASO 6: ENSAMBLAR FEATURES



                                                                                

‚úÖ duracion_dias incluida

üìä Features a ensamblar:
  ‚úì embedding_raw
  ‚úì entidad_te
  ‚úì codigo_unspsc_te
  ‚úì tipo_contrato_te
  ‚úì estado_contrato_te
  ‚úì modalidad_te
  ‚úì duracion_dias


                                                                                


‚úì Dimensi√≥n: 106 dims



In [35]:
# ============================================================================
# PASO 7: NORMALIZAR
# ============================================================================

print("="*80)
print("PASO 7: NORMALIZACI√ìN")
print("="*80 + "\n")

scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features_scaled",
    withStd=True,
    withMean=True
)

scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

print("‚úì Features normalizadas\n")
 

PASO 7: NORMALIZACI√ìN





‚úì Features normalizadas



                                                                                

In [36]:
# ============================================================================
# PASO 8: AN√ÅLISIS DE CORRELACI√ìN
# ============================================================================

print("="*80)
print("PASO 8: AN√ÅLISIS DE CORRELACI√ìN")
print("="*80 + "\n")

# 8.1: Correlaci√≥n de variables categ√≥ricas
print("üìä Correlaci√≥n de variables categ√≥ricas:")

cat_correlations = {}

for var in valid_te_columns:
    try:
        assembler_pair = VectorAssembler(
            inputCols=[var, "valor_contrato"],
            outputCol="features_pair",
            handleInvalid="skip"
        )
        
        df_pair = assembler_pair.transform(df_scaled)
        corr_matrix = Correlation.corr(df_pair, "features_pair", "pearson").collect()[0][0]
        corr_value = corr_matrix.toArray()[0, 1]
        
        cat_correlations[var] = corr_value
        print(f"  {var:<25} ‚Üí {corr_value:>7.4f}")
    except Exception as e:
        print(f"  {var:<25} ‚Üí ERROR")

print()

# 8.2: Correlaci√≥n de embeddings (muestra)
print("üìä Correlaci√≥n de embeddings (muestra 10%):")

SAMPLE_FRACTION = 0.1
df_sample = df_scaled.sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=42)
sample_size = df_sample.count()
print(f"  Muestra: {sample_size:,} registros\n")

data_sample = df_sample.select("embedding_raw", "valor_contrato").collect()
embeddings_array = np.array([row["embedding_raw"].toArray() for row in data_sample])
target_array = np.array([row["valor_contrato"] for row in data_sample])

embedding_correlations = {}

for i in range(100):
    correlation = np.corrcoef(embeddings_array[:, i], target_array)[0, 1]
    embedding_correlations[f"emb_{i}"] = correlation
    
    if (i + 1) % 25 == 0:
        print(f"  Procesadas {i + 1}/100 dimensiones...")

print("\n‚úì Correlaciones calculadas\n")

# Consolidar
all_correlations = {**cat_correlations, **embedding_correlations}
valid_correlations = {k: v for k, v in all_correlations.items() if not np.isnan(v)}

 

PASO 8: AN√ÅLISIS DE CORRELACI√ìN

üìä Correlaci√≥n de variables categ√≥ricas:


                                                                                

  entidad_te                ‚Üí  0.0990


                                                                                

  codigo_unspsc_te          ‚Üí  0.3300


                                                                                

  tipo_contrato_te          ‚Üí  0.2462


                                                                                

  estado_contrato_te        ‚Üí  0.1114


                                                                                

  modalidad_te              ‚Üí  0.3278


                                                                                

  duracion_dias             ‚Üí  0.0268

üìä Correlaci√≥n de embeddings (muestra 10%):


                                                                                

  Muestra: 10,059 registros



                                                                                

  Procesadas 25/100 dimensiones...
  Procesadas 50/100 dimensiones...
  Procesadas 75/100 dimensiones...
  Procesadas 100/100 dimensiones...

‚úì Correlaciones calculadas



In [38]:
# ============================================================================
# PASO 9: SELECCI√ìN DE VARIABLES
# ============================================================================
import builtins

print("="*80)
print("PASO 9: SELECCI√ìN DE VARIABLES")
print("="*80 + "\n")

THRESHOLD = 0.05

selected_vars = {var: corr for var, corr in valid_correlations.items() 
                 if builtins.abs(corr) >= THRESHOLD}

print(f"üìä Umbral: |r| >= {THRESHOLD}")
print(f"  ‚úÖ Seleccionadas: {len(selected_vars)}")
print(f"  ‚ùå Rechazadas: {len(valid_correlations) - len(selected_vars)}")

selected_cat = [v for v in selected_vars.keys() if not v.startswith("emb_")]
selected_emb = [v for v in selected_vars.keys() if v.startswith("emb_")]

print(f"\n  Categ√≥ricas: {len(selected_cat)}")
print(f"  Embeddings: {len(selected_emb)}")

print("\nüìä Top 10 variables:")
sorted_vars = sorted(selected_vars.items(), key=lambda x: builtins.abs(x[1]), reverse=True)
for i, (var, corr) in enumerate(sorted_vars[:10], 1):
    print(f"  {i:2d}. {var:<25} ‚Üí {corr:>7.4f}")
 

PASO 9: SELECCI√ìN DE VARIABLES

üìä Umbral: |r| >= 0.05
  ‚úÖ Seleccionadas: 19
  ‚ùå Rechazadas: 87

  Categ√≥ricas: 5
  Embeddings: 14

üìä Top 10 variables:
   1. codigo_unspsc_te          ‚Üí  0.3300
   2. modalidad_te              ‚Üí  0.3278
   3. tipo_contrato_te          ‚Üí  0.2462
   4. estado_contrato_te        ‚Üí  0.1114
   5. entidad_te                ‚Üí  0.0990
   6. emb_37                    ‚Üí  0.0734
   7. emb_68                    ‚Üí  0.0721
   8. emb_73                    ‚Üí  0.0690
   9. emb_92                    ‚Üí  0.0681
  10. emb_75                    ‚Üí  0.0678


In [40]:
# ============================================================================
# PASO 10: FILTRAR EMBEDDING
# ============================================================================
from pyspark.ml.linalg import Vectors, VectorUDT

print("\n" + "="*80)
print("PASO 10: FILTRAR FEATURES")
print("="*80 + "\n")

if len(selected_emb) > 0:
    selected_emb_indices = sorted([int(var.split("_")[1]) for var in selected_emb])
    
    def filter_embedding_udf(indices):
        def filter_func(vector):
            if vector is None:
                return Vectors.dense([0.0] * len(indices))
            return Vectors.dense([float(vector[i]) for i in indices])
        return F.udf(filter_func, VectorUDT())
    
    df_filtered = df_scaled.withColumn(
        "embedding_filtered",
        filter_embedding_udf(selected_emb_indices)(F.col("embedding_raw"))
    )
    
    embedding_dim = len(selected_emb_indices)
    input_cols_filtered = ["embedding_filtered"] + selected_cat
else:
    df_filtered = df_scaled
    embedding_dim = 0
    input_cols_filtered = selected_cat

total_selected = embedding_dim + len(selected_cat)

print(f"üìä Features seleccionadas:")
print(f"  - Embedding: {embedding_dim} dims")
print(f"  - Categ√≥ricas: {len(selected_cat)} dims")
print(f"  - TOTAL: {total_selected} dims")
print(f"  - Reducci√≥n: {(1 - total_selected/feature_dim)*100:.1f}%\n")

assembler_filtered = VectorAssembler(
    inputCols=input_cols_filtered,
    outputCol="features_selected",
    handleInvalid="skip"
)

df_assembled_filtered = assembler_filtered.transform(df_filtered)
selected_dim = len(df_assembled_filtered.select("features_selected").first()[0])

print("‚úì Features filtradas ensambladas\n")

 


PASO 10: FILTRAR FEATURES

üìä Features seleccionadas:
  - Embedding: 14 dims
  - Categ√≥ricas: 5 dims
  - TOTAL: 19 dims
  - Reducci√≥n: 82.1%





‚úì Features filtradas ensambladas



                                                                                

In [41]:
# ============================================================================
# PASO 11: NORMALIZAR FILTRADAS
# ============================================================================

print("="*80)
print("PASO 11: NORMALIZAR FEATURES FILTRADAS")
print("="*80 + "\n")

scaler_filtered = StandardScaler(
    inputCol="features_selected",
    outputCol="features_scaled_filtered",
    withStd=True,
    withMean=True
)

scaler_model_filtered = scaler_filtered.fit(df_assembled_filtered)
df_scaled_filtered = scaler_model_filtered.transform(df_assembled_filtered)

print("‚úì Normalizadas\n")
 

PASO 11: NORMALIZAR FEATURES FILTRADAS



                                                                                

‚úì Normalizadas



In [90]:
# ============================================================================
# PASO 9: PCA
# ============================================================================

print("="*80)
print("PASO 9: PCA SOBRE FEATURES FILTRADAS")
print("="*80 + "\n")

pca = PCA(
    k=selected_dim,
    inputCol="features_scaled_filtered",
    outputCol="features_pca"
)

print("‚è≥ Entrenando PCA...")
pca_model = pca.fit(df_scaled_filtered)
df_pca = pca_model.transform(df_scaled_filtered)
print("‚úì PCA aplicado\n")

# Analizar varianza
explained_variance = pca_model.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)

n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

print(f"üìä Componentes para 95% varianza: {n_components_95}")

print("\nüìä Varianza explicada:")
for i in range(builtins.min(10, selected_dim)):
    print(f"  - PC{i+1}: {explained_variance[i]:.2%}")

print("\nüìä Varianza acumulada:")
thresholds = [5, 10, 20, 30, 50, selected_dim]
for i in thresholds:
    if i <= len(cumulative_variance):
        print(f"  - {i:3d} componentes: {cumulative_variance[i-1]:.2%}")


PASO 9: PCA SOBRE FEATURES FILTRADAS

‚è≥ Entrenando PCA...




‚úì PCA aplicado

üìä Componentes para 95% varianza: 23

üìä Varianza explicada:
  - PC1: 13.02%
  - PC2: 9.70%
  - PC3: 6.45%
  - PC4: 5.84%
  - PC5: 5.48%
  - PC6: 5.11%
  - PC7: 4.53%
  - PC8: 4.37%
  - PC9: 4.10%
  - PC10: 3.83%

üìä Varianza acumulada:
  -   5 componentes: 40.49%
  -  10 componentes: 62.44%
  -  20 componentes: 90.21%
  -  26 componentes: 100.00%


                                                                                

In [42]:
# ============================================================================
# PASO 12: PCA
# ============================================================================

print("="*80)
print("PASO 12: PCA")
print("="*80 + "\n")

pca = PCA(
    k=selected_dim,
    inputCol="features_scaled_filtered",
    outputCol="features_pca"
)

print("‚è≥ Entrenando PCA...")
pca_model = pca.fit(df_scaled_filtered)
df_pca = pca_model.transform(df_scaled_filtered)

explained_variance = pca_model.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

print(f"‚úì PCA aplicado\n")
print(f"üìä Componentes para 95% varianza: {n_components_95}")
print(f"üìä PC1 varianza: {explained_variance[0]:.2%}")
print(f"üìä Top 10 varianza: {cumulative_variance[9]:.2%}\n")
 

PASO 12: PCA

‚è≥ Entrenando PCA...


                                                                                

‚úì PCA aplicado

üìä Componentes para 95% varianza: 18
üìä PC1 varianza: 12.50%
üìä Top 10 varianza: 68.46%



In [43]:
# ============================================================================
# PASO 13: DATASET FINAL
# ============================================================================

print("="*80)
print("PASO 13: DATASET FINAL")
print("="*80 + "\n")

df_final = df_pca.select(
    "id_contrato",
    "objeto_contrato",
    "entidad",
    "departamento",
    "region",
    "codigo_unspsc",
    "valor_contrato",
    "duracion_dias",
    "fecha_firma",
    "features_pca",
    "features_scaled_filtered",
    "features_scaled"
)

df_final = df_final.cache()
total_final = df_final.count()

print(f"‚úì Dataset final: {total_final:,} registros\n")
print("üìä Opciones de features:")
print(f"  1. features_pca: {n_components_95} dims")
print(f"  2. features_scaled_filtered: {selected_dim} dims")
print(f"  3. features_scaled: {feature_dim} dims\n")

PASO 13: DATASET FINAL





‚úì Dataset final: 99,458 registros

üìä Opciones de features:
  1. features_pca: 18 dims
  2. features_scaled_filtered: 19 dims
  3. features_scaled: 106 dims



                                                                                

In [82]:
# ============================================================================
# 10. AN√ÅLISIS DE CORRELACIONES
# ============================================================================

print("\n" + "="*80)
print("PASO 10: AN√ÅLISIS DE CORRELACIONES")
print("="*80 + "\n")

print("üìä Calculando correlaciones de features PCA con valor_contrato...")

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf as spark_udf, col

# Funci√≥n para agregar target al vector PCA
def add_target_to_vector(features, target):
    return Vectors.dense(list(features.toArray()) + [float(target)])

add_target_udf = spark_udf(add_target_to_vector, VectorUDT())

df_corr = df_final.withColumn(
    "features_with_target",
    add_target_udf(col("features_pca"), col("valor_contrato"))
)

# Calcular matriz de correlaci√≥n
print("‚è≥ Calculando matriz de correlaci√≥n...")
correlation_matrix = Correlation.corr(df_corr, "features_with_target", "pearson")

# Extraer matriz como array numpy
corr_array = correlation_matrix.collect()[0][0].toArray()

# Correlaciones del target (√∫ltima fila, excepto √∫ltimo elemento)
target_correlations = corr_array[-1, :-1]

print("\nüìä Correlaciones de PCA con valor_contrato:")
max_idx = np.argmax(np.abs(target_correlations))

print(f"   - Componente m√°s correlacionado: PC{max_idx+1} ({target_correlations[max_idx]:.3f})")
print(f"   - Top 5 componentes:")

top_5_indices = np.argsort(np.abs(target_correlations))[-5:][::-1]
for idx in top_5_indices:
    print(f"     PC{idx+1}: {target_correlations[idx]:.3f}")



PASO 10: AN√ÅLISIS DE CORRELACIONES

üìä Calculando correlaciones de features PCA con valor_contrato...
‚è≥ Calculando matriz de correlaci√≥n...





üìä Correlaciones de PCA con valor_contrato:
   - Componente m√°s correlacionado: PC103 (nan)
   - Top 5 componentes:
     PC105: nan
     PC103: nan
     PC104: nan
     PC24: -0.149
     PC29: -0.120


                                                                                

In [45]:
# ============================================================================
# PASO 14: GUARDAR
# ============================================================================

GOLD_PATH = "/app/notebooks/delta_lake/gold_features_v2"
MODELS_PATH = "/app/notebooks/models_v2"

print("="*80)
print("GUARDANDO")
print("="*80 + "\n")

print(f"üìä Dataset: {GOLD_PATH}")
df_final.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(GOLD_PATH)

print(f"üìä Modelos: {MODELS_PATH}")
word2vec_model.save(f"{MODELS_PATH}/word2vec_model")
pca_model.save(f"{MODELS_PATH}/pca_model")
scaler_model.save(f"{MODELS_PATH}/scaler_model")
scaler_model_filtered.save(f"{MODELS_PATH}/scaler_filtered_model")

print("\n‚úÖ FASE 3 COMPLETADA")

GUARDANDO

üìä Dataset: /app/notebooks/delta_lake/gold_features_v2


                                                                                

üìä Modelos: /app/notebooks/models_v2


                                                                                


‚úÖ FASE 3 COMPLETADA


## Fase 4

In [4]:
# ============================================================================
# PASO 1: CARGAR DATOS DESDE GOLD
# ============================================================================

print("="*80)
print("PASO 1: CARGA DE DATOS")
print("="*80 + "\n")

GOLD_PATH = "/app/notebooks/delta_lake/gold_features_v2"

print(f"üìä Cargando: {GOLD_PATH}")
df_gold = spark.read.format("delta").load(GOLD_PATH)

df_gold = df_gold.cache()
total_records = df_gold.count()

print(f"‚úì Registros cargados: {total_records:,}")
print(f"‚úì Columnas: {len(df_gold.columns)}\n")

print("üìä Features disponibles:")
feature_cols = [c for c in df_gold.columns if "features" in c]

for i, c in enumerate(feature_cols, 1):
    sample_dim = len(df_gold.select(c).first()[0])
    print(f"  {i}. {c}: {sample_dim} dims")

print()


PASO 1: CARGA DE DATOS

üìä Cargando: /app/notebooks/delta_lake/gold_features_v2


                                                                                

‚úì Registros cargados: 99,458
‚úì Columnas: 12

üìä Features disponibles:
  1. features_pca: 19 dims
  2. features_scaled_filtered: 19 dims
  3. features_scaled: 106 dims



In [11]:
# ============================================================================
# PASO 2: SELECCI√ìN DE FEATURES Y TARGET
# ============================================================================

print("="*80)
print("PASO 2: PREPARACI√ìN DE DATOS")
print("="*80 + "\n")

FEATURE_COL = "features_scaled"
TARGET_COL = "valor_contrato"

print(f"üìä Features seleccionadas: {FEATURE_COL}")
print(f"üìä Target: {TARGET_COL}\n")

df_model = df_gold.select(
    col(FEATURE_COL).alias("features"),
    col(TARGET_COL).alias("label"),
    "id_contrato",
    "fecha_firma",
    "entidad"
).filter(
    col("features").isNotNull() &
    col("label").isNotNull() &
    (col("label") > 0)
)

df_model = df_model.cache()
total_model = df_model.count()

print(f"‚úì Registros v√°lidos: {total_model:,}")
print(f"‚úì Descartados: {total_records - total_model:,}\n")

stats = df_model.select(
    min("label").alias("min"),
    max("label").alias("max"),
    avg("label").alias("mean"),
    stddev("label").alias("std")
).first()

print("üìä Estad√≠sticas de valor_contrato:")
print(f"  Min:  ${stats['min']:,.2f}")
print(f"  Max:  ${stats['max']:,.2f}")
print(f"  Mean: ${stats['mean']:,.2f}")
print(f"  Std:  ${stats['std']:,.2f}\n")


PASO 2: PREPARACI√ìN DE DATOS

üìä Features seleccionadas: features_scaled
üìä Target: valor_contrato

‚úì Registros v√°lidos: 99,458
‚úì Descartados: 0

üìä Estad√≠sticas de valor_contrato:
  Min:  $1.00
  Max:  $150,838,540,149.00
  Mean: $99,414,663.22
  Std:  $1,152,118,650.44



In [12]:
# ============================================================================
# PASO 3: TRAIN/TEST SPLIT
# ============================================================================
from datetime import datetime

print("="*80)
print("PASO 3: DIVISI√ìN TRAIN/TEST (CORREGIDO PARA FECHAS)")
print("="*80 + "\n")

# Convertimos la fecha a n√∫mero (timestamp largo)
df_temp = df_model.withColumn(
    "fecha_num",
    col("fecha_firma").cast("timestamp").cast("long")
)

# Obtenemos el percentil 80
q = df_temp.approxQuantile("fecha_num", [0.8], 0.01)
split_ts = q[0]

# Convertimos el n√∫mero a fecha
split_date = datetime.utcfromtimestamp(split_ts)

print(f"üìÖ Fecha de corte (percentil 80): {split_date}\n")

# Dividimos el dataset usando la fecha original
train_data = df_model.filter(col("fecha_firma") <= split_date).cache()
test_data  = df_model.filter(col("fecha_firma") > split_date).cache()

print(f"Train: {train_data.count():,}")
print(f"Test : {test_data.count():,}\n")


PASO 3: DIVISI√ìN TRAIN/TEST (CORREGIDO PARA FECHAS)

üìÖ Fecha de corte (percentil 80): 2024-09-27 00:00:00

Train: 79,298
Test : 20,160



In [13]:
# ============================================================================
# PASO 4: REGRESI√ìN LINEAL
# ============================================================================
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, stddev
print("="*80)
print("PASO 4: MODELO BASELINE - REGRESI√ìN LINEAL")
print("="*80 + "\n")

lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
    regParam=0.1,
    elasticNetParam=0.8
)

lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# Evaluadores
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator_mae  = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")

lr_rmse = evaluator_rmse.evaluate(lr_predictions)
lr_r2   = evaluator_r2.evaluate(lr_predictions)
lr_mae  = evaluator_mae.evaluate(lr_predictions)

# ===========================
# C√°lculo de sigma (œÉ)
# ===========================

lr_with_error = lr_predictions.withColumn(
    "error", col("prediction") - col("label")
)

sigma = lr_with_error.select(stddev("error").alias("sigma")).first()["sigma"]

print("üìä RESULTADOS - LINEAR REGRESSION:")
print(f"  RMSE:  ${lr_rmse:,.2f}")
print(f"  MAE:   ${lr_mae:,.2f}")
print(f"  R¬≤:    {lr_r2:.4f}")
print(f"  Sigma: ${sigma:,.2f}\n")


PASO 4: MODELO BASELINE - REGRESI√ìN LINEAL

üìä RESULTADOS - LINEAR REGRESSION:
  RMSE:  $1,535,092,233.12
  MAE:   $257,007,310.67
  R¬≤:    0.1446
  Sigma: $1,535,117,975.89



In [22]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql.functions import col, min, max, avg, stddev
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from datetime import datetime
import mlflow
import mlflow.spark
import json

# ============================================================================
# PASO 1: CARGAR DATOS DESDE GOLD
# ============================================================================

print("="*80)
print("PASO 1: CARGA DE DATOS")
print("="*80 + "\n")

GOLD_PATH = "/app/notebooks/delta_lake/gold_features_v2"

print(f"üìä Cargando: {GOLD_PATH}")
df_gold = spark.read.format("delta").load(GOLD_PATH)

df_gold = df_gold.cache()
total_records = df_gold.count()

print(f"‚úì Registros cargados: {total_records:,}")
print(f"‚úì Columnas: {len(df_gold.columns)}\n")

print("üìä Features disponibles:")
feature_cols = [c for c in df_gold.columns if "features" in c]

for i, c in enumerate(feature_cols, 1):
    sample_dim = len(df_gold.select(c).first()[0])
    print(f"  {i}. {c}: {sample_dim} dims")

print()

# ============================================================================
# PASO 2: SELECCI√ìN DE FEATURES Y TARGET
# ============================================================================

print("="*80)
print("PASO 2: PREPARACI√ìN DE DATOS")
print("="*80 + "\n")

FEATURE_COL = "features_scaled_filtered"
TARGET_COL = "valor_contrato"

print(f"üìä Features seleccionadas: {FEATURE_COL}")
print(f"üìä Target: {TARGET_COL}\n")

df_model = df_gold.select(
    col(FEATURE_COL).alias("features"),
    col(TARGET_COL).alias("label"),
    "id_contrato",
    "fecha_firma",
    "entidad"
).filter(
    col("features").isNotNull() &
    col("label").isNotNull() &
    (col("label") > 0)
)

df_model = df_model.cache()
total_model = df_model.count()

print(f"‚úì Registros v√°lidos: {total_model:,}")
print(f"‚úì Descartados: {total_records - total_model:,}\n")

stats = df_model.select(
    min("label").alias("min"),
    max("label").alias("max"),
    avg("label").alias("mean"),
    stddev("label").alias("std")
).first()

print("üìä Estad√≠sticas de valor_contrato:")
print(f"  Min:  ${stats['min']:,.2f}")
print(f"  Max:  ${stats['max']:,.2f}")
print(f"  Mean: ${stats['mean']:,.2f}")
print(f"  Std:  ${stats['std']:,.2f}\n")


# ============================================================================
# PASO 3: TRAIN/TEST SPLIT (CORREGIDO)
# ============================================================================

print("="*80)
print("PASO 3: DIVISI√ìN TRAIN/TEST (CORREGIDO PARA FECHAS)")
print("="*80 + "\n")

# Convertimos fecha a n√∫mero para calcular percentil
df_temp = df_model.withColumn(
    "fecha_num",
    col("fecha_firma").cast("timestamp").cast("long")
)

q = df_temp.approxQuantile("fecha_num", [0.8], 0.01)
split_ts = q[0]
split_date = datetime.utcfromtimestamp(split_ts)

print(f"üìÖ Fecha de corte (percentil 80): {split_date}\n")

train_data = df_model.filter(col("fecha_firma") <= split_date).cache()
test_data  = df_model.filter(col("fecha_firma") > split_date).cache()

print(f"Train: {train_data.count():,}")
print(f"Test : {test_data.count():,}\n")

# Guardar tama√±os
train_size = train_data.count()
test_size  = test_data.count()


# ============================================================================
# PASO 4: REGRESI√ìN LINEAL
# ============================================================================

print("="*80)
print("PASO 4: MODELO BASELINE - REGRESI√ìN LINEAL")
print("="*80 + "\n")

lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
    regParam=0.1,
    elasticNetParam=0.8
)

lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# Evaluadores
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator_mae  = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")

lr_rmse = evaluator_rmse.evaluate(lr_predictions)
lr_r2   = evaluator_r2.evaluate(lr_predictions)
lr_mae  = evaluator_mae.evaluate(lr_predictions)

# Sigma
lr_with_error = lr_predictions.withColumn("error", col("prediction") - col("label"))
sigma = lr_with_error.select(stddev("error").alias("sigma")).first()["sigma"]

print("üìä RESULTADOS - LINEAR REGRESSION:")
print(f"  RMSE:  ${lr_rmse:,.2f}")
print(f"  MAE:   ${lr_mae:,.2f}")
print(f"  R¬≤:    {lr_r2:.4f}")
print(f"  Sigma: ${sigma:,.2f}\n")


# ============================================================================
# PASO 9: GUARDAR MODELO EN MLFLOW
# ============================================================================

print("="*80)
print("PASO 9: GUARDAR MODELO CON MLFLOW")
print("="*80 + "\n")

MODELS_PATH = "/app/notebooks/models_v2"
mlflow.set_tracking_uri("http://172.17.0.1:5000")
mlflow.set_experiment("contract_value_regression")

RANDOM_SEED = 42

with mlflow.start_run(run_name="linear_regression_v1"):

    print("üìä Registrando par√°metros...")
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("regParam", 0.1)
    mlflow.log_param("elasticNetParam", 0.8)
    mlflow.log_param("maxIter", 100)
    mlflow.log_param("train_size", train_size)
    mlflow.log_param("test_size", test_size)
    mlflow.log_param("random_seed", RANDOM_SEED)
    print("‚úì Par√°metros registrados\n")

    print("üìä Registrando m√©tricas...")
    mlflow.log_metric("test_rmse", lr_rmse)
    mlflow.log_metric("test_r2", lr_r2)
    mlflow.log_metric("test_mae", lr_mae)
    mlflow.log_metric("sigma", sigma)
    mlflow.log_metric("anomaly_threshold", 2.8 * sigma)
    print("‚úì M√©tricas registradas\n")

    print("üìä Guardando modelo en MLflow...")
    mlflow.spark.log_model(
        spark_model=lr_model,
        artifact_path="linear_regression_model",
        registered_model_name="contract_value_predictor"
    )
    print("‚úì Modelo registrado\n")

    # Guardar sigma como artifact
    sigma_path_temp = "/tmp/sigma.txt"
    with open(sigma_path_temp, "w") as f:
        f.write(str(sigma))
    mlflow.log_artifact(sigma_path_temp, "model_artifacts")

    # Guardar metadatos
    metadata = {
        "model_type": "LinearRegression",
        "regParam": 0.1,
        "elasticNetParam": 0.8,
        "maxIter": 100,
        "train_size": train_size,
        "test_size": test_size,
        "test_rmse": float(lr_rmse),
        "test_r2": float(lr_r2),
        "test_mae": float(lr_mae),
        "sigma": float(sigma),
        "anomaly_threshold": float(2.8 * sigma)
    }

    metadata_path_temp = "/tmp/model_metadata.json"
    with open(metadata_path_temp, "w") as f:
        json.dump(metadata, f, indent=2)

    mlflow.log_artifact(metadata_path_temp, "model_artifacts")

    run_id = mlflow.active_run().info.run_id
    print(f"üìä MLflow Run ID: {run_id}\n")

print("‚úÖ Modelo guardado exitosamente en MLflow\n")


PASO 1: CARGA DE DATOS

üìä Cargando: /app/notebooks/delta_lake/gold_features_v2
‚úì Registros cargados: 99,458
‚úì Columnas: 12

üìä Features disponibles:
  1. features_pca: 19 dims
  2. features_scaled_filtered: 19 dims
  3. features_scaled: 106 dims

PASO 2: PREPARACI√ìN DE DATOS

üìä Features seleccionadas: features_pca
üìä Target: valor_contrato

‚úì Registros v√°lidos: 99,458
‚úì Descartados: 0

üìä Estad√≠sticas de valor_contrato:
  Min:  $1.00
  Max:  $150,838,540,149.00
  Mean: $99,414,663.22
  Std:  $1,152,118,650.44

PASO 3: DIVISI√ìN TRAIN/TEST (CORREGIDO PARA FECHAS)

üìÖ Fecha de corte (percentil 80): 2024-09-27 00:00:00

Train: 79,298
Test : 20,160

PASO 4: MODELO BASELINE - REGRESI√ìN LINEAL



2025/12/11 00:49:38 INFO mlflow.tracking.fluent: Experiment with name 'contract_value_regression' does not exist. Creating a new experiment.


üìä RESULTADOS - LINEAR REGRESSION:
  RMSE:  $1,537,548,264.71
  MAE:   $244,431,490.36
  R¬≤:    0.1419
  Sigma: $1,537,484,096.27

PASO 9: GUARDAR MODELO CON MLFLOW

üìä Registrando par√°metros...
‚úì Par√°metros registrados

üìä Registrando m√©tricas...
‚úì M√©tricas registradas

üìä Guardando modelo en MLflow...


Successfully registered model 'contract_value_predictor'.
2025/12/11 00:50:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: contract_value_predictor, version 1
Created version '1' of model 'contract_value_predictor'.


‚úì Modelo registrado

üìä MLflow Run ID: 793a87e9ca594961b0ca30aff71eb8cc

üèÉ View run linear_regression_v1 at: http://172.17.0.1:5000/#/experiments/2/runs/793a87e9ca594961b0ca30aff71eb8cc
üß™ View experiment at: http://172.17.0.1:5000/#/experiments/2
‚úÖ Modelo guardado exitosamente en MLflow

