## Fase 2

In [1]:
# ============================================================================
# FASE 2 - OPTIMIZADO PARA SPARK 3.5.1 + DELTA LAKE 3.0
# ============================================================================

# PASO 0: REINICIAR SPARK CON VERSIONES CORRECTAS
try:
    spark.stop()
except:
    pass

import time
time.sleep(3)

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, lower, regexp_replace, translate, length, trim

from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, Word2Vec, 
    StringIndexer, OneHotEncoder, VectorAssembler,
    StandardScaler, PCA
)
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
import numpy as np

spark = (
    SparkSession.builder
    .appName("Bronze_to_Silver_Optimized")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
            "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
print(f" Spark {spark.version} iniciado\n")


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ae27b7c2-6798-4cc7-89f8-5ef358020b28;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found io.delta#delta-spark_

 Spark 3.5.1 iniciado



In [13]:
# ============================================================================
# 1. LECTURA DE KAFKA
# ============================================================================

print("="*80)
print("PASO 1: LECTURA DE KAFKA")
print("="*80 + "\n")

contract_schema = StructType([
    StructField("id_contrato", StringType()),
    StructField("objeto_contrato", StringType()),
    StructField("entidad", StringType()),
    StructField("departamento", StringType()),
    StructField("municipio", StringType()),
    StructField("region", StringType()),
    StructField("codigo_unspsc", StringType()),
    StructField("descripcion_categoria", StringType()),
    StructField("valor_contrato", DoubleType()),
    StructField("duracion_dias", IntegerType()),
    StructField("fecha_firma", StringType()),
    StructField("tipo_contrato", StringType()),
    StructField("estado_contrato", StringType()),
    StructField("modalidad", StringType()),
    StructField("anno", IntegerType()),
    StructField("id_interno_sistema", StringType()),
    StructField("campo_vacio", StringType()),
    StructField("constante_1", StringType()),
    StructField("constante_2", IntegerType()),
    StructField("duplicate_id", StringType()),
    StructField("timestamp_carga", StringType())
])

print("Leyendo Kafka...")

df_kafka = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "contratos-publicos") \
    .option("startingOffsets", "earliest") \
    .load()

df_bronze = df_kafka.select(
    from_json(col("value").cast("string"), contract_schema).alias("data")
).select("data.*")

df_bronze = df_bronze.cache()
total_kafka = df_bronze.count()

print(f" Mensajes: {total_kafka:,}\n")



PASO 1: LECTURA DE KAFKA

Leyendo Kafka...


[Stage 52:>                                                         (0 + 1) / 1]

 Mensajes: 100,698



                                                                                

In [14]:
# ============================================================================
# 2. ELIMINAR REDUNDANTES Y PREPARAR DATOS
# ============================================================================
print("="*80)
print("PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS")
print("="*80 + "\n")

# Columnas redundantes a eliminar
redundant_columns = [
    "id_interno_sistema",
    "campo_vacio",
    "constante_1",
    "constante_2",
    "duplicate_id",
    "timestamp_carga"
]

print(f" Eliminando {len(redundant_columns)} columnas redundantes...")
df_cleaned = df_bronze.drop(*redundant_columns)

print(f"Columnas restantes: {len(df_cleaned.columns)}")
print()

print(" Preparando campo fecha_firma...")
print("   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)")
print("   Convirtiendo a: date (2024-01-04)")

df_cleaned = (
    df_cleaned
    .withColumn("fecha_firma_temp", to_timestamp(col("fecha_firma")))
    .withColumn("fecha_firma", to_date(col("fecha_firma_temp")))
    .drop("fecha_firma_temp")
)

print(" Fecha convertida correctamente\n")

# Liberar bronze ahora que ya no lo necesitamos
print(" Liberando memoria de df_bronze...")
df_bronze.unpersist()
print(" Memoria liberada\n")

print("="*80)
print(f" Dataset preparado: {len(df_cleaned.columns)} columnas")
print("="*80 + "\n")


PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS

 Eliminando 6 columnas redundantes...
Columnas restantes: 15

 Preparando campo fecha_firma...
   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)
   Convirtiendo a: date (2024-01-04)
 Fecha convertida correctamente

 Liberando memoria de df_bronze...
 Memoria liberada

 Dataset preparado: 15 columnas



In [15]:
# ============================================================================
# CELDA 1: PREPARACI√ìN Y CONTEO INICIAL
# ============================================================================
print("="*80)
print("PASO 3: LIMPIEZA - PREPARACI√ìN")
print("="*80 + "\n")

print("Cacheando datos para an√°lisis...")
df_cleaned = df_cleaned.cache()
total_cleaned = df_cleaned.count()

print(f" Registros totales: {total_cleaned:,}\n")
print(f" Columnas: {len(df_cleaned.columns)}")
print(f" Datos cacheados¬†en¬†memoria\n")

PASO 3: LIMPIEZA - PREPARACI√ìN

Cacheando datos para an√°lisis...


[Stage 56:>                                                         (0 + 1) / 1]

 Registros totales: 100,698

 Columnas: 15
 Datos cacheados¬†en¬†memoria



                                                                                

In [16]:
# ============================================================================
# CELDA 2: AN√ÅLISIS DE NULOS (OPTIMIZADO)
# ============================================================================
print("="*80)
print("AN√ÅLISIS DE CALIDAD DE DATOS")
print("="*80 + "\n")

print("Analizando valores nulos en columnas cr√≠ticas...")

# Solo analizar columnas cr√≠ticas para ahorrar memoria
critical_columns = [
    "id_contrato",
    "objeto_contrato", 
    "valor_contrato",
    "fecha_firma",
    "entidad",
    "departamento",
    "duracion_dias"
]

# An√°lisis optimizado solo de columnas cr√≠ticas
null_analysis = df_cleaned.select([
    count(when(col(c).isNull(), c)).alias(c) 
    for c in critical_columns if c in df_cleaned.columns
]).collect()[0].asDict()

print("üìä Valores nulos en columnas cr√≠ticas:\n")
has_nulls = False
for col_name in critical_columns:
    if col_name in null_analysis:
        null_count = null_analysis[col_name]
        if null_count > 0:
            has_nulls = True
            pct = (null_count / total_cleaned) * 100
            print(f"   ‚ö†  {col_name}: {null_count:,} ({pct:.1f}%)")

if not has_nulls:
    print("   ‚úÖ No hay valores nulos en columnas cr√≠ticas")

print()


AN√ÅLISIS DE CALIDAD DE DATOS

Analizando valores nulos en columnas cr√≠ticas...
üìä Valores nulos en columnas cr√≠ticas:

   ‚ö†  fecha_firma: 695 (0.7%)
   ‚ö†  duracion_dias: 50,350 (50.0%)



In [17]:
# ============================================================================
# CELDA 3: APLICAR FILTROS DE LIMPIEZA
# ============================================================================
print("="*80)
print("APLICANDO FILTROS DE CALIDAD")
print("="*80 + "\n")

print("Aplicando reglas de limpieza:")
print("  ‚úì id_contrato no nulo")
print("  ‚úì objeto_contrato no nulo")
print("  ‚úì valor_contrato no nulo y > 0")
print("  ‚úì fecha_firma no nula")
print()

# Aplicar filtros paso a paso
# NOTA: fecha_firma ya fue convertida a date en el Paso 2
df_silver = df_cleaned \
    .filter(col("id_contrato").isNotNull()) \
    .filter(col("objeto_contrato").isNotNull()) \
    .filter(col("valor_contrato").isNotNull()) \
    .filter(col("valor_contrato") > 0) \
    .filter(col("fecha_firma").isNotNull())

print("‚úÖ Filtros aplicados correctamente\n")


APLICANDO FILTROS DE CALIDAD

Aplicando reglas de limpieza:
  ‚úì id_contrato no nulo
  ‚úì objeto_contrato no nulo
  ‚úì valor_contrato no nulo y > 0
  ‚úì fecha_firma no nula

‚úÖ Filtros aplicados correctamente



In [18]:
print(f"Spark version: {spark.version}")

Spark version: 3.5.1


In [19]:
# ============================================================================
# CELDA 4: CACHEAR RESULTADOS Y GENERAR REPORTE
# ============================================================================
print("="*80)
print("FINALIZANDO LIMPIEZA")
print("="*80 + "\n")

print("Cacheando datos limpios...")
df_silver = df_silver.cache()
total_silver = df_silver.count()

# Calcular estad√≠sticas
registros_descartados = total_cleaned - total_silver
pct_retenido = (total_silver / total_cleaned) * 100 if total_cleaned > 0 else 0
pct_descartado = (registros_descartados / total_cleaned) * 100 if total_cleaned > 0 else 0

print("\n" + "="*80)
print("üìä RESUMEN DE LIMPIEZA")
print("="*80)
print(f"  Registros iniciales:    {total_cleaned:,}")
print(f"  Registros finales:      {total_silver:,} ({pct_retenido:.1f}%)")
print(f"  Registros descartados:  {registros_descartados:,} ({pct_descartado:.1f}%)")
print("="*80 + "\n")

# Liberar memoria del DataFrame anterior
print("Liberando memoria del cache anterior...")
df_cleaned.unpersist()
print("‚úÖ Limpieza¬†completada\n")


FINALIZANDO LIMPIEZA

Cacheando datos limpios...

üìä RESUMEN DE LIMPIEZA
  Registros iniciales:    100,698
  Registros finales:      99,458 (98.8%)
  Registros descartados:  1,240 (1.2%)

Liberando memoria del cache anterior...
‚úÖ Limpieza¬†completada



In [20]:
# 1. Entidades
print("Top 5 entidades:")
df_silver.groupBy("entidad").count().orderBy(desc("count")).show(5, truncate=False)

# 2. Departamentos
print("\nTop 5 departamentos:")
df_silver.groupBy("departamento").count().orderBy(desc("count")).show(5, truncate=False)

# 3. Regi√≥n
print("\nDistribuci√≥n por regi√≥n:")
df_silver.groupBy("region").count().orderBy(desc("count")).show(truncate=False)

# 4. C√≥digo UNSPSC
print("\nTop 10 c√≥digos UNSPSC:")
df_silver.groupBy("codigo_unspsc").count().orderBy(desc("count")).show(10, truncate=False)

# 5. Categor√≠a UNSPSC
print("\nTop 10 categor√≠as UNSPSC:")
df_silver.groupBy("descripcion_categoria").count().orderBy(desc("count")).show(10, truncate=False)

# 6. Tipo de contrato
print("\nDistribuci√≥n por tipo de contrato:")
df_silver.groupBy("tipo_contrato").count().orderBy(desc("count")).show(truncate=False)

# 7. Estado del contrato
print("\nDistribuci√≥n del estado del contrato:")
df_silver.groupBy("estado_contrato").count().orderBy(desc("count")).show(truncate=False)

# 8. Modalidad de contrataci√≥n
print("\nTop 10 modalidades de contrataci√≥n:")
df_silver.groupBy("modalidad").count().orderBy(desc("count")).show(10, truncate=False)


Top 5 entidades:
+-------------------------------------------------+-----+
|entidad                                          |count|
+-------------------------------------------------+-----+
|MUNICIPIO DE SOACHA.                             |6356 |
|ALCALD√çA MUNICIPAL COTA                          |3988 |
|ESE MUNICIPAL DE SOACHA JULIO CESAR PE√ëALOZA*    |3822 |
|CUNDINAMARCA-ALCALDIA MUNICIPIO MOSQUERA         |3759 |
|empresa social del estado regi√≥n de salud soacha.|3152 |
+-------------------------------------------------+-----+
only showing top 5 rows


Top 5 departamentos:
+------------+-----+
|departamento|count|
+------------+-----+
|Cundinamarca|99458|
+------------+-----+


Distribuci√≥n por regi√≥n:
+--------------+-----+
|region        |count|
+--------------+-----+
|Centro-Oriente|99458|
+--------------+-----+


Top 10 c√≥digos UNSPSC:
+-------------+-----+
|codigo_unspsc|count|
+-------------+-----+
|             |50058|
|V1.80111600  |11391|
|V1.80111701  |4329 |
|V1.

In [22]:
from pyspark.sql.functions import min, max, avg, stddev, expr

# 10. Valor del contrato
print("\nEstad√≠sticas de valor_contrato:")
df_silver.select(
    min("valor_contrato").alias("min"),
    max("valor_contrato").alias("max"),
    avg("valor_contrato").alias("mean"),
    stddev("valor_contrato").alias("std")
).show()

# Percentiles
print("\nPercentiles de valor_contrato:")
df_silver.approxQuantile("valor_contrato", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)

# 11. Duraci√≥n en d√≠as
print("\nEstad√≠sticas de duracion_dias:")
df_silver.select(
    min("duracion_dias").alias("min"),
    max("duracion_dias").alias("max"),
    avg("duracion_dias").alias("mean"),
    stddev("duracion_dias").alias("std")
).show()

print("\nPercentiles de duracion_dias:")
df_silver.approxQuantile("duracion_dias", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)



Estad√≠sticas de valor_contrato:
+---+----------------+-------------------+--------------------+
|min|             max|               mean|                 std|
+---+----------------+-------------------+--------------------+
|1.0|1.50838540149E11|9.941466321590018E7|1.1521186504414532E9|
+---+----------------+-------------------+--------------------+


Percentiles de valor_contrato:

Estad√≠sticas de duracion_dias:
+---+----+-----------------+------------------+
|min| max|             mean|               std|
+---+----+-----------------+------------------+
|  0|4297|82.47422012591348|101.20091534465666|
+---+----+-----------------+------------------+


Percentiles de duracion_dias:


[0.0, 6.0, 40.0, 125.0, 4297.0]

In [24]:
print("\nTop a√±os:")
df_silver.groupBy("anno").count().orderBy(desc("anno")).show(10, truncate=False)

print("\nContratos por a√±o:")
df_silver.groupBy("anno").count().orderBy(desc("count")).show(10, truncate=False)

print("\nTop fechas de firma:")
df_silver.groupBy("fecha_firma").count().orderBy(desc("count")).show(10, truncate=False)



Top a√±os:
+----+-----+
|anno|count|
+----+-----+
|2025|68   |
|2024|95797|
|2023|3029 |
|2022|564  |
+----+-----+


Contratos por a√±o:
+----+-----+
|anno|count|
+----+-----+
|2024|95797|
|2023|3029 |
|2022|564  |
|2025|68   |
+----+-----+


Top fechas de firma:
+-----------+-----+
|fecha_firma|count|
+-----------+-----+
|2024-02-01 |1230 |
|2024-03-01 |1119 |
|2024-02-02 |860  |
|2024-02-05 |815  |
|2024-03-22 |794  |
|2024-02-09 |789  |
|2024-02-16 |744  |
|2024-02-06 |723  |
|2024-02-12 |715  |
|2024-09-02 |686  |
+-----------+-----+
only showing top 10 rows



In [25]:
# ============================================================================
# 5. GUARDAR EN DELTA LAKE
# ============================================================================

print("="*80)
print("PASO 5: GUARDAR EN DELTA LAKE")
print("="*80 + "\n")

DELTA_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"üíæ Guardando en: {DELTA_PATH}")

df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(DELTA_PATH)

print("‚úÖ Guardado exitosamente\n")

# ‚ö†Ô∏è LIBERAR todo
df_silver.unpersist()
spark.catalog.clearCache()
 

PASO 5: GUARDAR EN DELTA LAKE

üíæ Guardando en: /app/notebooks/delta_lake/silver_contracts


                                                                                

‚úÖ Guardado exitosamente



## Fase 3

In [9]:
print("\n" + "="*80)
print("PASO 1: CARGAR DATOS DESDE SILVER")
print("="*80 + "\n")

SILVER_PATH = "/app/notebooks/delta_lake/silver_contracts"
print(f"üìä Cargando: {SILVER_PATH}")

df_silver = spark.read.format("delta").load(SILVER_PATH)
df_silver = df_silver.cache()
total_records = df_silver.count()

print(f"‚úì Registros: {total_records:,}\n")


PASO 1: CARGAR DATOS DESDE SILVER

üìä Cargando: /app/notebooks/delta_lake/silver_contracts
‚úì Registros: 99,706



In [5]:
# ============================================================================
# PASO 2: LIMPIEZA DE TEXTO
# ============================================================================

print("="*80)
print("PASO 2: LIMPIEZA DE TEXTO")
print("="*80 + "\n")

src_chars = "√°√©√≠√≥√∫√º√±"
dst_chars = "aeiouun"

df_prepared = df_silver.withColumn(
    "objeto_limpio",
    trim(
        regexp_replace(
            regexp_replace(
                translate(
                    lower(col("objeto_contrato")),
                    src_chars,
                    dst_chars
                ),
                "[^a-z0-9\\s]", " "
            ),
            "\\s+", " "
        )
    )
).filter(length(col("objeto_limpio")) >= 10)

print(f"‚úì Registros despu√©s de limpieza: {df_prepared.count():,}\n")


PASO 2: LIMPIEZA DE TEXTO





‚úì Registros despu√©s de limpieza: 99,706



                                                                                

In [6]:
# ============================================================================
# PASO 3: TOKENIZACI√ìN Y STOPWORDS
# ============================================================================

print("="*80)
print("PASO 3: TOKENIZACI√ìN")
print("="*80 + "\n")

stopwords_es = [
    "el", "la", "de", "que", "y", "a", "en", "un", "ser", "se", "no",
    "por", "con", "su", "para", "como", "estar", "tener", "le", "lo",
    "pero", "hacer", "o", "este", "otro", "ese", "si", "ya", "ver",
    "dar", "muy", "sin", "sobre", "tambi√©n", "hasta", "a√±o", "entre",
    "del", "al", "los", "las", "uno", "una", "unos", "unas",
    "contrato", "contratos", "objeto", "prestacion", "prestaci√≥n",
    "servicio", "servicios", "suministro", "ejecucion", "ejecuci√≥n"
]

tokenizer = Tokenizer(inputCol="objeto_limpio", outputCol="palabras")
df_tokenized = tokenizer.transform(df_prepared)

remover = StopWordsRemover(
    inputCol="palabras",
    outputCol="palabras_sin_stopwords",
    stopWords=stopwords_es
)
df_filtered_words = remover.transform(df_tokenized)

# Filtrar palabras cortas
def clean_words(words):
    if not words:
        return []
    return [w for w in words if len(w) >= 3]

clean_udf = udf(clean_words, ArrayType(StringType()))

df_filtered = df_filtered_words.withColumn(
    "palabras_filtradas",
    clean_udf(col("palabras_sin_stopwords"))
).filter(size(col("palabras_filtradas")) > 0)

print(f"‚úì Registros despu√©s de filtrado: {df_filtered.count():,}\n")


PASO 3: TOKENIZACI√ìN





‚úì Registros despu√©s de filtrado: 99,706



                                                                                

In [7]:
# ============================================================================
# PASO 4: WORD2VEC
# ============================================================================

print("="*80)
print("PASO 4: WORD2VEC")
print("="*80 + "\n")

word2vec = Word2Vec(
    vectorSize=100,
    minCount=2,
    maxIter=10,
    seed=42,
    inputCol="palabras_filtradas",
    outputCol="embedding_raw"
)

print("‚è≥ Entrenando Word2Vec...")
word2vec_model = word2vec.fit(df_filtered)
df_embeddings = word2vec_model.transform(df_filtered)

vocab_size = len(word2vec_model.getVectors().collect())
print(f"‚úì Vocabulario: {vocab_size:,} palabras")
print(f"‚úì Embeddings¬†generados\n")


PASO 4: WORD2VEC

‚è≥ Entrenando Word2Vec...


                                                                                

‚úì Vocabulario: 14,468 palabras
‚úì Embeddings¬†generados



In [10]:
# ----------------------------------------------------------------
# 3. Transformaciones INDEPENDIENTES del target
# ----------------------------------------------------------------
print("\n2. Transformaciones categ√≥ricas (sin target)...")

from pyspark.ml.feature import StringIndexer, OneHotEncoder
import pyspark.sql.functions as F

# A. OneHot para variables de baja cardinalidad
low_card_cols = ["tipo_contrato", "estado_contrato", "modalidad"]

for col_name in low_card_cols:
    print(f"   OneHot para {col_name}...")
    
    indexer = StringIndexer(
        inputCol=col_name,
        outputCol=f"{col_name}_idx",
        handleInvalid="keep"
    )
    df_embeddings = indexer.fit(df_embeddings).transform(df_embeddings)
    
    encoder = OneHotEncoder(
        inputCol=f"{col_name}_idx",
        outputCol=f"{col_name}_ohe",
        dropLast=True
    )
    df_embeddings = encoder.fit(df_embeddings).transform(df_embeddings)

# B. Frequency Encoding para 'entidad'
print("\n   Frequency Encoding para 'entidad'...")
entidad_freq = df_embeddings.groupBy("entidad").count()
total_count = df_embeddings.count()
entidad_freq = entidad_freq.withColumn(
    "entidad_freq",
    col("count") / total_count
).select("entidad", "entidad_freq")

df_embeddings = df_embeddings.join(entidad_freq, "entidad", "left")

# C. Eliminar variables sin varianza
df_embeddings = df_embeddings.drop("departamento", "region")



2. Transformaciones categ√≥ricas (sin target)...
   OneHot para tipo_contrato...


                                                                                

   OneHot para estado_contrato...


                                                                                

   OneHot para modalidad...


                                                                                


   Frequency Encoding para 'entidad'...


                                                                                

In [11]:
# ----------------------------------------------------------------
# 4. DIVISI√ìN TEMPORAL (80/20)
# ----------------------------------------------------------------
print("\n3. Divisi√≥n temporal train/test...")

from datetime import datetime

# Convertir fecha para calcular percentil
df_temp = df_embeddings.withColumn(
    "fecha_num",
    col("fecha_firma").cast("timestamp").cast("long")
)

# Calcular percentil 80
q = df_temp.approxQuantile("fecha_num", [0.8], 0.01)
split_ts = q[0]
split_date = datetime.utcfromtimestamp(split_ts)

print(f"   Fecha de corte: {split_date}")

# Crear datasets de train y test
df_train_raw = df_embeddings.filter(col("fecha_firma") <= split_date)
df_test_raw = df_embeddings.filter(col("fecha_firma") > split_date)

print(f"   Train: {df_train_raw.count():,} registros")
print(f"   Test:  {df_test_raw.count():,} registros")



3. Divisi√≥n temporal train/test...


                                                                                

   Fecha de corte: 2024-09-30 00:00:00


                                                                                

   Train: 79,800 registros




   Test:  19,906 registros


                                                                                

In [12]:
# ----------------------------------------------------------------
# 5. Guardar datasets en Delta Lake
# ----------------------------------------------------------------
print("\n4. Guardando datasets en Delta Lake...")

# Ruta para los datasets preprocesados
TRAIN_RAW_PATH = "/app/notebooks/delta_lake/train_raw_v3"
TEST_RAW_PATH = "/app/notebooks/delta_lake/test_raw_v3"

# Guardar train
df_train_raw.write.format("delta").mode("overwrite").save(TRAIN_RAW_PATH)

# Guardar test
df_test_raw.write.format("delta").mode("overwrite").save(TEST_RAW_PATH)

print(f"   ‚úì Train guardado en: {TRAIN_RAW_PATH}")
print(f"   ‚úì Test guardado en:  {TEST_RAW_PATH}")

# ----------------------------------------------------------------
# 6. Guardar tambi√©n los modelos de transformaci√≥n
# ----------------------------------------------------------------
print("\n5. Guardando modelos de transformaci√≥n...")

MODELS_PATH = "/app/notebooks/models_v3"

# Guardar Word2Vec model (si lo tienes)
if 'word2vec_model' in locals():
    word2vec_model.save(f"{MODELS_PATH}/word2vec_model")

# Guardar StringIndexer models para referencia
for col_name in low_card_cols:
    indexer_model_path = f"{MODELS_PATH}/indexer_{col_name}"
    # Necesitar√≠as extraer el modelo del pipeline o guardar los mapeos

print("Modelos guardados")



4. Guardando datasets en Delta Lake...


                                                                                

   ‚úì Train guardado en: /app/notebooks/delta_lake/train_raw_v3
   ‚úì Test guardado en:  /app/notebooks/delta_lake/test_raw_v3

5. Guardando modelos de transformaci√≥n...


                                                                                

Modelos guardados


In [None]:
# ----------------------------------------------------------------
# 7. Informe final
# ----------------------------------------------------------------
print("\n" + "="*80)
print("RESUMEN FASE 3")
print("="*80)

print(f"‚úÖ Preprocesamiento completado")
print(f"üìä Train: {df_train_raw.count():,} registros")
print(f"üìä Test:  {df_test_raw.count():,} registros")
print()
print("üéØ Variables disponibles:")
print(f"  - embedding_raw: Word2Vec embeddings")
for col in low_card_cols:
    print(f"  - {col}_ohe: OneHot encoded")
print(f"  - entidad_freq: Frequency encoding")
print(f"  - valor_contrato: Target variable")
print(f"  - duracion_dias: Variable num√©rica")
print()
print("üìà Listo para Fase 4: Target Encoding y Modelado")

## Fase 4

In [2]:
import builtins  # <-- IMPORTANTE: Importar builtins
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# ================================================================
# FASE 4: MODELADO Y REGISTRO EN MLFLOW (CON LOG TRANSFORM)
# ================================================================

print("="*80)
print("FASE 4: MODELADO CON TRANSFORMACI√ìN LOGAR√çTMICA")
print("="*80)

# ----------------------------------------------------------------
# 1. Cargar datasets preprocesados
# ----------------------------------------------------------------
print("1. Cargando datasets preprocesados...")

TRAIN_RAW_PATH = "/app/notebooks/delta_lake/train_raw_v3"
TEST_RAW_PATH = "/app/notebooks/delta_lake/test_raw_v3"

train_raw = spark.read.format("delta").load(TRAIN_RAW_PATH).cache()
test_raw = spark.read.format("delta").load(TEST_RAW_PATH).cache()

print(f"   ‚úì Train: {train_raw.count():,} registros")
print(f"   ‚úì Test:  {test_raw.count():,} registros")


FASE 4: MODELADO CON TRANSFORMACI√ìN LOGAR√çTMICA
1. Cargando datasets preprocesados...


                                                                                

   ‚úì Train: 79,800 registros




   ‚úì Test:  19,906 registros


                                                                                

In [4]:
# ----------------------------------------------------------------
# 2. TRANSFORMACI√ìN LOGAR√çTMICA DEL TARGET
# ----------------------------------------------------------------
import pyspark.sql.functions as F
print("\n2. Aplicando transformaci√≥n logar√≠tmica al target...")

# Usamos log1p = log(1 + x) para evitar problemas con valores peque√±os
train_data = train_raw.withColumn("log_valor_contrato", F.log1p(col("valor_contrato")))
test_data = test_raw.withColumn("log_valor_contrato", F.log1p(col("valor_contrato")))

# Verificar estad√≠sticas antes/despu√©s
print("   Estad√≠sticas del target original vs logar√≠tmico:")
train_stats = train_data.select(
    F.mean("valor_contrato").alias("mean_original"),
    F.stddev("valor_contrato").alias("std_original"),
    F.mean("log_valor_contrato").alias("mean_log"),
    F.stddev("log_valor_contrato").alias("std_log")
).first()

print(f"   Original: mean=${train_stats['mean_original']:,.2f}, std=${train_stats['std_original']:,.2f}")
print(f"   Log: mean={train_stats['mean_log']:.2f}, std={train_stats['std_log']:.2f}")


2. Aplicando transformaci√≥n logar√≠tmica al target...
   Estad√≠sticas del target original vs logar√≠tmico:
   Original: mean=$81,720,270.17, std=$983,090,380.87
   Log: mean=16.73, std=1.12


In [5]:
# ----------------------------------------------------------------
# 3. Target Encoding usando el TARGET LOGAR√çTMICO
# ----------------------------------------------------------------
print("\n3. Target Encoding para 'codigo_unspsc' (usando target log)...")

def safe_target_encoding_log(train_df, test_df, cat_col, target_log_col="log_valor_contrato", m=50):
    """
    Target encoding usando el target en escala logar√≠tmica
    """
    # Calcular media global del LOG en train
    global_mean_log = train_df.agg(F.mean(target_log_col)).first()[0]
    
    # Calcular estad√≠sticas por categor√≠a en train (usando LOG)
    stats = train_df.groupBy(cat_col).agg(
        F.mean(target_log_col).alias("cat_mean_log"),
        F.count(target_log_col).alias("cat_count")
    )
    
    # Aplicar smoothing en escala logar√≠tmica
    stats = stats.withColumn(
        f"{cat_col}_te_log",
        (F.col("cat_count") * F.col("cat_mean_log") + m * global_mean_log) / 
        (F.col("cat_count") + m)
    ).select(cat_col, f"{cat_col}_te_log")
    
    # Aplicar a train
    train_encoded = train_df.join(stats, cat_col, "left")
    
    # Aplicar a test
    test_encoded = test_df.join(stats, cat_col, "left")
    
    # Para categor√≠as no vistas en train, usar global mean log
    test_encoded = test_encoded.fillna({f"{cat_col}_te_log": global_mean_log})
    
    return train_encoded, test_encoded

# Aplicar target encoding con target logar√≠tmico
train_data, test_data = safe_target_encoding_log(
    train_data, test_data,
    cat_col="codigo_unspsc",
    target_log_col="log_valor_contrato",
    m=50
)

print("   ‚úì codigo_unspsc_te_log creado (en escala log)")


3. Target Encoding para 'codigo_unspsc' (usando target log)...
   ‚úì codigo_unspsc_te_log creado (en escala log)


In [6]:
# ----------------------------------------------------------------
# 4. Ensamblar features (incluyendo el target encoding logar√≠tmico)
# ----------------------------------------------------------------
print("\n4. Ensamblando features...")

feature_cols = [
    "embedding_raw",
    "tipo_contrato_ohe",
    "estado_contrato_ohe", 
    "modalidad_ohe",
    "entidad_freq",
    "codigo_unspsc_te_log",  # ¬°Usamos la versi√≥n logar√≠tmica!
]

if "duracion_dias" in train_data.columns:
    train_data = train_data.fillna({"duracion_dias": 0})
    test_data = test_data.fillna({"duracion_dias": 0})
    feature_cols.append("duracion_dias")

print(f"   Features: {len(feature_cols)} dimensiones")
for feat in feature_cols:
    print(f"   - {feat}")

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_raw",
    handleInvalid="skip"
)

train_features = assembler.transform(train_data)
test_features = assembler.transform(test_data)



4. Ensamblando features...
   Features: 7 dimensiones
   - embedding_raw
   - tipo_contrato_ohe
   - estado_contrato_ohe
   - modalidad_ohe
   - entidad_freq
   - codigo_unspsc_te_log
   - duracion_dias


In [7]:
# ----------------------------------------------------------------
# 5. Normalizaci√≥n (CON VALIDACI√ìN)
# ----------------------------------------------------------------
print("\n5. Normalizando y validando...")

scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features_scaled",
    withStd=True,
    withMean=True
)

scaler_model = scaler.fit(train_features)
train_scaled = scaler_model.transform(train_features)
test_scaled = scaler_model.transform(test_features)


5. Normalizando y validando...


                                                                                

In [12]:
# ----------------------------------------------------------------
# 6. PCA (CORREGIDO)
# ----------------------------------------------------------------
import builtins
print("\n6. Aplicando PCA...")

initial_dims = len(train_scaled.select("features_scaled").first()[0])

# CORRECCI√ìN: Usar builtins.min en lugar de min
pca = PCA(
    k=builtins.min(50, initial_dims),  # <-- ¬°CORRECCI√ìN AQU√ç!
    inputCol="features_scaled", 
    outputCol="features"
)

pca_model = pca.fit(train_scaled)
train_pca = pca_model.transform(train_scaled)
test_pca = pca_model.transform(test_scaled)

# Analizar varianza
explained_variance = pca_model.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)
n_95 = np.argmax(cumulative_variance >= 0.95) + 1

print(f"   Componentes para 95% varianza: {n_95}")
print(f"   Reducci√≥n: {initial_dims} ‚Üí {n_95} dimensiones")


6. Aplicando PCA...


                                                                                

   Componentes para 95% varianza: 1
   Reducci√≥n: 141 ‚Üí 1 dimensiones


In [22]:
# ================================================================
# DIAGN√ìSTICO CORREGIDO - VERSI√ìN SIN CONFLICTOS
# ================================================================

import builtins  # Para usar min() y abs() de Python sin conflictos
import numpy as np

print("\nüîç DIAGN√ìSTICO DEL PCA Y FEATURES (VERSI√ìN CORREGIDA):")
print("="*80)

# ----------------------------------------------------------------
# 1. Verificar el tipo de dato de features_scaled
# ----------------------------------------------------------------
print("1. Verificando tipo de datos de features_scaled...")

# Obtener schema
feature_schema = train_scaled.schema["features_scaled"].dataType
print(f"   Tipo: {feature_schema}")
print(f"   Es vector: {feature_schema.typeName() == 'vector'}")

# ----------------------------------------------------------------
# 2. Verificar dimensiones correctamente
# ----------------------------------------------------------------
print("\n2. Verificando dimensiones...")
initial_dims = len(train_scaled.select("features_scaled").first()[0])
print(f"   Dimensiones iniciales: {initial_dims}")

# ----------------------------------------------------------------
# 3. Calcular estad√≠sticas MANUALMENTE (sin Summarizer problem√°tico)
# ----------------------------------------------------------------
print("\n3. Calculando estad√≠sticas b√°sicas...")

# Tomar una muestra para an√°lisis - USANDO builtins.min()
total_count = train_scaled.count()
sample_size = builtins.min(1000, total_count)  # Usar builtins.min() expl√≠citamente
print(f"   Total registros: {total_count}")
print(f"   Tomando muestra de: {sample_size} registros")

sample_df = train_scaled.select("features_scaled").limit(sample_size).collect()

# Convertir a numpy array
sample_vectors = np.array([row["features_scaled"].toArray() for row in sample_df])

# Calcular estad√≠sticas en la muestra
mean_per_feature = np.mean(sample_vectors, axis=0)
variance_per_feature = np.var(sample_vectors, axis=0)

print(f"   Muestra analizada: {len(sample_vectors)} registros")
print(f"   Media (primeras 5 features): {mean_per_feature[:5]}")
print(f"   Varianza (primeras 5 features): {variance_per_feature[:5]}")

# Contar features con varianza cercana a cero
low_variance_count = np.sum(variance_per_feature < 1e-6)
print(f"   Features con varianza < 1e-6: {low_variance_count}/{initial_dims}")

# ----------------------------------------------------------------
# 4. Verificar si hay valores NaN o Inf
# ----------------------------------------------------------------
print("\n4. Verificando valores problem√°ticos...")

nan_count = np.sum(np.isnan(sample_vectors))
inf_count = np.sum(np.isinf(sample_vectors))

print(f"   Valores NaN en muestra: {nan_count}")
print(f"   Valores Inf en muestra: {inf_count}")

# ----------------------------------------------------------------
# 5. Verificar escalado - deber√≠a tener media ~0 y varianza ~1
# ----------------------------------------------------------------
print("\n5. Verificando calidad del escalado...")

mean_abs = np.abs(mean_per_feature).mean()
variance_mean = variance_per_feature.mean()

print(f"   Media absoluta de medias: {mean_abs:.6f} (deber√≠a ser ~0)")
print(f"   Promedio de varianzas: {variance_mean:.6f} (deber√≠a ser ~1)")

if mean_abs > 0.1:
    print("   ‚ö†Ô∏è  El escalado podr√≠a no estar centrando bien (media lejos de 0)")
if builtins.abs(variance_mean - 1.0) > 0.1:  # Usar builtins.abs()
    print(f"   ‚ö†Ô∏è  El escalado podr√≠a no estar normalizando bien (varianza promedio={variance_mean:.2f})")

# ----------------------------------------------------------------
# 6. DIAGN√ìSTICO DEL PCA - LA PARTE IMPORTANTE
# ----------------------------------------------------------------
print("\n6. Diagn√≥stico del PCA (CORRECTO)...")

# Ejecutar PCA con todos los componentes primero para ver varianza explicada
print("   Ejecutando PCA con todos los componentes para an√°lisis...")

pca_full = PCA(k=initial_dims, inputCol="features_scaled", outputCol="features_pca_full")
pca_model_full = pca_full.fit(train_scaled)
explained_variance = pca_model_full.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)

print(f"\n   Varianza explicada por componentes principales:")
for i in range(builtins.min(10, len(explained_variance))):  # Usar builtins.min()
    print(f"   PC{i+1}: {explained_variance[i]:.4f} ({explained_variance[i]:.2%})")

# Encontrar n√∫mero de componentes para diferentes umbrales
thresholds = [0.50, 0.80, 0.90, 0.95, 0.99]
print(f"\n   Componentes necesarios para diferentes umbrales:")
for threshold in thresholds:
    n_components = np.argmax(cumulative_variance >= threshold) + 1
    print(f"   {threshold:.0%} varianza: {n_components} componentes")

n_95 = np.argmax(cumulative_variance >= 0.95) + 1
print(f"\n   Componentes para 95% varianza: {n_95}")

# ----------------------------------------------------------------
# 7. AN√ÅLISIS DE POR QU√â PCA DA SOLO 1 COMPONENTE
# ----------------------------------------------------------------
print("\n7. An√°lisis de causa ra√≠z...")

if n_95 == 1:
    print("   üîç PROBLEMA DETECTADO: Solo 1 componente para 95% varianza")
    print("\n   Posibles causas:")
    
    # Causa 1: Primera componente domina
    first_pc_variance = explained_variance[0] if len(explained_variance) > 0 else 0
    print(f"   1. Primera componente muy dominante: {first_pc_variance:.2%}")
    
    # Causa 2: Features con varianza muy desigual
    variance_max = variance_per_feature.max()
    variance_min = variance_per_feature.min()
    variance_ratio = variance_max / variance_min if variance_min > 0 else float('inf')
    print(f"   2. Ratio m√°ximo/min de varianza: {variance_ratio:.2f}")
    
    # Causa 3: Alta correlaci√≥n entre features
    # Calcular correlaciones en muestra
    if sample_vectors.shape[1] > 1:
        corr_matrix = np.corrcoef(sample_vectors, rowvar=False)
        np.fill_diagonal(corr_matrix, 0)  # Quitar diagonal
        max_corr = np.max(np.abs(corr_matrix))
        high_corr_count = np.sum(np.abs(corr_matrix) > 0.9)
        total_corr_pairs = (corr_matrix.shape[0] * (corr_matrix.shape[0] - 1)) / 2
        print(f"   3. Correlaci√≥n m√°xima entre features: {max_corr:.3f}")
        print(f"      Pares con |corr| > 0.9: {high_corr_count}/{int(total_corr_pairs)}")

# ----------------------------------------------------------------
# 8. SOLUCI√ìN RECOMENDADA
# ----------------------------------------------------------------
print("\n" + "="*80)
print("SOLUCI√ìN RECOMENDADA:")
print("="*80)

if n_95 == 1:
    print("""
‚ö†Ô∏è  PROBLEMA CR√çTICO DETECTADO:
   
   PCA est√° capturando el 95% de la varianza con solo 1 componente.
   Esto significa que:
   1. Hay una feature o combinaci√≥n que domina completamente
   2. Las dem√°s features aportan muy poca informaci√≥n
   3. Posible problema con el escalado o preparaci√≥n de features

üîß ACCIONES RECOMENDADAS:

   1. VERIFICAR EL TARGET ENCODING:
      - codigo_unspsc_te_log podr√≠a estar dominando
      - Verificar su varianza relativa a otras features
      
   2. REVISAR WORD2VEC EMBEDDINGS:
      - embedding_raw (100 dimensiones) podr√≠a tener varianza muy baja
      - Word2Vec puede generar embeddings con poca variabilidad
      
   3. PROBAR SIN PCA INICIALMENTE:
      - Usar todas las features escaladas directamente
      - Los √°rboles (RandomForest, GBT) manejan bien multicolinealidad
      
   4. VERIFICAR ONE-HOT ENCODINGS:
      - tipo_contrato_ohe, estado_contrato_ohe, modalidad_ohe
      - Podr√≠an tener muchas dimensiones con poca variabilidad
""")
else:
    print(f"""
‚úÖ PCA PARECE RAZONABLE:
   
   Se necesitan {n_95} componentes para 95% varianza.
   Esto sugiere una distribuci√≥n saludable de informaci√≥n.
   
üîß ACCI√ìN RECOMENDADA:
   
   Probar con {n_95} componentes PCA y tambi√©n sin PCA para comparar.
""")


üîç DIAGN√ìSTICO DEL PCA Y FEATURES (VERSI√ìN CORREGIDA):
1. Verificando tipo de datos de features_scaled...
   Tipo: VectorUDT()
   Es vector: False

2. Verificando dimensiones...


                                                                                

   Dimensiones iniciales: 141

3. Calculando estad√≠sticas b√°sicas...


                                                                                

   Total registros: 79800
   Tomando muestra de: 1000 registros
   Muestra analizada: 1000 registros
   Media (primeras 5 features): [ 0.26052261 -0.16667773 -0.12854223 -0.01215937 -0.11279548]
   Varianza (primeras 5 features): [0.84983428 1.04320726 1.40261202 1.04923529 0.85028533]
   Features con varianza < 1e-6: 10/141

4. Verificando valores problem√°ticos...
   Valores NaN en muestra: 0
   Valores Inf en muestra: 0

5. Verificando calidad del escalado...
   Media absoluta de medias: 0.129635 (deber√≠a ser ~0)
   Promedio de varianzas: 1.720219 (deber√≠a ser ~1)
   ‚ö†Ô∏è  El escalado podr√≠a no estar centrando bien (media lejos de 0)
   ‚ö†Ô∏è  El escalado podr√≠a no estar normalizando bien (varianza promedio=1.72)

6. Diagn√≥stico del PCA (CORRECTO)...
   Ejecutando PCA con todos los componentes para an√°lisis...


                                                                                


   Varianza explicada por componentes principales:
   PC1: 0.0676 (6.76%)
   PC2: 0.0486 (4.86%)
   PC3: 0.0405 (4.05%)
   PC4: 0.0383 (3.83%)
   PC5: 0.0278 (2.78%)
   PC6: 0.0250 (2.50%)
   PC7: 0.0211 (2.11%)
   PC8: 0.0205 (2.05%)
   PC9: 0.0191 (1.91%)
   PC10: 0.0173 (1.73%)

   Componentes necesarios para diferentes umbrales:
   50% varianza: 24 componentes
   80% varianza: 64 componentes
   90% varianza: 87 componentes
   95% varianza: 104 componentes
   99% varianza: 126 componentes

   Componentes para 95% varianza: 104

7. An√°lisis de causa ra√≠z...

SOLUCI√ìN RECOMENDADA:

‚úÖ PCA PARECE RAZONABLE:

   Se necesitan 104 componentes para 95% varianza.
   Esto sugiere una distribuci√≥n saludable de informaci√≥n.

üîß ACCI√ìN RECOMENDADA:

   Probar con 104 componentes PCA y tambi√©n sin PCA para comparar.



In [8]:
# ----------------------------------------------------------------
# 6. OMITIR PCA (usar features escaladas directamente)
# ----------------------------------------------------------------
print("\n6. Omitiendo PCA - usando features escaladas directamente")

# Usar las features escaladas directamente
train_final = train_scaled.select(
    col("log_valor_contrato").alias("label_log"),
    col("features_scaled").alias("features"),
    col("id_contrato"),
    col("fecha_firma"),
    col("valor_contrato")
).cache()

test_final = test_scaled.select(
    col("log_valor_contrato").alias("label_log"),
    col("features_scaled").alias("features"),
    col("id_contrato"),
    col("fecha_firma"),
    col("valor_contrato")
).cache()

dimensiones = len(train_final.select("features").first()[0])
print(f"   Dimensiones finales: {dimensiones}")
print(f"   Train: {train_final.count():,} registros")
print(f"   Test:  {test_final.count():,} registros")



6. Omitiendo PCA - usando features escaladas directamente


                                                                                

   Dimensiones finales: 141
   Train: 79,800 registros


                                                                                

   Test:  19,906 registros


In [None]:
# ----------------------------------------------------------------
# 7. PROBAR M√öLTIPLES MODELOS (SIN PCA)
# ----------------------------------------------------------------
print("\n7. Probando m√∫ltiples modelos (sin PCA)...")

models_config = [
    {
        "name": "LinearRegression",
        "model": LinearRegression(
            featuresCol="features",
            labelCol="label_log",
            maxIter=100,
            regParam=0.1,
            elasticNetParam=0.8
        )
    },
    {
        "name": "RandomForest",
        "model": RandomForestRegressor(
            featuresCol="features",
            labelCol="label_log",
            numTrees=50,
            maxDepth=10,
            seed=42
        )
    },
    {
        "name": "GBT",
        "model": GBTRegressor(
            featuresCol="features",
            labelCol="label_log",
            maxIter=50,
            maxDepth=5,
            seed=42
        )
    },
    {
        "name": "DecisionTree",
        "model": DecisionTreeRegressor(
            featuresCol="features",
            labelCol="label_log",
            maxDepth=10,
            seed=42
        )
    }
]

results = []

for config in models_config:
    print(f"\n   üîç Entrenando {config['name']}...")
    
    try:
        model = config["model"]
        
        # Entrenar
        trained_model = model.fit(train_final)
        
        # Predecir
        predictions_log = trained_model.transform(test_final)
        
        # Convertir a escala original
        predictions_final = predictions_log.withColumn(
            "prediction_original", 
            F.expm1(col("prediction"))
        ).withColumnRenamed("prediction", "prediction_log")
        
        # Calcular m√©tricas
        evaluator_log = RegressionEvaluator(
            labelCol="label_log",
            predictionCol="prediction_log",
            metricName="r2"
        )
        
        evaluator_original = RegressionEvaluator(
            labelCol="valor_contrato",
            predictionCol="prediction_original",
            metricName="r2"
        )
        
        evaluator_rmse = RegressionEvaluator(
            labelCol="valor_contrato",
            predictionCol="prediction_original",
            metricName="rmse"
        )
        
        evaluator_mae = RegressionEvaluator(
            labelCol="valor_contrato",
            predictionCol="prediction_original",
            metricName="mae"
        )
        
        r2_log = evaluator_log.evaluate(predictions_final)
        r2_original = evaluator_original.evaluate(predictions_final)
        rmse_original = evaluator_rmse.evaluate(predictions_final)
        mae_original = evaluator_mae.evaluate(predictions_final)
        
        # Guardar resultados
        results.append({
            "model": config["name"],
            "r2_log": r2_log,
            "r2_original": r2_original,
            "rmse_original": rmse_original,
            "mae_original": mae_original,
            "trained_model": trained_model,
            "predictions": predictions_final
        })
        
        print(f"      ‚úì R¬≤ (log): {r2_log:.4f}")
        print(f"      ‚úì R¬≤ (original): {r2_original:.4f}")
        print(f"      ‚úì RMSE: ${rmse_original:,.2f}")
        print(f"      ‚úì MAE: ${mae_original:,.2f}")
        
    except Exception as e:
        print(f"      ‚úó Error: {str(e)[:100]}...")
        results.append({
            "model": config["name"],
            "error": str(e)
        })


7. Probando m√∫ltiples modelos (sin PCA)...

   üîç Entrenando LinearRegression...


                                                                                

      ‚úì R¬≤ (log): 0.3787
      ‚úì R¬≤ (original): 0.0181
      ‚úì RMSE: $1,654,832,456.16
      ‚úì MAE: $151,444,702.22

   üîç Entrenando RandomForest...




In [None]:
# ----------------------------------------------------------------
# 8. COMPARAR RESULTADOS
# ----------------------------------------------------------------
print("\n" + "="*80)
print("COMPARACI√ìN DE MODELOS (SIN PCA)")
print("="*80)

# Mostrar resultados en tabla
print(f"\n{'Modelo':<20} {'R¬≤ (log)':<12} {'R¬≤ (original)':<15} {'RMSE':<20} {'MAE':<20}")
print("-" * 90)

for result in results:
    if "error" not in result:
        rmse_str = f"${result['rmse_original']:,.2f}"
        mae_str = f"${result['mae_original']:,.2f}"
        print(f"{result['model']:<20} {result['r2_log']:<12.4f} {result['r2_original']:<15.4f} {rmse_str:<20} {mae_str:<20}")
    else:
        print(f"{result['model']:<20} {'ERROR':<12} {'ERROR':<15} {'ERROR':<20} {'ERROR':<20}")

# Encontrar el mejor modelo por R¬≤ original
valid_results = [r for r in results if "error" not in r]
if valid_results:
    best_model = max(valid_results, key=lambda x: x["r2_original"])
    
    print(f"\nüèÜ MEJOR MODELO: {best_model['model']}")
    print(f"   R¬≤ (original): {best_model['r2_original']:.4f}")
    print(f"   RMSE: ${best_model['rmse_original']:,.2f}")
    print(f"   MAE: ${best_model['mae_original']:,.2f}")
    
    # Guardar el mejor modelo para MLflow
    best_trained_model = best_model["trained_model"]
    best_predictions = best_model["predictions"]
    
    # Comparar con resultado anterior CON PCA
    print(f"\nüìà COMPARACI√ìN CON PCA:")
    print(f"   CON PCA (anterior): R¬≤ = -0.0043")
    print(f"   SIN PCA (ahora):    R¬≤ = {best_model['r2_original']:.4f}")
    
    if best_model['r2_original'] > -0.0043:
        print("   ‚úÖ ¬°MEJOR√çA SIGNIFICATIVA!")
    else:
        print("   ‚ö†Ô∏è  Resultado similar o peor")
else:
    print("\n‚ö†Ô∏è  No hay modelos v√°lidos para comparar")


In [13]:
# ----------------------------------------------------------------
# 7. Preparar datos para modelado (usando LOG como target)
# ----------------------------------------------------------------
print("\n7. Preparando datos para modelado (target logar√≠tmico)...")

# Entrenaremos el modelo para predecir el LOG del valor
train_final = train_pca.select(
    col("log_valor_contrato").alias("label_log"),  # Target logar√≠tmico
    col("features"),
    col("id_contrato"),
    col("fecha_firma"),
    col("valor_contrato")  # Mantenemos el original tambi√©n
)

test_final = test_pca.select(
    col("log_valor_contrato").alias("label_log"),  # Target logar√≠tmico
    col("features"),
    col("id_contrato"),
    col("fecha_firma"),
    col("valor_contrato")  # Mantenemos el original tambi√©n
)

print(f"   Train final: {train_final.count():,} registros")
print(f"   Test final:  {test_final.count():,} registros")


7. Preparando datos para modelado (target logar√≠tmico)...


                                                                                

   Train final: 79,800 registros
   Test final:  19,906 registros


In [14]:
# ----------------------------------------------------------------
# 8. Entrenar modelo para predecir LOG
# ----------------------------------------------------------------
print("\n8. Entrenando modelo para predecir log(valor_contrato)...")

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Modelo para predecir el log
lr = LinearRegression(
    featuresCol="features",
    labelCol="label_log",  # Target logar√≠tmico
    maxIter=100,
    regParam=0.1,
    elasticNetParam=0.8
)

lr_model = lr.fit(train_final)
predictions_log = lr_model.transform(test_final)


8. Entrenando modelo para predecir log(valor_contrato)...


                                                                                

In [15]:
# ----------------------------------------------------------------
# 9. Convertir predicciones logar√≠tmicas a escala original
# ----------------------------------------------------------------
print("\n9. Convirtiendo predicciones a escala original...")

# Convertir de vuelta: exp(predicci√≥n_log) - 1
predictions_final = predictions_log.withColumn(
    "prediction_original", 
    F.expm1(col("prediction"))  # exp(x) - 1
)

# Para referencia, tambi√©n mantenemos la predicci√≥n logar√≠tmica
predictions_final = predictions_final.withColumnRenamed("prediction", "prediction_log")

print("   ‚úì Predicciones convertidas a escala original")



9. Convirtiendo predicciones a escala original...
   ‚úì Predicciones convertidas a escala original


In [16]:
# ----------------------------------------------------------------
# 10. CALCULAR M√âTRICAS EN AMBAS ESCALAS
# ----------------------------------------------------------------
print("\n10. Calculando m√©tricas...")

# M√©tricas en escala logar√≠tmica
evaluator_log_rmse = RegressionEvaluator(
    labelCol="label_log", 
    predictionCol="prediction_log", 
    metricName="rmse"
)
evaluator_log_r2 = RegressionEvaluator(
    labelCol="label_log", 
    predictionCol="prediction_log", 
    metricName="r2"
)

rmse_log = evaluator_log_rmse.evaluate(predictions_final)
r2_log = evaluator_log_r2.evaluate(predictions_final)

# M√©tricas en escala original
evaluator_original_rmse = RegressionEvaluator(
    labelCol="valor_contrato", 
    predictionCol="prediction_original", 
    metricName="rmse"
)
evaluator_original_r2 = RegressionEvaluator(
    labelCol="valor_contrato", 
    predictionCol="prediction_original", 
    metricName="r2"
)
evaluator_original_mae = RegressionEvaluator(
    labelCol="valor_contrato", 
    predictionCol="prediction_original", 
    metricName="mae"
)

rmse_original = evaluator_original_rmse.evaluate(predictions_final)
r2_original = evaluator_original_r2.evaluate(predictions_final)
mae_original = evaluator_original_mae.evaluate(predictions_final)

# Calcular sigma en escala original
predictions_with_error = predictions_final.withColumn(
    "error_original", 
    col("prediction_original") - col("valor_contrato")
)
sigma_original = predictions_with_error.select(
    stddev("error_original").alias("sigma")
).first()["sigma"]

print(f"\nüìä M√âTRICAS EN ESCALA LOGAR√çTMICA:")
print(f"   RMSE (log): {rmse_log:.4f}")
print(f"   R¬≤ (log):   {r2_log:.4f}")

print(f"\nüìä M√âTRICAS EN ESCALA ORIGINAL:")
print(f"   RMSE: ${rmse_original:,.2f}")
print(f"   MAE:  ${mae_original:,.2f}")
print(f"   R¬≤:   {r2_original:.4f}")
print(f"   Sigma: ${sigma_original:,.2f}")


10. Calculando m√©tricas...

üìä M√âTRICAS EN ESCALA LOGAR√çTMICA:
   RMSE (log): 1.4145
   R¬≤ (log):   0.2997

üìä M√âTRICAS EN ESCALA ORIGINAL:
   RMSE: $1,673,553,489.93
   MAE:  $162,062,709.30
   R¬≤:   -0.0043
   Sigma: $1,667,071,830.58


In [None]:
# ----------------------------------------------------------------
# 11. REGISTRAR EN MLFLOW (TODAS LAS M√âTRICAS)
# ----------------------------------------------------------------
print("\n" + "="*80)
print("REGISTRO EN MLFLOW")
print("="*80)

import mlflow
import mlflow.spark
import json
from datetime import datetime

# Configurar MLflow
MLFLOW_TRACKING_URI = "http://172.17.0.1:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("contract_value_regression_log")

# Iniciar run de MLflow
with mlflow.start_run(run_name=f"linear_regression_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    
    # ========== 11.1 Registrar par√°metros ==========
    print("üìù Registrando par√°metros...")
    
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("target_transform", "log1p")
    mlflow.log_param("regParam", 0.1)
    mlflow.log_param("elasticNetParam", 0.8)
    mlflow.log_param("maxIter", 100)
    mlflow.log_param("train_size", train_final.count())
    mlflow.log_param("test_size", test_final.count())
    mlflow.log_param("features_count", len(feature_cols))
    mlflow.log_param("pca_components", pca.getK())
    mlflow.log_param("target_encoding_smoothing", 50)
    mlflow.log_param("target_encoding_scale", "logarithmic")
    
    print("   ‚úì Par√°metros registrados")
    
    # ========== 11.2 Registrar m√©tricas ==========
    print("üìà Registrando m√©tricas...")
    
    # M√©tricas en escala logar√≠tmica
    mlflow.log_metric("test_rmse_log", rmse_log)
    mlflow.log_metric("test_r2_log", r2_log)
    
    # M√©tricas en escala original
    mlflow.log_metric("test_rmse_original", rmse_original)
    mlflow.log_metric("test_mae_original", mae_original)
    mlflow.log_metric("test_r2_original", r2_original)
    mlflow.log_metric("sigma_original", sigma_original)
    mlflow.log_metric("anomaly_threshold_2.8sigma", 2.8 * sigma_original)
    
    # M√©tricas adicionales
    mean_valor = train_final.agg(F.mean("valor_contrato")).first()[0]
    mlflow.log_metric("rmse_to_mean_ratio", rmse_original / mean_valor)
    mlflow.log_metric("mean_valor_contrato", mean_valor)
    
    print("   ‚úì M√©tricas registradas")
    
    # ========== 11.3 Registrar modelo ==========
    print("üíæ Registrando modelo...")
    
    # Registrar el modelo que predice LOG
    mlflow.spark.log_model(
        spark_model=lr_model,
        artifact_path="model_log",
        registered_model_name="contract_value_predictor_log_v3"
    )
    
    # Tambi√©n guardar funci√≥n de conversi√≥n
    def convert_to_original_udf():
        """Funci√≥n para convertir predicciones logar√≠tmicas a originales"""
        return F.udf(lambda x: float(np.expm1(x)) if x is not None else None)
    
    print("   ‚úì Modelo registrado")
    
    # ========== 11.4 Registrar artifacts ==========
    print("üìé Registrando artifacts...")
    
    # Guardar m√©tricas en JSON
    metrics_dict = {
        "model_type": "LinearRegression",
        "target_transform": "log1p",
        "test_rmse_log": float(rmse_log),
        "test_r2_log": float(r2_log),
        "test_rmse_original": float(rmse_original),
        "test_r2_original": float(r2_original),
        "test_mae_original": float(mae_original),
        "sigma_original": float(sigma_original),
        "train_size": int(train_final.count()),
        "test_size": int(test_final.count()),
        "features": feature_cols,
        "run_timestamp": datetime.now().isoformat()
    }
    
    metrics_path = "/tmp/model_metrics_log.json"
    with open(metrics_path, 'w') as f:
        json.dump(metrics_dict, f, indent=2)
    
    mlflow.log_artifact(metrics_path, "metrics")
    
    # Guardar sample de predicciones (originales y log)
    predictions_sample = predictions_final.select(
        "id_contrato", 
        "valor_contrato", 
        "prediction_log",
        "prediction_original",
        "fecha_firma"
    ).limit(100).toPandas()
    
    predictions_path = "/tmp/predictions_sample_log.csv"
    predictions_sample.to_csv(predictions_path, index=False)
    mlflow.log_artifact(predictions_path, "predictions")
    
    print("   ‚úì Artifacts registrados")
    
    # ========== 11.5 Registrar tags ==========
    print("üè∑Ô∏è  Registrando tags...")
    
    mlflow.set_tag("framework", "PySpark")
    mlflow.set_tag("spark_version", spark.version)
    mlflow.set_tag("model_version", "v3.0_log")
    mlflow.set_tag("data_source", "contratos_publicos")
    mlflow.set_tag("target_variable", "valor_contrato")
    mlflow.set_tag("target_transform", "log1p")
    mlflow.set_tag("encoding_strategy", "hybrid_log")
    
    print("   ‚úì Tags registrados")
    
    # ========== 11.6 Informaci√≥n del run ==========
    run_id = mlflow.active_run().info.run_id
    experiment_id = mlflow.active_run().info.experiment_id
    
    print(f"\n‚úÖ RUN COMPLETADO:")
    print(f"   Run ID: {run_id}")
    print(f"   Experiment ID: {experiment_id}")
    print(f"   MLflow UI: {MLFLOW_TRACKING_URI}")
