## Fase 2

In [1]:
# ============================================================================
# FASE 2 - OPTIMIZADO PARA SPARK 3.5.1 + DELTA LAKE 3.0
# ============================================================================

# PASO 0: REINICIAR SPARK CON VERSIONES CORRECTAS
try:
    spark.stop()
except:
    pass

import time
time.sleep(3)

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, lower, regexp_replace, translate, length, trim

from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, Word2Vec, 
    StringIndexer, OneHotEncoder, VectorAssembler,
    StandardScaler, PCA
)
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
import numpy as np

spark = (
    SparkSession.builder
    .appName("Bronze_to_Silver_Optimized")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
            "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
print(f" Spark {spark.version} iniciado\n")


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f489ff4e-b661-45da-a176-ee1ac13ef7fc;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found io.delta#delta-spark_

 Spark 3.5.1 iniciado



In [13]:
# ============================================================================
# 1. LECTURA DE KAFKA
# ============================================================================

print("="*80)
print("PASO 1: LECTURA DE KAFKA")
print("="*80 + "\n")

contract_schema = StructType([
    StructField("id_contrato", StringType()),
    StructField("objeto_contrato", StringType()),
    StructField("entidad", StringType()),
    StructField("departamento", StringType()),
    StructField("municipio", StringType()),
    StructField("region", StringType()),
    StructField("codigo_unspsc", StringType()),
    StructField("descripcion_categoria", StringType()),
    StructField("valor_contrato", DoubleType()),
    StructField("duracion_dias", IntegerType()),
    StructField("fecha_firma", StringType()),
    StructField("tipo_contrato", StringType()),
    StructField("estado_contrato", StringType()),
    StructField("modalidad", StringType()),
    StructField("anno", IntegerType()),
    StructField("id_interno_sistema", StringType()),
    StructField("campo_vacio", StringType()),
    StructField("constante_1", StringType()),
    StructField("constante_2", IntegerType()),
    StructField("duplicate_id", StringType()),
    StructField("timestamp_carga", StringType())
])

print("Leyendo Kafka...")

df_kafka = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "contratos-publicos") \
    .option("startingOffsets", "earliest") \
    .load()

df_bronze = df_kafka.select(
    from_json(col("value").cast("string"), contract_schema).alias("data")
).select("data.*")

df_bronze = df_bronze.cache()
total_kafka = df_bronze.count()

print(f" Mensajes: {total_kafka:,}\n")



PASO 1: LECTURA DE KAFKA

Leyendo Kafka...


[Stage 52:>                                                         (0 + 1) / 1]

 Mensajes: 100,698



                                                                                

In [14]:
# ============================================================================
# 2. ELIMINAR REDUNDANTES Y PREPARAR DATOS
# ============================================================================
print("="*80)
print("PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS")
print("="*80 + "\n")

# Columnas redundantes a eliminar
redundant_columns = [
    "id_interno_sistema",
    "campo_vacio",
    "constante_1",
    "constante_2",
    "duplicate_id",
    "timestamp_carga"
]

print(f" Eliminando {len(redundant_columns)} columnas redundantes...")
df_cleaned = df_bronze.drop(*redundant_columns)

print(f"Columnas restantes: {len(df_cleaned.columns)}")
print()

print(" Preparando campo fecha_firma...")
print("   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)")
print("   Convirtiendo a: date (2024-01-04)")

df_cleaned = (
    df_cleaned
    .withColumn("fecha_firma_temp", to_timestamp(col("fecha_firma")))
    .withColumn("fecha_firma", to_date(col("fecha_firma_temp")))
    .drop("fecha_firma_temp")
)

print(" Fecha convertida correctamente\n")

# Liberar bronze ahora que ya no lo necesitamos
print(" Liberando memoria de df_bronze...")
df_bronze.unpersist()
print(" Memoria liberada\n")

print("="*80)
print(f" Dataset preparado: {len(df_cleaned.columns)} columnas")
print("="*80 + "\n")


PASO 2: ELIMINAR REDUNDANTES Y PREPARAR DATOS

 Eliminando 6 columnas redundantes...
Columnas restantes: 15

 Preparando campo fecha_firma...
   Formato recibido: ISO timestamp (2024-01-04T00:00:00.000)
   Convirtiendo a: date (2024-01-04)
 Fecha convertida correctamente

 Liberando memoria de df_bronze...
 Memoria liberada

 Dataset preparado: 15 columnas



In [15]:
# ============================================================================
# CELDA 1: PREPARACI√ìN Y CONTEO INICIAL
# ============================================================================
print("="*80)
print("PASO 3: LIMPIEZA - PREPARACI√ìN")
print("="*80 + "\n")

print("Cacheando datos para an√°lisis...")
df_cleaned = df_cleaned.cache()
total_cleaned = df_cleaned.count()

print(f" Registros totales: {total_cleaned:,}\n")
print(f" Columnas: {len(df_cleaned.columns)}")
print(f" Datos cacheados¬†en¬†memoria\n")

PASO 3: LIMPIEZA - PREPARACI√ìN

Cacheando datos para an√°lisis...


[Stage 56:>                                                         (0 + 1) / 1]

 Registros totales: 100,698

 Columnas: 15
 Datos cacheados¬†en¬†memoria



                                                                                

In [16]:
# ============================================================================
# CELDA 2: AN√ÅLISIS DE NULOS (OPTIMIZADO)
# ============================================================================
print("="*80)
print("AN√ÅLISIS DE CALIDAD DE DATOS")
print("="*80 + "\n")

print("Analizando valores nulos en columnas cr√≠ticas...")

# Solo analizar columnas cr√≠ticas para ahorrar memoria
critical_columns = [
    "id_contrato",
    "objeto_contrato", 
    "valor_contrato",
    "fecha_firma",
    "entidad",
    "departamento",
    "duracion_dias"
]

# An√°lisis optimizado solo de columnas cr√≠ticas
null_analysis = df_cleaned.select([
    count(when(col(c).isNull(), c)).alias(c) 
    for c in critical_columns if c in df_cleaned.columns
]).collect()[0].asDict()

print("üìä Valores nulos en columnas cr√≠ticas:\n")
has_nulls = False
for col_name in critical_columns:
    if col_name in null_analysis:
        null_count = null_analysis[col_name]
        if null_count > 0:
            has_nulls = True
            pct = (null_count / total_cleaned) * 100
            print(f"   ‚ö†  {col_name}: {null_count:,} ({pct:.1f}%)")

if not has_nulls:
    print("   ‚úÖ No hay valores nulos en columnas cr√≠ticas")

print()


AN√ÅLISIS DE CALIDAD DE DATOS

Analizando valores nulos en columnas cr√≠ticas...
üìä Valores nulos en columnas cr√≠ticas:

   ‚ö†  fecha_firma: 695 (0.7%)
   ‚ö†  duracion_dias: 50,350 (50.0%)



In [17]:
# ============================================================================
# CELDA 3: APLICAR FILTROS DE LIMPIEZA
# ============================================================================
print("="*80)
print("APLICANDO FILTROS DE CALIDAD")
print("="*80 + "\n")

print("Aplicando reglas de limpieza:")
print("  ‚úì id_contrato no nulo")
print("  ‚úì objeto_contrato no nulo")
print("  ‚úì valor_contrato no nulo y > 0")
print("  ‚úì fecha_firma no nula")
print()

# Aplicar filtros paso a paso
# NOTA: fecha_firma ya fue convertida a date en el Paso 2
df_silver = df_cleaned \
    .filter(col("id_contrato").isNotNull()) \
    .filter(col("objeto_contrato").isNotNull()) \
    .filter(col("valor_contrato").isNotNull()) \
    .filter(col("valor_contrato") > 0) \
    .filter(col("fecha_firma").isNotNull())

print("‚úÖ Filtros aplicados correctamente\n")


APLICANDO FILTROS DE CALIDAD

Aplicando reglas de limpieza:
  ‚úì id_contrato no nulo
  ‚úì objeto_contrato no nulo
  ‚úì valor_contrato no nulo y > 0
  ‚úì fecha_firma no nula

‚úÖ Filtros aplicados correctamente



In [18]:
print(f"Spark version: {spark.version}")

Spark version: 3.5.1


In [19]:
# ============================================================================
# CELDA 4: CACHEAR RESULTADOS Y GENERAR REPORTE
# ============================================================================
print("="*80)
print("FINALIZANDO LIMPIEZA")
print("="*80 + "\n")

print("Cacheando datos limpios...")
df_silver = df_silver.cache()
total_silver = df_silver.count()

# Calcular estad√≠sticas
registros_descartados = total_cleaned - total_silver
pct_retenido = (total_silver / total_cleaned) * 100 if total_cleaned > 0 else 0
pct_descartado = (registros_descartados / total_cleaned) * 100 if total_cleaned > 0 else 0

print("\n" + "="*80)
print("üìä RESUMEN DE LIMPIEZA")
print("="*80)
print(f"  Registros iniciales:    {total_cleaned:,}")
print(f"  Registros finales:      {total_silver:,} ({pct_retenido:.1f}%)")
print(f"  Registros descartados:  {registros_descartados:,} ({pct_descartado:.1f}%)")
print("="*80 + "\n")

# Liberar memoria del DataFrame anterior
print("Liberando memoria del cache anterior...")
df_cleaned.unpersist()
print("‚úÖ Limpieza¬†completada\n")


FINALIZANDO LIMPIEZA

Cacheando datos limpios...

üìä RESUMEN DE LIMPIEZA
  Registros iniciales:    100,698
  Registros finales:      99,458 (98.8%)
  Registros descartados:  1,240 (1.2%)

Liberando memoria del cache anterior...
‚úÖ Limpieza¬†completada



In [20]:
# 1. Entidades
print("Top 5 entidades:")
df_silver.groupBy("entidad").count().orderBy(desc("count")).show(5, truncate=False)

# 2. Departamentos
print("\nTop 5 departamentos:")
df_silver.groupBy("departamento").count().orderBy(desc("count")).show(5, truncate=False)

# 3. Regi√≥n
print("\nDistribuci√≥n por regi√≥n:")
df_silver.groupBy("region").count().orderBy(desc("count")).show(truncate=False)

# 4. C√≥digo UNSPSC
print("\nTop 10 c√≥digos UNSPSC:")
df_silver.groupBy("codigo_unspsc").count().orderBy(desc("count")).show(10, truncate=False)

# 5. Categor√≠a UNSPSC
print("\nTop 10 categor√≠as UNSPSC:")
df_silver.groupBy("descripcion_categoria").count().orderBy(desc("count")).show(10, truncate=False)

# 6. Tipo de contrato
print("\nDistribuci√≥n por tipo de contrato:")
df_silver.groupBy("tipo_contrato").count().orderBy(desc("count")).show(truncate=False)

# 7. Estado del contrato
print("\nDistribuci√≥n del estado del contrato:")
df_silver.groupBy("estado_contrato").count().orderBy(desc("count")).show(truncate=False)

# 8. Modalidad de contrataci√≥n
print("\nTop 10 modalidades de contrataci√≥n:")
df_silver.groupBy("modalidad").count().orderBy(desc("count")).show(10, truncate=False)


Top 5 entidades:
+-------------------------------------------------+-----+
|entidad                                          |count|
+-------------------------------------------------+-----+
|MUNICIPIO DE SOACHA.                             |6356 |
|ALCALD√çA MUNICIPAL COTA                          |3988 |
|ESE MUNICIPAL DE SOACHA JULIO CESAR PE√ëALOZA*    |3822 |
|CUNDINAMARCA-ALCALDIA MUNICIPIO MOSQUERA         |3759 |
|empresa social del estado regi√≥n de salud soacha.|3152 |
+-------------------------------------------------+-----+
only showing top 5 rows


Top 5 departamentos:
+------------+-----+
|departamento|count|
+------------+-----+
|Cundinamarca|99458|
+------------+-----+


Distribuci√≥n por regi√≥n:
+--------------+-----+
|region        |count|
+--------------+-----+
|Centro-Oriente|99458|
+--------------+-----+


Top 10 c√≥digos UNSPSC:
+-------------+-----+
|codigo_unspsc|count|
+-------------+-----+
|             |50058|
|V1.80111600  |11391|
|V1.80111701  |4329 |
|V1.

In [22]:
from pyspark.sql.functions import min, max, avg, stddev, expr

# 10. Valor del contrato
print("\nEstad√≠sticas de valor_contrato:")
df_silver.select(
    min("valor_contrato").alias("min"),
    max("valor_contrato").alias("max"),
    avg("valor_contrato").alias("mean"),
    stddev("valor_contrato").alias("std")
).show()

# Percentiles
print("\nPercentiles de valor_contrato:")
df_silver.approxQuantile("valor_contrato", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)

# 11. Duraci√≥n en d√≠as
print("\nEstad√≠sticas de duracion_dias:")
df_silver.select(
    min("duracion_dias").alias("min"),
    max("duracion_dias").alias("max"),
    avg("duracion_dias").alias("mean"),
    stddev("duracion_dias").alias("std")
).show()

print("\nPercentiles de duracion_dias:")
df_silver.approxQuantile("duracion_dias", [0.01, 0.25, 0.5, 0.75, 0.99], 0.01)



Estad√≠sticas de valor_contrato:
+---+----------------+-------------------+--------------------+
|min|             max|               mean|                 std|
+---+----------------+-------------------+--------------------+
|1.0|1.50838540149E11|9.941466321590018E7|1.1521186504414532E9|
+---+----------------+-------------------+--------------------+


Percentiles de valor_contrato:

Estad√≠sticas de duracion_dias:
+---+----+-----------------+------------------+
|min| max|             mean|               std|
+---+----+-----------------+------------------+
|  0|4297|82.47422012591348|101.20091534465666|
+---+----+-----------------+------------------+


Percentiles de duracion_dias:


[0.0, 6.0, 40.0, 125.0, 4297.0]

In [24]:
print("\nTop a√±os:")
df_silver.groupBy("anno").count().orderBy(desc("anno")).show(10, truncate=False)

print("\nContratos por a√±o:")
df_silver.groupBy("anno").count().orderBy(desc("count")).show(10, truncate=False)

print("\nTop fechas de firma:")
df_silver.groupBy("fecha_firma").count().orderBy(desc("count")).show(10, truncate=False)



Top a√±os:
+----+-----+
|anno|count|
+----+-----+
|2025|68   |
|2024|95797|
|2023|3029 |
|2022|564  |
+----+-----+


Contratos por a√±o:
+----+-----+
|anno|count|
+----+-----+
|2024|95797|
|2023|3029 |
|2022|564  |
|2025|68   |
+----+-----+


Top fechas de firma:
+-----------+-----+
|fecha_firma|count|
+-----------+-----+
|2024-02-01 |1230 |
|2024-03-01 |1119 |
|2024-02-02 |860  |
|2024-02-05 |815  |
|2024-03-22 |794  |
|2024-02-09 |789  |
|2024-02-16 |744  |
|2024-02-06 |723  |
|2024-02-12 |715  |
|2024-09-02 |686  |
+-----------+-----+
only showing top 10 rows



In [25]:
# ============================================================================
# 5. GUARDAR EN DELTA LAKE
# ============================================================================

print("="*80)
print("PASO 5: GUARDAR EN DELTA LAKE")
print("="*80 + "\n")

DELTA_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"üíæ Guardando en: {DELTA_PATH}")

df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(DELTA_PATH)

print("‚úÖ Guardado exitosamente\n")

# ‚ö†Ô∏è LIBERAR todo
df_silver.unpersist()
spark.catalog.clearCache()
 

PASO 5: GUARDAR EN DELTA LAKE

üíæ Guardando en: /app/notebooks/delta_lake/silver_contracts


                                                                                

‚úÖ Guardado exitosamente



## Fase 3

In [2]:
print("\n" + "="*80)
print("PASO 1: CARGAR DATOS DESDE SILVER")
print("="*80 + "\n")

SILVER_PATH = "/app/notebooks/delta_lake/silver_contracts"
print(f"üìä Cargando: {SILVER_PATH}")

df_silver = spark.read.format("delta").load(SILVER_PATH)
df_silver = df_silver.cache()
total_records = df_silver.count()

print(f"‚úì Registros: {total_records:,}\n")


PASO 1: CARGAR DATOS DESDE SILVER

üìä Cargando: /app/notebooks/delta_lake/silver_contracts




‚úì Registros: 99,458



                                                                                

In [3]:
# ============================================================================
# PASO 2: LIMPIEZA DE TEXTO
# ============================================================================

print("="*80)
print("PASO 2: LIMPIEZA DE TEXTO")
print("="*80 + "\n")

src_chars = "√°√©√≠√≥√∫√º√±"
dst_chars = "aeiouun"

df_prepared = df_silver.withColumn(
    "objeto_limpio",
    trim(
        regexp_replace(
            regexp_replace(
                translate(
                    lower(col("objeto_contrato")),
                    src_chars,
                    dst_chars
                ),
                "[^a-z0-9\\s]", " "
            ),
            "\\s+", " "
        )
    )
).filter(length(col("objeto_limpio")) >= 10)

print(f"‚úì Registros despu√©s de limpieza: {df_prepared.count():,}\n")


PASO 2: LIMPIEZA DE TEXTO





‚úì Registros despu√©s de limpieza: 99,458



                                                                                

In [4]:
# ============================================================================
# PASO 3: TOKENIZACI√ìN Y STOPWORDS
# ============================================================================

print("="*80)
print("PASO 3: TOKENIZACI√ìN")
print("="*80 + "\n")

stopwords_es = [
    "el", "la", "de", "que", "y", "a", "en", "un", "ser", "se", "no",
    "por", "con", "su", "para", "como", "estar", "tener", "le", "lo",
    "pero", "hacer", "o", "este", "otro", "ese", "si", "ya", "ver",
    "dar", "muy", "sin", "sobre", "tambi√©n", "hasta", "a√±o", "entre",
    "del", "al", "los", "las", "uno", "una", "unos", "unas",
    "contrato", "contratos", "objeto", "prestacion", "prestaci√≥n",
    "servicio", "servicios", "suministro", "ejecucion", "ejecuci√≥n"
]

tokenizer = Tokenizer(inputCol="objeto_limpio", outputCol="palabras")
df_tokenized = tokenizer.transform(df_prepared)

remover = StopWordsRemover(
    inputCol="palabras",
    outputCol="palabras_sin_stopwords",
    stopWords=stopwords_es
)
df_filtered_words = remover.transform(df_tokenized)

# Filtrar palabras cortas
def clean_words(words):
    if not words:
        return []
    return [w for w in words if len(w) >= 3]

clean_udf = udf(clean_words, ArrayType(StringType()))

df_filtered = df_filtered_words.withColumn(
    "palabras_filtradas",
    clean_udf(col("palabras_sin_stopwords"))
).filter(size(col("palabras_filtradas")) > 0)

print(f"‚úì Registros despu√©s de filtrado: {df_filtered.count():,}\n")


PASO 3: TOKENIZACI√ìN





‚úì Registros despu√©s de filtrado: 99,458



                                                                                

In [5]:
# ============================================================================
# PASO 4: WORD2VEC
# ============================================================================

print("="*80)
print("PASO 4: WORD2VEC")
print("="*80 + "\n")

word2vec = Word2Vec(
    vectorSize=100,
    minCount=2,
    maxIter=10,
    seed=42,
    inputCol="palabras_filtradas",
    outputCol="embedding_raw"
)

print("‚è≥ Entrenando Word2Vec...")
word2vec_model = word2vec.fit(df_filtered)
df_embeddings = word2vec_model.transform(df_filtered)

vocab_size = len(word2vec_model.getVectors().collect())
print(f"‚úì Vocabulario: {vocab_size:,} palabras")
print(f"‚úì Embeddings¬†generados\n")


PASO 4: WORD2VEC

‚è≥ Entrenando Word2Vec...


                                                                                

‚úì Vocabulario: 14,465 palabras
‚úì Embeddings¬†generados



In [6]:
# ----------------------------------------------------------------
# 3. Transformaciones INDEPENDIENTES del target
# ----------------------------------------------------------------
print("\n2. Transformaciones categ√≥ricas (sin target)...")

from pyspark.ml.feature import StringIndexer, OneHotEncoder
import pyspark.sql.functions as F

# A. OneHot para variables de baja cardinalidad
low_card_cols = ["tipo_contrato", "estado_contrato", "modalidad"]

for col_name in low_card_cols:
    print(f"   OneHot para {col_name}...")
    
    indexer = StringIndexer(
        inputCol=col_name,
        outputCol=f"{col_name}_idx",
        handleInvalid="keep"
    )
    df_embeddings = indexer.fit(df_embeddings).transform(df_embeddings)
    
    encoder = OneHotEncoder(
        inputCol=f"{col_name}_idx",
        outputCol=f"{col_name}_ohe",
        dropLast=True
    )
    df_embeddings = encoder.fit(df_embeddings).transform(df_embeddings)

# B. Frequency Encoding para 'entidad'
print("\n   Frequency Encoding para 'entidad'...")
entidad_freq = df_embeddings.groupBy("entidad").count()
total_count = df_embeddings.count()
entidad_freq = entidad_freq.withColumn(
    "entidad_freq",
    col("count") / total_count
).select("entidad", "entidad_freq")

df_embeddings = df_embeddings.join(entidad_freq, "entidad", "left")

# C. Eliminar variables sin varianza
df_embeddings = df_embeddings.drop("departamento", "region")



2. Transformaciones categ√≥ricas (sin target)...
   OneHot para tipo_contrato...


                                                                                

   OneHot para estado_contrato...


                                                                                

   OneHot para modalidad...


                                                                                


   Frequency Encoding para 'entidad'...


                                                                                

In [7]:
# ----------------------------------------------------------------
# 4. DIVISI√ìN TEMPORAL (80/20)
# ----------------------------------------------------------------
print("\n3. Divisi√≥n temporal train/test...")

from datetime import datetime

# Convertir fecha para calcular percentil
df_temp = df_embeddings.withColumn(
    "fecha_num",
    col("fecha_firma").cast("timestamp").cast("long")
)

# Calcular percentil 80
q = df_temp.approxQuantile("fecha_num", [0.8], 0.01)
split_ts = q[0]
split_date = datetime.utcfromtimestamp(split_ts)

print(f"   Fecha de corte: {split_date}")

# Crear datasets de train y test
df_train_raw = df_embeddings.filter(col("fecha_firma") <= split_date)
df_test_raw = df_embeddings.filter(col("fecha_firma") > split_date)

print(f"   Train: {df_train_raw.count():,} registros")
print(f"   Test:  {df_test_raw.count():,} registros")



3. Divisi√≥n temporal train/test...


                                                                                

   Fecha de corte: 2024-09-27 00:00:00


                                                                                

   Train: 79,298 registros




   Test:  20,160 registros


                                                                                

In [8]:
# ----------------------------------------------------------------
# 5. Guardar datasets en Delta Lake
# ----------------------------------------------------------------
print("\n4. Guardando datasets en Delta Lake...")

# Ruta para los datasets preprocesados
TRAIN_RAW_PATH = "/app/notebooks/delta_lake/train_raw_v3"
TEST_RAW_PATH = "/app/notebooks/delta_lake/test_raw_v3"

# Guardar train
df_train_raw.write.format("delta").mode("overwrite").save(TRAIN_RAW_PATH)

# Guardar test
df_test_raw.write.format("delta").mode("overwrite").save(TEST_RAW_PATH)

print(f"   ‚úì Train guardado en: {TRAIN_RAW_PATH}")
print(f"   ‚úì Test guardado en:  {TEST_RAW_PATH}")

# ----------------------------------------------------------------
# 6. Guardar tambi√©n los modelos de transformaci√≥n
# ----------------------------------------------------------------
print("\n5. Guardando modelos de transformaci√≥n...")

MODELS_PATH = "/app/notebooks/models_v3"

# Guardar Word2Vec model (si lo tienes)
if 'word2vec_model' in locals():
    word2vec_model.save(f"{MODELS_PATH}/word2vec_model")

# Guardar StringIndexer models para referencia
for col_name in low_card_cols:
    indexer_model_path = f"{MODELS_PATH}/indexer_{col_name}"
    # Necesitar√≠as extraer el modelo del pipeline o guardar los mapeos

print("Modelos guardados")



4. Guardando datasets en Delta Lake...


                                                                                

   ‚úì Train guardado en: /app/notebooks/delta_lake/train_raw_v3
   ‚úì Test guardado en:  /app/notebooks/delta_lake/test_raw_v3

5. Guardando modelos de transformaci√≥n...


                                                                                

Modelos guardados


In [9]:
# ----------------------------------------------------------------
# 7. Informe final
# ----------------------------------------------------------------
print("\n" + "="*80)
print("RESUMEN FASE 3")
print("="*80)

print(f"‚úÖ Preprocesamiento completado")
print(f"üìä Train: {df_train_raw.count():,} registros")
print(f"üìä Test:  {df_test_raw.count():,} registros")
print()
print("üéØ Variables disponibles:")
print(f"  - embedding_raw: Word2Vec embeddings")
for col in low_card_cols:
    print(f"  - {col}_ohe: OneHot encoded")
print(f"  - entidad_freq: Frequency encoding")
print(f"  - valor_contrato: Target variable")
print(f"  - duracion_dias: Variable num√©rica")
print()
print("üìà Listo para Fase 4: Target Encoding y Modelado")


RESUMEN FASE 3
‚úÖ Preprocesamiento completado


                                                                                

üìä Train: 79,298 registros




üìä Test:  20,160 registros

üéØ Variables disponibles:
  - embedding_raw: Word2Vec embeddings
  - tipo_contrato_ohe: OneHot encoded
  - estado_contrato_ohe: OneHot encoded
  - modalidad_ohe: OneHot encoded
  - entidad_freq: Frequency encoding
  - valor_contrato: Target variable
  - duracion_dias: Variable num√©rica

üìà Listo para Fase 4: Target Encoding y Modelado


                                                                                

## Fase 4

In [2]:
import builtins  # <-- IMPORTANTE: Importar builtins
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# ================================================================
# FASE 4: MODELADO Y REGISTRO EN MLFLOW (CON LOG TRANSFORM)
# ================================================================

print("="*80)
print("FASE 4: MODELADO CON TRANSFORMACI√ìN LOGAR√çTMICA")
print("="*80)

# ----------------------------------------------------------------
# 1. Cargar datasets preprocesados
# ----------------------------------------------------------------
print("1. Cargando datasets preprocesados...")

TRAIN_RAW_PATH = "/app/notebooks/delta_lake/train_raw_v3"
TEST_RAW_PATH = "/app/notebooks/delta_lake/test_raw_v3"

train_raw = spark.read.format("delta").load(TRAIN_RAW_PATH).cache()
test_raw = spark.read.format("delta").load(TEST_RAW_PATH).cache()

print(f"   ‚úì Train: {train_raw.count():,} registros")
print(f"   ‚úì Test:  {test_raw.count():,} registros")


FASE 4: MODELADO CON TRANSFORMACI√ìN LOGAR√çTMICA
1. Cargando datasets preprocesados...




   ‚úì Train: 79,298 registros




   ‚úì Test:  20,160 registros


                                                                                

In [4]:
# ----------------------------------------------------------------
# 2. TRANSFORMACI√ìN LOGAR√çTMICA DEL TARGET
# ----------------------------------------------------------------
import pyspark.sql.functions as F
print("\n2. Aplicando transformaci√≥n logar√≠tmica al target...")

# Usamos log1p = log(1 + x) para evitar problemas con valores peque√±os
train_data = train_raw.withColumn("log_valor_contrato", F.log1p(col("valor_contrato")))
test_data = test_raw.withColumn("log_valor_contrato", F.log1p(col("valor_contrato")))

# Verificar estad√≠sticas antes/despu√©s
print("   Estad√≠sticas del target original vs logar√≠tmico:")
train_stats = train_data.select(
    F.mean("valor_contrato").alias("mean_original"),
    F.stddev("valor_contrato").alias("std_original"),
    F.mean("log_valor_contrato").alias("mean_log"),
    F.stddev("log_valor_contrato").alias("std_log")
).first()

print(f"   Original: mean=${train_stats['mean_original']:,.2f}, std=${train_stats['std_original']:,.2f}")
print(f"   Log: mean={train_stats['mean_log']:.2f}, std={train_stats['std_log']:.2f}")


2. Aplicando transformaci√≥n logar√≠tmica al target...
   Estad√≠sticas del target original vs logar√≠tmico:
   Original: mean=$81,402,743.06, std=$981,240,307.25
   Log: mean=16.72, std=1.12


In [5]:
# ----------------------------------------------------------------
# 3. Target Encoding usando el TARGET LOGAR√çTMICO
# ----------------------------------------------------------------
print("\n3. Target Encoding para 'codigo_unspsc' (usando target log)...")

def safe_target_encoding_log(train_df, test_df, cat_col, target_log_col="log_valor_contrato", m=50):
    """
    Target encoding usando el target en escala logar√≠tmica
    """
    # Calcular media global del LOG en train
    global_mean_log = train_df.agg(F.mean(target_log_col)).first()[0]
    
    # Calcular estad√≠sticas por categor√≠a en train (usando LOG)
    stats = train_df.groupBy(cat_col).agg(
        F.mean(target_log_col).alias("cat_mean_log"),
        F.count(target_log_col).alias("cat_count")
    )
    
    # Aplicar smoothing en escala logar√≠tmica
    stats = stats.withColumn(
        f"{cat_col}_te_log",
        (F.col("cat_count") * F.col("cat_mean_log") + m * global_mean_log) / 
        (F.col("cat_count") + m)
    ).select(cat_col, f"{cat_col}_te_log")
    
    # Aplicar a train
    train_encoded = train_df.join(stats, cat_col, "left")
    
    # Aplicar a test
    test_encoded = test_df.join(stats, cat_col, "left")
    
    # Para categor√≠as no vistas en train, usar global mean log
    test_encoded = test_encoded.fillna({f"{cat_col}_te_log": global_mean_log})
    
    return train_encoded, test_encoded

# Aplicar target encoding con target logar√≠tmico
train_data, test_data = safe_target_encoding_log(
    train_data, test_data,
    cat_col="codigo_unspsc",
    target_log_col="log_valor_contrato",
    m=50
)

print("   ‚úì codigo_unspsc_te_log creado (en escala log)")


3. Target Encoding para 'codigo_unspsc' (usando target log)...
   ‚úì codigo_unspsc_te_log creado (en escala log)


In [6]:
# ----------------------------------------------------------------
# 4. Ensamblar features (incluyendo el target encoding logar√≠tmico)
# ----------------------------------------------------------------
print("\n4. Ensamblando features...")

feature_cols = [
    "embedding_raw",
    "tipo_contrato_ohe",
    "estado_contrato_ohe", 
    "modalidad_ohe",
    "entidad_freq",
    "codigo_unspsc_te_log",  # ¬°Usamos la versi√≥n logar√≠tmica!
]

if "duracion_dias" in train_data.columns:
    train_data = train_data.fillna({"duracion_dias": 0})
    test_data = test_data.fillna({"duracion_dias": 0})
    feature_cols.append("duracion_dias")

print(f"   Features: {len(feature_cols)} dimensiones")
for feat in feature_cols:
    print(f"   - {feat}")

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_raw",
    handleInvalid="skip"
)

train_features = assembler.transform(train_data)
test_features = assembler.transform(test_data)



4. Ensamblando features...
   Features: 7 dimensiones
   - embedding_raw
   - tipo_contrato_ohe
   - estado_contrato_ohe
   - modalidad_ohe
   - entidad_freq
   - codigo_unspsc_te_log
   - duracion_dias


In [7]:
# ----------------------------------------------------------------
# 5. Normalizaci√≥n (CON VALIDACI√ìN)
# ----------------------------------------------------------------
print("\n5. Normalizando y validando...")

scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features_scaled",
    withStd=True,
    withMean=True
)

scaler_model = scaler.fit(train_features)
train_scaled = scaler_model.transform(train_features)
test_scaled = scaler_model.transform(test_features)


5. Normalizando y validando...


                                                                                

In [8]:
# ----------------------------------------------------------------
# 6. OMITIR PCA
# ----------------------------------------------------------------
print("\n6. Omitiendo PCA - usando features escaladas directamente")

train_final = train_scaled.select(
    col("log_valor_contrato").alias("label_log"),
    col("features_scaled").alias("features"),
    col("valor_contrato")
).cache()

test_final = test_scaled.select(
    col("log_valor_contrato").alias("label_log"),
    col("features_scaled").alias("features"),
    col("valor_contrato")
).cache()

dimensiones = len(train_final.select("features").first()[0])
print(f"   Dimensiones finales: {dimensiones}")

# Liberar DataFrames intermedios que ya no se usan
train_scaled.unpersist()
test_scaled.unpersist()
 


6. Omitiendo PCA - usando features escaladas directamente
   Dimensiones finales: 141


DataFrame[codigo_unspsc: string, entidad: string, id_contrato: string, objeto_contrato: string, municipio: string, descripcion_categoria: string, valor_contrato: double, duracion_dias: int, fecha_firma: date, tipo_contrato: string, estado_contrato: string, modalidad: string, anno: int, objeto_limpio: string, palabras: array<string>, palabras_sin_stopwords: array<string>, palabras_filtradas: array<string>, embedding_raw: vector, tipo_contrato_idx: double, tipo_contrato_ohe: vector, estado_contrato_idx: double, estado_contrato_ohe: vector, modalidad_idx: double, modalidad_ohe: vector, entidad_freq: double, log_valor_contrato: double, codigo_unspsc_te_log: double, features_raw: vector, features_scaled: vector]

In [9]:
# ----------------------------------------------------------------
# 2. RANDOM FOREST (entrenado y evaluado en escala logar√≠tmica)
# ----------------------------------------------------------------
print("\n" + "-"*80)
print("2. RANDOM FOREST (escala logar√≠tmica)")

rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label_log",
    numTrees=30,
    maxDepth=8,
    maxBins=32,
    subsamplingRate=0.8,
    seed=42
)

try:
    # ----------------------------------------------------------------
    # ENTRENAR MODELO
    # ----------------------------------------------------------------
    print("\nüîß Entrenando Random Forest...")
    rf_model = rf.fit(train_final)
    print("   ‚úÖ Modelo entrenado correctamente")

    # ----------------------------------------------------------------
    # CALCULAR SIGMA EN TRAIN SET (ESCALA LOGAR√çTMICA)
    # ----------------------------------------------------------------
    print("\nüìê Calculando œÉ en TRAIN set (escala logar√≠tmica)...")

    train_predictions = rf_model.transform(train_final)

    train_with_residuals = train_predictions.withColumn(
        "residual_log",
        F.col("label_log") - F.col("prediction")
    )

    sigma_log = train_with_residuals.agg(
        F.stddev("residual_log").alias("sigma")
    ).first()["sigma"]

    threshold_log = 2.8 * sigma_log

    print(f"   œÉ (escala log): {sigma_log:.4f}")
    print(f"   Threshold 2.8œÉ: {threshold_log:.4f}")
    print("   ‚úì Sigma calculado y listo para MLflow")

    # ----------------------------------------------------------------
    # PREDECIR EN TEST SET
    # ----------------------------------------------------------------
    print("\nüîÆ Generando predicciones en TEST...")
    rf_predictions = rf_model.transform(test_final)
    print("   ‚úì Predicciones generadas")

    # ----------------------------------------------------------------
    # M√âTRICAS EN ESCALA LOGAR√çTMICA
    # ----------------------------------------------------------------
    print("\nüìä Calculando m√©tricas en escala LOGAR√çTMICA...")

    evaluator_r2 = RegressionEvaluator(
        labelCol="label_log",
        predictionCol="prediction",
        metricName="r2"
    )
    evaluator_rmse = RegressionEvaluator(
        labelCol="label_log",
        predictionCol="prediction",
        metricName="rmse"
    )
    evaluator_mae = RegressionEvaluator(
        labelCol="label_log",
        predictionCol="prediction",
        metricName="mae"
    )

    rf_r2_log = evaluator_r2.evaluate(rf_predictions)
    rf_rmse_log = evaluator_rmse.evaluate(rf_predictions)
    rf_mae_log  = evaluator_mae.evaluate(rf_predictions)

    print(f"\nüìä RESULTADOS RANDOM FOREST (escala logar√≠tmica):")
    print(f"   R¬≤:    {rf_r2_log:.4f}")
    print(f"   RMSE:  {rf_rmse_log:.4f}")
    print(f"   MAE:   {rf_mae_log:.4f}")

except Exception as e:
    print(f"\n‚ö†Ô∏è Error con Random Forest: {str(e)[:120]}")
    print("   ‚Üí Probable falta de memoria o problema con DF.")



--------------------------------------------------------------------------------
2. RANDOM FOREST (escala logar√≠tmica)

üîß Entrenando Random Forest...


                                                                                

   ‚úÖ Modelo entrenado correctamente

üìê Calculando œÉ en TRAIN set (escala logar√≠tmica)...


                                                                                

   œÉ (escala log): 0.7763
   Threshold 2.8œÉ: 2.1736
   ‚úì Sigma calculado y listo para MLflow

üîÆ Generando predicciones en TEST...
   ‚úì Predicciones generadas

üìä Calculando m√©tricas en escala LOGAR√çTMICA...

üìä RESULTADOS RANDOM FOREST (escala logar√≠tmica):
   R¬≤:    0.4395
   RMSE:  1.2620
   MAE:   0.9908


In [15]:
# ----------------------------------------------------------------
# REGISTRAR EN MLFLOW
# ----------------------------------------------------------------
print("\n" + "="*80)
print("REGISTRO EN MLFLOW - RANDOM FOREST")
print("="*80)

import mlflow
import mlflow.spark
import json
from datetime import datetime

# Configurar MLflow
MLFLOW_TRACKING_URI = "http://172.17.0.1:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("contract_value_regression_log")

# Iniciar run de MLflow
with mlflow.start_run(run_name=f"random_forest_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):

    # ========== Registrar par√°metros ==========
    print("üìù Registrando par√°metros...")

    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("target_transform", "log1p")
    mlflow.log_param("numTrees", 30)
    mlflow.log_param("maxDepth", 8)
    mlflow.log_param("maxBins", 32)
    mlflow.log_param("subsamplingRate", 0.8)
    mlflow.log_param("seed", 42)
    mlflow.log_param("train_size", train_final.count())
    mlflow.log_param("test_size", test_final.count())
    mlflow.log_param("features_count", len(feature_cols))
    mlflow.log_param("target_encoding_smoothing", 50)
    mlflow.log_param("target_encoding_scale", "logarithmic")

    print("   ‚úì Par√°metros registrados")

    # ========== Registrar m√©tricas (SOLO ESCALA LOGAR√çTMICA) ==========
    print("üìà Registrando m√©tricas...")

    mlflow.log_metric("test_r2_log", rf_r2_log)
    mlflow.log_metric("test_rmse_log", rf_rmse_log)
    mlflow.log_metric("test_mae_log", rf_mae_log)

    # ‚≠ê M√©tricas para detecci√≥n de anomal√≠as
    mlflow.log_metric("sigma_log_train", sigma_log)
    mlflow.log_metric("anomaly_threshold_log", threshold_log)

    print(f"   ‚úì M√©tricas registradas")
    print(f"   ‚úì sigma_log_train: {sigma_log:.6f} ‚Üê GUARDADO EN MLFLOW")
    print(f"   ‚úì anomaly_threshold_log: {threshold_log:.6f} ‚Üê GUARDADO EN MLFLOW")

    # ========== Registrar modelo ==========
    print("üíæ Registrando modelo...")

    mlflow.spark.log_model(
        spark_model=rf_model,
        artifact_path="model_log",
        registered_model_name="contract_value_predictor_rf_log_v1"
    )

    print("   ‚úì Modelo registrado")

    # ========== Registrar artifacts ==========
    print("üìé Registrando artifacts...")

    metrics_dict = {
        "model_type": "RandomForestRegressor",
        "target_transform": "log1p",
        "hyperparameters": {
            "numTrees": 30,
            "maxDepth": 8,
            "maxBins": 32,
            "subsamplingRate": 0.8,
            "seed": 42
        },
        "metrics_log": {
            "r2": float(rf_r2_log),
            "rmse": float(rf_rmse_log),
            "mae": float(rf_mae_log)
        },
        "anomaly_detection": {
            "sigma_log_train": float(sigma_log),
            "threshold_2.8sigma_log": float(threshold_log),
            "method": "log_scale_residuals"
        },
        "data_info": {
            "train_size": int(train_final.count()),
            "test_size": int(test_final.count()),
            "features_count": len(feature_cols)
        },
        "run_timestamp": datetime.now().isoformat()
    }

    metrics_path = "/tmp/rf_model_metrics_log.json"
    with open(metrics_path, 'w') as f:
        json.dump(metrics_dict, f, indent=2)

    mlflow.log_artifact(metrics_path, "metrics")

    print("   ‚úì Artifacts registrados")

    # ========== Registrar tags ==========
    print("üè∑Ô∏è  Registrando tags...")

    mlflow.set_tag("framework", "PySpark")
    mlflow.set_tag("spark_version", spark.version)
    mlflow.set_tag("model_version", "v1.0_rf_log")
    mlflow.set_tag("data_source", "contratos_publicos")
    mlflow.set_tag("target_variable", "valor_contrato")
    mlflow.set_tag("target_transform", "log1p")
    mlflow.set_tag("encoding_strategy", "hybrid_log")
    mlflow.set_tag("algorithm", "RandomForest")
    mlflow.set_tag("anomaly_detection_method", "sigma_log_train")

    print("   ‚úì Tags registrados")

    # ========== Informaci√≥n del run ==========
    run_id = mlflow.active_run().info.run_id
    experiment_id = mlflow.active_run().info.experiment_id

    print(f"\n‚úÖ RUN COMPLETADO - RANDOM FOREST:")
    print(f"   Run ID: {run_id}")
    print(f"   Experiment ID: {experiment_id}")
    print(f"   MLflow UI: {MLFLOW_TRACKING_URI}")
    print(f"\n   üìä M√©tricas (escala logar√≠tmica):")
    print(f"      R¬≤:   {rf_r2_log:.4f}")
    print(f"      RMSE: {rf_rmse_log:.4f}")
    print(f"      MAE:  {rf_mae_log:.4f}")
    print(f"\n   üéØ Detecci√≥n de anomal√≠as:")
    print(f"      œÉ (log, train): {sigma_log:.4f}")
    print(f"      Threshold 2.8œÉ: {threshold_log:.4f}")

print(f"\n‚úÖ Variables guardadas para detecci√≥n de anomal√≠as:")
print(f"   - rf_model: Modelo entrenado")
print(f"   - rf_predictions: Predicciones en test set")
print(f"   - sigma_log: {sigma_log:.4f}")
print(f"   - threshold_log: {threshold_log:.4f}")

 


REGISTRO EN MLFLOW - RANDOM FOREST
üìù Registrando par√°metros...
   ‚úì Par√°metros registrados
üìà Registrando m√©tricas...
   ‚úì M√©tricas registradas
   ‚úì sigma_log_train: 0.776302 ‚Üê GUARDADO EN MLFLOW
   ‚úì anomaly_threshold_log: 2.173647 ‚Üê GUARDADO EN MLFLOW
üíæ Registrando modelo...


Registered model 'contract_value_predictor_rf_log_v1' already exists. Creating a new version of this model...
2025/12/11 20:45:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: contract_value_predictor_rf_log_v1, version 4
Created version '4' of model 'contract_value_predictor_rf_log_v1'.


   ‚úì Modelo registrado
üìé Registrando artifacts...


                                                                                

   ‚úì Artifacts registrados
üè∑Ô∏è  Registrando tags...
   ‚úì Tags registrados

‚úÖ RUN COMPLETADO - RANDOM FOREST:
   Run ID: 74280a268f2a4be8b98d49bd88277335
   Experiment ID: 3
   MLflow UI: http://172.17.0.1:5000

   üìä M√©tricas (escala logar√≠tmica):
      R¬≤:   0.4395
      RMSE: 1.2620
      MAE:  0.9908

   üéØ Detecci√≥n de anomal√≠as:
      œÉ (log, train): 0.7763
      Threshold 2.8œÉ: 2.1736
üèÉ View run random_forest_log_20251211_204524 at: http://172.17.0.1:5000/#/experiments/3/runs/74280a268f2a4be8b98d49bd88277335
üß™ View experiment at: http://172.17.0.1:5000/#/experiments/3

‚úÖ Variables guardadas para detecci√≥n de anomal√≠as:
   - rf_model: Modelo entrenado
   - rf_predictions: Predicciones en test set
   - sigma_log: 0.7763
   - threshold_log: 2.1736


## Fase 5

In [16]:
# ================================================================
# FASE 5: DETECCI√ìN DE AT√çPICOS (REGLA DE NEGOCIO)
# ================================================================
# Implementaci√≥n de la regla del tablero para detectar corrupci√≥n 
# o sobrecostos en contratos p√∫blicos
# ================================================================

print("\n" + "="*80)
print("FASE 5: DETECCI√ìN DE AT√çPICOS (REGLA DE NEGOCIO)")
print("="*80)

# ----------------------------------------------------------------
# 1. VERIFICAR QUE TENEMOS SIGMA_LOG CALCULADO
# ----------------------------------------------------------------
print("\n1Ô∏è‚É£ Verificando sigma calculado...")

if 'sigma_log' not in locals():
    raise ValueError("‚ùå sigma_log no est√° definido. Aseg√∫rate de haber ejecutado el entrenamiento del modelo primero.")

if 'rf_predictions' not in locals():
    raise ValueError("‚ùå rf_predictions no est√° definido. Aseg√∫rate de haber ejecutado las predicciones primero.")

print(f"   ‚úÖ sigma_log disponible: {sigma_log:.6f}")
print(f"   ‚úÖ Threshold 2.8œÉ: {threshold_log:.6f}")

# ----------------------------------------------------------------
# 2. APLICAR REGLA DE DETECCI√ìN EN TEST SET
# ----------------------------------------------------------------
print("\n2Ô∏è‚É£ Aplicando regla de detecci√≥n...")

# Calcular desviaci√≥n en ESCALA LOGAR√çTMICA
# Desviaci√≥n = ValorReal - ValorPredicho
test_with_deviation = rf_predictions.withColumn(
    "desviacion_log",
    F.col("label_log") - F.col("prediction")  # Real - Predicho (en escala log)
)

# Aplicar regla del tablero:
# Si ValorReal > (ValorPredicho + 2.8œÉ) ‚Üí AT√çPICO
# Equivalente a: Si Desviaci√≥n > 2.8œÉ ‚Üí AT√çPICO
test_with_anomalies = test_with_deviation.withColumn(
    "anomaly_flag",
    F.when(F.col("desviacion_log") > threshold_log, "ATIPICO")
     .otherwise("LIBRE")
)

print(f"   ‚úÖ Regla aplicada: Desviaci√≥n > {threshold_log:.4f} ‚Üí AT√çPICO")

# ----------------------------------------------------------------
# 3. CALCULAR ESTAD√çSTICAS DE DETECCI√ìN
# ----------------------------------------------------------------
print("\n3Ô∏è‚É£ Calculando estad√≠sticas...")

# Contar at√≠picos y libres
anomaly_counts = test_with_anomalies.groupBy("anomaly_flag").count().collect()
anomaly_dict = {row["anomaly_flag"]: row["count"] for row in anomaly_counts}

total_contratos = test_with_anomalies.count()
atipicos = anomaly_dict.get("ATIPICO", 0)
libres = anomaly_dict.get("LIBRE", 0)
pct_atipicos = (atipicos / total_contratos * 100) if total_contratos > 0 else 0

print(f"\nüìä RESULTADOS DE DETECCI√ìN:")
print(f"   Total contratos analizados: {total_contratos:,}")
print(f"   Contratos AT√çPICOS: {atipicos:,} ({pct_atipicos:.2f}%)")
print(f"   Contratos LIBRES: {libres:,} ({100-pct_atipicos:.2f}%)")

# ----------------------------------------------------------------
# 4. AGREGAR COLUMNAS ADICIONALES PARA AN√ÅLISIS
# ----------------------------------------------------------------
print("\n4Ô∏è‚É£ Enriqueciendo datos...")

# Calcular Z-score (cu√°ntos sigmas se desv√≠a)
gold_anomalies = test_with_anomalies.withColumn(
    "z_score",
    F.col("desviacion_log") / sigma_log
).withColumn(
    "sigma_threshold",
    F.lit(threshold_log)
).withColumn(
    "detection_timestamp",
    F.current_timestamp()
).withColumn(
    "model_version",
    F.lit("rf_log_v1")
)

# Agregar explicaci√≥n del z-score
gold_anomalies = gold_anomalies.withColumn(
    "severity",
    F.when(F.col("z_score") <= 2.8, "NORMAL")
     .when((F.col("z_score") > 2.8) & (F.col("z_score") <= 3.5), "LEVE")
     .when((F.col("z_score") > 3.5) & (F.col("z_score") <= 4.5), "MODERADO")
     .when(F.col("z_score") > 4.5, "SEVERO")
     .otherwise("NORMAL")
)

print("   ‚úÖ Columnas agregadas:")
print("      - z_score: Cu√°ntos sigmas se desv√≠a del predicho")
print("      - severity: Clasificaci√≥n de severidad (NORMAL, LEVE, MODERADO, SEVERO)")
print("      - detection_timestamp: Timestamp de detecci√≥n")

# ----------------------------------------------------------------
# 5. GUARDAR EN TABLA DELTA: gold_anomalies
# ----------------------------------------------------------------
print("\n5Ô∏è‚É£ Guardando en tabla Delta...")

# Definir ruta de salida
output_path = "s3a://gold/anomalies/contract_anomalies"

# Seleccionar columnas finales
gold_anomalies_final = gold_anomalies.select(
    "valor_contrato",
    F.col("label_log").alias("valor_log"),
    F.col("prediction").alias("valor_predicho_log"),
    "desviacion_log",
    "z_score",
    "sigma_threshold",
    "anomaly_flag",
    "severity",
    "detection_timestamp",
    "model_version"
)

print(f"   Guardando en: {output_path}")

try:
    # Guardar como tabla Delta
    gold_anomalies_final.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .save(output_path)
    
    print("   ‚úÖ Tabla Delta guardada exitosamente")
    
    # Registrar en el cat√°logo de Spark
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS gold.anomalies
        USING DELTA
        LOCATION '{output_path}'
    """)
    
    print("   ‚úÖ Tabla registrada en cat√°logo: gold.anomalies")
    
except Exception as e:
    print(f"   ‚ö†Ô∏è  Error al guardar tabla Delta: {str(e)[:200]}")
    print("   Intentando guardar en ruta alternativa...")
    
    # Ruta alternativa local
    output_path_local = "/tmp/gold_anomalies"
    gold_anomalies_final.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .save(output_path_local)
    
    print(f"   ‚úÖ Tabla guardada en: {output_path_local}")
 


FASE 5: DETECCI√ìN DE AT√çPICOS (REGLA DE NEGOCIO)

1Ô∏è‚É£ Verificando sigma calculado...
   ‚úÖ sigma_log disponible: 0.776302
   ‚úÖ Threshold 2.8œÉ: 2.173647

2Ô∏è‚É£ Aplicando regla de detecci√≥n...
   ‚úÖ Regla aplicada: Desviaci√≥n > 2.1736 ‚Üí AT√çPICO

3Ô∏è‚É£ Calculando estad√≠sticas...


                                                                                


üìä RESULTADOS DE DETECCI√ìN:
   Total contratos analizados: 20,160
   Contratos AT√çPICOS: 622 (3.09%)
   Contratos LIBRES: 19,538 (96.91%)

4Ô∏è‚É£ Enriqueciendo datos...
   ‚úÖ Columnas agregadas:
      - z_score: Cu√°ntos sigmas se desv√≠a del predicho
      - severity: Clasificaci√≥n de severidad (NORMAL, LEVE, MODERADO, SEVERO)
      - detection_timestamp: Timestamp de detecci√≥n

5Ô∏è‚É£ Guardando en tabla Delta...
   Guardando en: s3a://gold/anomalies/contract_anomalies
   ‚ö†Ô∏è  Error al guardar tabla Delta: An error occurred while calling o2145.save.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configurat
   Intentando guardar en ruta alternativa...




   ‚úÖ Tabla guardada en: /tmp/gold_anomalies


                                                                                

## Fase 6