## Fase 2

In [3]:
# ============================================================================
# FASE 2 - OPTIMIZADO PARA SPARK 3.5.1 + DELTA LAKE 3.0
# ============================================================================

# PASO 0: REINICIAR SPARK CON VERSIONES CORRECTAS
try:
    spark.stop()
except:
    pass

import time
time.sleep(3)

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, lower, regexp_replace, translate, length, trim

from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, Word2Vec, 
    StringIndexer, OneHotEncoder, VectorAssembler,
    StandardScaler, PCA
)
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
import numpy as np

spark = (
    SparkSession.builder
    .appName("Bronze_to_Silver_Optimized")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
            "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
print(f" Spark {spark.version} iniciado\n")


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9a1c4cee-7c26-4670-af12-676610dc2648;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found io.delta#delta-spark_

 Spark 3.5.1 iniciado



In [2]:
# ============================================================================
# 1. LECTURA DE KAFKA
# ============================================================================

print("="*80)
print("PASO 1: LECTURA DE KAFKA")
print("="*80 + "\n")

contract_schema = StructType([
    StructField("id_contrato", StringType()),
    StructField("objeto_contrato", StringType()),
    StructField("entidad", StringType()),
    StructField("departamento", StringType()),
    StructField("municipio", StringType()),
    StructField("region", StringType()),
    StructField("codigo_unspsc", StringType()),
    StructField("descripcion_categoria", StringType()),
    StructField("valor_contrato", DoubleType()),
    StructField("duracion_dias", IntegerType()),
    StructField("fecha_firma", StringType()),
    StructField("tipo_contrato", StringType()),
    StructField("estado_contrato", StringType()),
    StructField("modalidad", StringType()),
    StructField("anno", IntegerType()),
    StructField("id_interno_sistema", StringType()),
    StructField("campo_vacio", StringType()),
    StructField("constante_1", StringType()),
    StructField("constante_2", IntegerType()),
    StructField("duplicate_id", StringType()),
    StructField("timestamp_carga", StringType())
])

print("Leyendo Kafka...")

df_kafka = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "contratos-publicos") \
    .option("startingOffsets", "earliest") \
    .load()

df_bronze = df_kafka.select(
    from_json(col("value").cast("string"), contract_schema).alias("data")
).select("data.*")

df_bronze = df_bronze.cache()
total_kafka = df_bronze.count()

print(f" Mensajes: {total_kafka:,}\n")



PASO 1: LECTURA DE KAFKA

Leyendo Kafka...


[Stage 0:>                                                          (0 + 1) / 1]

 Mensajes: 50,349



                                                                                

In [3]:
# ============================================================================
# 2. ELIMINAR REDUNDANTES
# ============================================================================

print("="*80)
print("PASO 2: ELIMINAR REDUNDANTES")
print("="*80 + "\n")

redundant_columns = [
    "id_interno_sistema", "campo_vacio", "constante_1",
    "constante_2", "duplicate_id", "timestamp_carga"
]

print(f" Eliminando {len(redundant_columns)} columnas redundantes")

df_cleaned = df_bronze.drop(*redundant_columns)

print(f" Columnas restantes: {len(df_cleaned.columns)}\n")

#  LIBERAR bronze, ya no lo necesitamos
df_bronze.unpersist()

PASO 2: ELIMINAR REDUNDANTES

 Eliminando 6 columnas redundantes
 Columnas restantes: 15



DataFrame[id_contrato: string, objeto_contrato: string, entidad: string, departamento: string, municipio: string, region: string, codigo_unspsc: string, descripcion_categoria: string, valor_contrato: double, duracion_dias: int, fecha_firma: string, tipo_contrato: string, estado_contrato: string, modalidad: string, anno: int, id_interno_sistema: string, campo_vacio: string, constante_1: string, constante_2: int, duplicate_id: string, timestamp_carga: string]

In [4]:
# ============================================================================
# 3. LIMPIEZA
# ============================================================================

print("="*80)
print("PASO 3: LIMPIEZA")
print("="*80 + "\n")

df_cleaned = df_cleaned.cache()
total_cleaned = df_cleaned.count()

print(f" Registros: {total_cleaned:,}\n")

# An√°lisis de nulos optimizado
null_counts = df_cleaned.select([
    count(when(col(c).isNull(), c)).alias(c) 
    for c in df_cleaned.columns
]).collect()[0].asDict()

print("Columnas con nulos:")
for col_name, null_count in sorted(null_counts.items(), key=lambda x: x[1], reverse=True):
    if null_count > 0:
        pct = (null_count / total_cleaned) * 100
        print(f"   {col_name}: {null_count:,} ({pct:.1f}%)")

# Limpieza
df_silver = df_cleaned \
    .filter(col("id_contrato").isNotNull()) \
    .filter(col("objeto_contrato").isNotNull()) \
    .filter(col("valor_contrato").isNotNull()) \
    .filter(col("valor_contrato") > 0) \
    .filter(col("fecha_firma").isNotNull()) \
    .withColumn("fecha_firma", to_date(col("fecha_firma"), "yyyy-MM-dd"))

df_silver = df_silver.cache()
total_silver = df_silver.count()

print(f"\n Limpieza:")
print(f"   Antes: {total_cleaned:,}")
print(f"   Despu√©s: {total_silver:,}")
print(f"   Descartados: {total_cleaned - total_silver:,}\n")

df_cleaned.unpersist()


PASO 3: LIMPIEZA



                                                                                

 Registros: 50,349

Columnas con nulos:
   duracion_dias: 50,349 (100.0%)

 Limpieza:
   Antes: 50,349
   Despu√©s: 50,058
   Descartados: 291



DataFrame[id_contrato: string, objeto_contrato: string, entidad: string, departamento: string, municipio: string, region: string, codigo_unspsc: string, descripcion_categoria: string, valor_contrato: double, duracion_dias: int, fecha_firma: string, tipo_contrato: string, estado_contrato: string, modalidad: string, anno: int]

In [5]:
# ============================================================================
# 4. ESTAD√çSTICAS
# ============================================================================

print("="*80)
print("PASO 4: ESTAD√çSTICAS")
print("="*80 + "\n")

print("üìä Por regi√≥n:")
df_silver.groupBy("region").count().orderBy(desc("count")).show(5)

print("\nüìä Top 5 entidades:")
df_silver.groupBy("entidad").count().orderBy(desc("count")).show(5, truncate=False)


PASO 4: ESTAD√çSTICAS

üìä Por regi√≥n:
+--------------+-----+
|        region|count|
+--------------+-----+
|Centro-Oriente|50058|
+--------------+-----+


üìä Top 5 entidades:
+-------------------------------------------------+-----+
|entidad                                          |count|
+-------------------------------------------------+-----+
|MUNICIPIO DE SOACHA.                             |3184 |
|ALCALD√çA MUNICIPAL COTA                          |1995 |
|ESE MUNICIPAL DE SOACHA JULIO CESAR PE√ëALOZA*    |1919 |
|CUNDINAMARCA-ALCALDIA MUNICIPIO MOSQUERA         |1879 |
|empresa social del estado regi√≥n de salud soacha.|1579 |
+-------------------------------------------------+-----+
only showing top 5 rows



In [6]:
# ============================================================================
# 5. GUARDAR EN DELTA LAKE
# ============================================================================

print("="*80)
print("PASO 5: GUARDAR EN DELTA LAKE")
print("="*80 + "\n")

DELTA_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"üíæ Guardando en: {DELTA_PATH}")

df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(DELTA_PATH)

print("‚úÖ Guardado exitosamente\n")

# ‚ö†Ô∏è LIBERAR todo
df_silver.unpersist()
spark.catalog.clearCache()


PASO 5: GUARDAR EN DELTA LAKE

üíæ Guardando en: /app/notebooks/delta_lake/silver_contracts




‚úÖ Guardado exitosamente



                                                                                

In [10]:
print(f"Spark version: {spark.version}")

Spark version: 3.5.1


In [7]:
# ============================================================================
# 6. VERIFICACI√ìN
# ============================================================================

print("="*80)
print("VERIFICACI√ìN FINAL")
print("="*80 + "\n")

df_verify = spark.read.format("delta").load(DELTA_PATH)
print(f"‚úÖ Registros verificados: {df_verify.count():,}")

df_verify.select("id_contrato", "entidad", "valor_contrato", "fecha_firma") \
    .show(5, truncate=False)

print("\nüéØ Fase 2 completada. Siguiente: Fase 3 - Embeddings\n")

VERIFICACI√ìN FINAL

‚úÖ Registros verificados: 50,058
+------------+----------------------------------------------------+--------------+-----------+
|id_contrato |entidad                                             |valor_contrato|fecha_firma|
+------------+----------------------------------------------------+--------------+-----------+
|011-2024    |ESE HOSPITAL SALAZAR DE VILLETA                     |2.9991E7      |2024-01-01 |
|CPS 012-2024|E.S.E HOSPITAL NUESTRA SE√ëORA DEL CARMEN DEL COLEGIO|1.07844E7     |2024-01-01 |
|CPS 017-2024|E.S.E HOSPITAL NUESTRA SE√ëORA DEL CARMEN DEL COLEGIO|1.07844E7     |2024-01-01 |
|004-2024    |ESE HOSPITAL SALAZAR DE VILLETA                     |2.016E7       |2024-01-01 |
|CPS-042-2024|empresa social del estado regi√≥n de salud soacha.   |7.5323616E7   |2024-01-01 |
+------------+----------------------------------------------------+--------------+-----------+
only showing top 5 rows


üéØ Fase 2 completada. Siguiente: Fase 3 - Embeddings



## Fase 3

In [4]:
# ============================================================================
# 1. CARGAR DATOS DESDE SILVER
# ============================================================================

print("\n" + "="*80)
print("PASO 1: CARGAR DATOS DESDE SILVER")
print("="*80 + "\n")

SILVER_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"üìñ Cargando datos desde: {SILVER_PATH}")

df_silver = spark.read.format("delta").load(SILVER_PATH)
df_silver = df_silver.cache()

total_records = df_silver.count()
print(f"‚úÖ Registros cargados: {total_records:,}\n")

print("üìã Esquema de datos:")
df_silver.printSchema()

print("\nüìä Muestra de datos:")
df_silver.select(
    "id_contrato", "objeto_contrato", "entidad", 
    "codigo_unspsc", "valor_contrato", "duracion_dias"
).show(3, truncate=False)


PASO 1: CARGAR DATOS DESDE SILVER

üìñ Cargando datos desde: /app/notebooks/delta_lake/silver_contracts




‚úÖ Registros cargados: 50,058

üìã Esquema de datos:
root
 |-- id_contrato: string (nullable = true)
 |-- objeto_contrato: string (nullable = true)
 |-- entidad: string (nullable = true)
 |-- departamento: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- region: string (nullable = true)
 |-- codigo_unspsc: string (nullable = true)
 |-- descripcion_categoria: string (nullable = true)
 |-- valor_contrato: double (nullable = true)
 |-- duracion_dias: integer (nullable = true)
 |-- fecha_firma: date (nullable = true)
 |-- tipo_contrato: string (nullable = true)
 |-- estado_contrato: string (nullable = true)
 |-- modalidad: string (nullable = true)
 |-- anno: integer (nullable = true)


üìä Muestra de datos:
+------------+---------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------+-------------+--------------+-------------+
|id_contrato |o

                                                                                

In [5]:
print("\n" + "="*80)
print("PASO 2: LIMPIEZA Y PREPARACI√ìN DE TEXTO (NORMALIZADO)")
print("="*80 + "\n")

print("üßπ Limpiando columna 'objeto_contrato' y eliminando tildes...")

# Definimos los caracteres a reemplazar
# Nota: Inclu√≠ la '√±' -> 'n' y la '√º' -> 'u'. 
# Si quieres CONSERVAR la √±, qu√≠tala de estas cadenas.
src_chars = "√°√©√≠√≥√∫√º√±"
dst_chars = "aeiouun"

df_prepared = df_silver.withColumn(
    "objeto_limpio",
    trim(                                           # 4. Quitar espacios al inicio/final
        regexp_replace(                             # 3. Colapsar espacios m√∫ltiples
            regexp_replace(                         # 2. Eliminar caracteres especiales
                translate(                          # 1. Reemplazar tildes
                    lower(col("objeto_contrato")),  # 0. Convertir a min√∫sculas
                    src_chars, 
                    dst_chars
                ),
                "[^a-z0-9\\s]", " " # Solo deja letras a-z (sin tildes), n√∫meros y espacios
            ),
            "\\s+", " "
        )
    )
)

# Filtrar textos muy cortos
df_prepared = df_prepared.filter(length(col("objeto_limpio")) >= 10)

print(f" Registros despu√©s de limpieza: {df_prepared.count():,}")

print("\n Ejemplo de texto limpio (Sin tildes):")
df_prepared.select("objeto_contrato", "objeto_limpio").show(5, truncate=50)


PASO 2: LIMPIEZA Y PREPARACI√ìN DE TEXTO (NORMALIZADO)

üßπ Limpiando columna 'objeto_contrato' y eliminando tildes...




 Registros despu√©s de limpieza: 50,058

 Ejemplo de texto limpio (Sin tildes):
+--------------------------------------------------+--------------------------------------------------+
|                                   objeto_contrato|                                     objeto_limpio|
+--------------------------------------------------+--------------------------------------------------+
|PRESTACION DE SERVICIOS ASISTENCIALES COMO MEDI...|prestacion de servicios asistenciales como medi...|
|PRESTAR APOYO AL PROCESO ASISTENCIAL EN EL √ÅREA...|prestar apoyo al proceso asistencial en el area...|
|PRESTAR APOYO AL PROCESO ASISTENCIAL EN EL √ÅREA...|prestar apoyo al proceso asistencial en el area...|
|APOYO PROFESIONAL A LA GESTI√ìN ASISTENCIAL COMO...|apoyo profesional a la gestion asistencial como...|
|PRESTACI√ìN DE SERVICIOS COMO PROFESIONAL EN MED...|prestacion de servicios como profesional en med...|
+--------------------------------------------------+--------------------------------

                                                                                

In [6]:
# ============================================================================
# PASO 3.1: DEFINIR STOPWORDS Y TOKENIZAR
# ============================================================================

print("üî§ PASO 3.1: Tokenizaci√≥n y Stopwords")
print("-" * 80 + "\n")

# Stopwords especializados para contratos
stopwords_es = [
    # B√°sicos espa√±ol
    "el", "la", "de", "que", "y", "a", "en", "un", "ser", "se", "no",
    "por", "con", "su", "para", "como", "estar", "tener", "le", "lo",
    "pero", "hacer", "o", "este", "otro", "ese", "si", "ya", "ver",
    "dar", "muy", "sin", "sobre", "tambi√©n", "hasta", "a√±o", "entre",
    "del", "al", "los", "las", "uno", "una", "unos", "unas",
    # Espec√≠ficos de contratos p√∫blicos
    "contrato", "contratos", "objeto", "prestacion", "prestaci√≥n",
    "servicio", "servicios", "suministro", "ejecucion", "ejecuci√≥n",
    "acuerdo", "establecido", "pliego", "condiciones", "especificaciones",
    "entidad", "contratante", "contratista", "plazo", "termino"
]

print(f"   ‚Ä¢ Stopwords definidos: {len(stopwords_es)} (b√°sicos + contratos)")

# Tokenizaci√≥n
tokenizer = Tokenizer(inputCol="objeto_limpio", outputCol="palabras")
df_tokenized = tokenizer.transform(df_prepared)

# Remover stopwords
remover = StopWordsRemover(
    inputCol="palabras", 
    outputCol="palabras_sin_stopwords",
    stopWords=stopwords_es
)
df_filtered = remover.transform(df_tokenized)

print("   ‚úÖ Tokenizaci√≥n y stopwords completados\n")

print("üìã Ejemplo:")
df_filtered.select("objeto_limpio", "palabras_sin_stopwords").show(2, truncate=70)


üî§ PASO 3.1: Tokenizaci√≥n y Stopwords
--------------------------------------------------------------------------------

   ‚Ä¢ Stopwords definidos: 65 (b√°sicos + contratos)
   ‚úÖ Tokenizaci√≥n y stopwords completados

üìã Ejemplo:
+----------------------------------------------------------------------+----------------------------------------------------------------------+
|                                                         objeto_limpio|                                                palabras_sin_stopwords|
+----------------------------------------------------------------------+----------------------------------------------------------------------+
|prestacion de servicios asistenciales como medico general en los pr...|[asistenciales, medico, general, procesos, subprocesos, hospital, s...|
|prestar apoyo al proceso asistencial en el area de auxiliar de enfe...|[prestar, apoyo, proceso, asistencial, area, auxiliar, enfermeria, ...|
+------------------------------------------

In [7]:
# ============================================================================
# PASO 3.2: LIMPIEZA Y FILTRADO
# ============================================================================

print("\nüîç PASO 3.2: Limpieza y Filtrado")
print("-" * 80 + "\n")

# Filtrar palabras muy cortas (ruido)
def clean_words(words):
    """Mantener solo palabras >= 3 caracteres"""
    if not words:
        return []
    return [w for w in words if len(w) >= 3]

clean_udf = udf(clean_words, ArrayType(StringType()))

df_filtered = df_filtered.withColumn(
    "palabras_filtradas",
    clean_udf(col("palabras_sin_stopwords"))
)

# Eliminar documentos vac√≠os
df_filtered = df_filtered.filter(size(col("palabras_filtradas")) > 0)

print("   ‚Ä¢ Filtro aplicado: palabras >= 3 caracteres")
print("   ‚Ä¢ Documentos vac√≠os eliminados")
print("   ‚úÖ Limpieza completada\n")


üîç PASO 3.2: Limpieza y Filtrado
--------------------------------------------------------------------------------

   ‚Ä¢ Filtro aplicado: palabras >= 3 caracteres
   ‚Ä¢ Documentos vac√≠os eliminados
   ‚úÖ Limpieza completada



In [8]:
# ============================================================================
# 4. GENERACI√ìN DE EMBEDDINGS CON WORD2VEC
# ============================================================================

print("\n" + "="*80)
print("PASO 4: GENERACI√ìN DE EMBEDDINGS CON WORD2VEC")
print("="*80 + "\n")

print("üî¢ Entrenando modelo Word2Vec...")
print("   - Vector size: 100 dimensiones")
print("   - Min word count: 2 (palabras que aparecen al menos 2 veces)")
print("   - Iterations: 10\n")

# Configurar Word2Vec
word2vec = Word2Vec(
    vectorSize=100,
    minCount=2,
    maxIter=10,
    seed=42,
    inputCol="palabras_filtradas",
    outputCol="embedding_raw"
)

# Entrenar Word2Vec
print("‚è≥ Entrenando (esto puede tardar 1-2 minutos)...")
word2vec_model = word2vec.fit(df_filtered)

# Aplicar el modelo
df_embeddings = word2vec_model.transform(df_filtered)

print("‚úÖ Word2Vec entrenado y aplicado")

# Estad√≠sticas del vocabulario
vocab_size = len(word2vec_model.getVectors().collect())
print(f"\nüìä Tama√±o del vocabulario: {vocab_size:,} palabras √∫nicas")

print("\nüìã Ejemplo de embeddings (primeros 10 valores):")

# SOLUCI√ìN: Crear UDF para convertir Vector a Array
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.linalg import VectorUDT, Vectors

# UDF para convertir Vector de Spark ML a Array
def vector_to_array(vector):
    return vector.toArray().tolist() if vector is not None else None

vector_to_array_udf = udf(vector_to_array, ArrayType(DoubleType()))

# Convertir el embedding a array y mostrar primeros 10 valores
df_embeddings.select(
    "id_contrato",
    slice(vector_to_array_udf(col("embedding_raw")), 1, 10).alias("embedding_muestra")
).show(2, truncate=False)

# Opcional: Ver el tama√±o completo del embedding
print("\nüìè Verificando dimensi√≥n del embedding:")
df_embeddings.select(
    "id_contrato",
    size(vector_to_array_udf(col("embedding_raw"))).alias("dimension_embedding")
).show(5)


PASO 4: GENERACI√ìN DE EMBEDDINGS CON WORD2VEC

üî¢ Entrenando modelo Word2Vec...
   - Vector size: 100 dimensiones
   - Min word count: 2 (palabras que aparecen al menos 2 veces)
   - Iterations: 10

‚è≥ Entrenando (esto puede tardar 1-2 minutos)...


                                                                                

‚úÖ Word2Vec entrenado y aplicado

üìä Tama√±o del vocabulario: 9,249 palabras √∫nicas

üìã Ejemplo de embeddings (primeros 10 valores):


                                                                                

+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id_contrato |embedding_muestra                                                                                                                                                                                                       |
+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|011-2024    |[0.199337643571198, -0.11567123653367162, -0.06517249078024179, -0.3478297144174576, 0.1609452865086496, 0.04875763412564993, -0.3183848101180047, -0.17065105272922665, 0.07957734051160514, -0.03430204698815942]     |
|CPS 012-2024|[-0.08867999973396459, -0.09253484794559577, -0.1981626600

[Stage 52:>                                                         (0 + 1) / 1]

+------------+-------------------+
| id_contrato|dimension_embedding|
+------------+-------------------+
|    011-2024|                100|
|CPS 012-2024|                100|
|CPS 017-2024|                100|
|    004-2024|                100|
|CPS-042-2024|                100|
+------------+-------------------+
only showing top 5 rows



                                                                                

In [13]:
from pyspark.sql import functions as F

def target_encode_smooth(df, cat_col, target_col, m=50):
    """
    James‚ÄìStein Smoothed Target Encoding para PySpark.
    df: DataFrame
    cat_col: columna categ√≥rica (string)
    target_col: variable objetivo (num√©rica continua)
    m: par√°metro de suavizado (entre 10 y 200)
    """

    # 1. Promedio global del target
    global_mean = df.agg(F.mean(target_col)).first()[0]

    # 2. Estad√≠sticas por categor√≠a
    stats = (
        df.groupBy(cat_col)
          .agg(
              F.mean(target_col).alias("cat_mean"),
              F.count(target_col).alias("cat_count")
          )
          .withColumn(
              f"{cat_col}_te",
              (F.col("cat_count") * F.col("cat_mean") + m * F.lit(global_mean)) 
              / (F.col("cat_count") + m)
          )
          .select(cat_col, f"{cat_col}_te")
    )
    # 3. Join de vuelta al DataFrame
    df_encoded = df.join(stats, on=cat_col, how="left")

    return df_encoded

In [10]:
print("\n" + "="*80)
print("PASO 5: TARGET ENCODING (Suavizado James‚ÄìStein)")
print("="*80 + "\n")

df_te = df_clean  # viene del m√≥dulo anterior

categorical_cols = ["entidad", "descripcion_categoria", "municipio", "modalidad", "fecha_firma","estado_contrato"]

for col_name in categorical_cols:
    print(f"‚è≥ Codificando {col_name}...")
    df_te = target_encode_smooth(df_te, col_name, target_col="valor_contrato", m=50)
    print(f"‚úÖ {col_name} codificado -> {col_name}_te")

print("\nüéâ Codificaci√≥n¬†completada!")


PASO 5: TARGET ENCODING (Suavizado James‚ÄìStein)



NameError: name 'df_clean' is not defined

In [62]:
# ============================================================================
# VERIFICAR QUE TENEMOS LAS COLUMNAS DE TARGET ENCODING
# ============================================================================

print("üìä Verificando columnas de Target Encoding...")

# Columnas que deber√≠an existir despu√©s del Target Encoding
te_columns = ["entidad_te", "codigo_unspsc_te", "departamento_te", "modalidad_te"]

missing_cols = [col for col in te_columns if col not in df_te.columns]

if missing_cols:
    print(f"‚ö†  ERROR: Faltan columnas: {missing_cols}")
    print("   Ejecuta primero el PASO 5 del c√≥digo original (Target Encoding)\n")
else:
    print(f"‚úì Todas las columnas TE disponibles: {te_columns}\n")

# ============================================================================
# MOSTRAR ESTAD√çSTICAS DE TARGET ENCODING
# ============================================================================

print("üìä Estad√≠sticas de Target Encoding:\n")

for col in te_columns:
    stats = df_te.select(
        F.min(col).alias("min"),
        F.max(col).alias("max"),
        F.mean(col).alias("mean"),
        F.stddev(col).alias("std")
    ).collect()[0]

    print(f"  {col}:")
    print(f"    Min : {stats['min']:.4f}")
    print(f"    Max : {stats['max']:.4f}")
    print(f"    Mean: {stats['mean']:.4f}")
    print(f"    Std : {stats['std']:.4f}")
    print()


üìä Verificando columnas de Target Encoding...
‚úì Todas las columnas TE disponibles: ['entidad_te', 'codigo_unspsc_te', 'departamento_te', 'modalidad_te']

üìä Estad√≠sticas de Target Encoding:



                                                                                

  entidad_te:
    Min : 15491513.5024
    Max : 655285394.1297
    Mean: 102528222.0446
    Std : 97687942.7203



                                                                                

  codigo_unspsc_te:
    Min : 100309896.7452
    Max : 100309896.7452
    Mean: 100309896.7451
    Std : 0.0000



                                                                                

  departamento_te:
    Min : 100309896.7452
    Max : 100309896.7452
    Mean: 100309896.7451
    Std : 0.0000





  modalidad_te:
    Min : 32414588.3887
    Max : 3635469506.6244
    Mean: 92542889.5217
    Std : 322995855.6501



                                                                                

In [66]:
# ============================================================================
# PASO 6 MODIFICADO: ENSAMBLAR FEATURES USANDO TARGET ENCODING
# ============================================================================

print("="*80)
print("PASO 6 MODIFICADO: ENSAMBLAR FEATURES CON TARGET ENCODING")
print("="*80 + "\n")

# Renombrar df_te a df_encoded para mantener compatibilidad
df_encoded = df_te

# Manejar duracion_dias nulos (imputar con 0)
df_encoded = df_encoded.fillna({"duracion_dias": 0})

# ============================================================================
# CONSTRUIR LISTA DE COLUMNAS - USANDO TARGET ENCODING
# ============================================================================

input_cols = ["embedding_raw"]  # Siempre incluimos el embedding

print("üìä Features a ensamblar:")
print(f"  ‚úì embedding_raw (100 dims)")

# 1. AGREGAR TARGET ENCODING
target_encoding_cols = ["entidad_te", "codigo_unspsc_te", "departamento_te", "modalidad_te"]

for col in target_encoding_cols:
    input_cols.append(col)
    print(f"  ‚úì {col} (Target Encoding)")

# 2. AGREGAR VARIABLE NUM√âRICA
input_cols.append("duracion_dias")
print(f"  ‚úì duracion_dias (num√©rica)")

print(f"\nüìä Total columnas a ensamblar: {len(input_cols)}")
print(f"   Input cols: {input_cols}\n")

# ============================================================================
# ENSAMBLAR FEATURES
# ============================================================================

assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features_raw",
    handleInvalid="skip"
)

print("üìä Ensamblando features...")
df_assembled = assembler.transform(df_encoded)
print("‚úì Features ensambladas en un solo vector\n")

# ============================================================================
# VERIFICAR DIMENSI√ìN DEL VECTOR
# ============================================================================

sample_features = df_assembled.select("features_raw").first()[0]
feature_dim = len(sample_features)

print("üìä RESUMEN DE DIMENSIONALIDAD:")
print("="*60)
print(f"  - Embedding Word2Vec:        100 dimensiones")
print(f"  - Target Encoding (4 vars):    4 dimensiones")
print(f"  - duracion_dias:               1 dimensi√≥n")
print("="*60)
print(f"  TOTAL:                       {feature_dim} dimensiones")
print(f"  (vs 183 dimensiones anteriores con One-Hot)")
print(f"  Reducci√≥n: {(1 - feature_dim/183)*100:.1f}%")
print("="*60)

# ============================================================================
# VERIFICACI√ìN DE CALIDAD
# ============================================================================

print("\nüìä Verificaci√≥n de calidad:")

# 1. Verificar que no hay valores nulos en features
null_features = df_assembled.filter(F.col("features_raw").isNull()).count()
print(f"  ‚úì Features nulos: {null_features} (debe ser 0)")

# 2. Mostrar muestra
print("\nüìä Muestra de datos ensamblados:")
df_assembled.select(
    "id_contrato",
    "entidad_te",
    "codigo_unspsc_te",
    "departamento_te", 
    "modalidad_te",
    "duracion_dias"
).show(5, truncate=False)


PASO 6 MODIFICADO: ENSAMBLAR FEATURES CON TARGET ENCODING

üìä Features a ensamblar:
  ‚úì embedding_raw (100 dims)
  ‚úì entidad_te (Target Encoding)
  ‚úì codigo_unspsc_te (Target Encoding)
  ‚úì departamento_te (Target Encoding)
  ‚úì modalidad_te (Target Encoding)
  ‚úì duracion_dias (num√©rica)

üìä Total columnas a ensamblar: 6
   Input cols: ['embedding_raw', 'entidad_te', 'codigo_unspsc_te', 'departamento_te', 'modalidad_te', 'duracion_dias']

üìä Ensamblando features...
‚úì Features ensambladas en un solo vector



                                                                                

üìä RESUMEN DE DIMENSIONALIDAD:
  - Embedding Word2Vec:        100 dimensiones
  - Target Encoding (4 vars):    4 dimensiones
  - duracion_dias:               1 dimensi√≥n
  TOTAL:                       105 dimensiones
  (vs 183 dimensiones anteriores con One-Hot)
  Reducci√≥n: 42.6%

üìä Verificaci√≥n de calidad:


                                                                                

  ‚úì Features nulos: 0 (debe ser 0)

üìä Muestra de datos ensamblados:


[Stage 859:>                                                        (0 + 1) / 1]

+------------+--------------------+--------------------+--------------------+--------------------+-------------+
|id_contrato |entidad_te          |codigo_unspsc_te    |departamento_te     |modalidad_te        |duracion_dias|
+------------+--------------------+--------------------+--------------------+--------------------+-------------+
|011-2024    |2.7179094349785384E7|1.0030989674523553E8|1.0030989674523553E8|4.9149497466489606E7|0            |
|CPS 012-2024|2.605177241000303E7 |1.0030989674523553E8|1.0030989674523553E8|4.9149497466489606E7|0            |
|CPS 017-2024|2.605177241000303E7 |1.0030989674523553E8|1.0030989674523553E8|4.9149497466489606E7|0            |
|004-2024    |2.7179094349785384E7|1.0030989674523553E8|1.0030989674523553E8|4.9149497466489606E7|0            |
|CPS-042-2024|1.5150127860666776E8|1.0030989674523553E8|1.0030989674523553E8|4.9149497466489606E7|0            |
+------------+--------------------+--------------------+--------------------+-------------------

                                                                                

In [83]:
# ============================================================================
# 7. NORMALIZACI√ìN DE FEATURES
# ============================================================================
print("\n" + "="*80)
print("PASO 7: NORMALIZACI√ìN (StandardScaler)")
print("="*80 + "\n")

print("üìè Aplicando StandardScaler para normalizar features...")

from pyspark.ml.feature import StandardScaler

# Normalizar todas las features num√©ricas incluidas las embeddings
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features_scaled",
    withStd=True,
    withMean=True  # Centrar los datos para PCA
)

scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

print("‚úÖ Features normalizadas")


PASO 7: NORMALIZACI√ìN (StandardScaler)

üìè Aplicando StandardScaler para normalizar features...




‚úÖ Features normalizadas


                                                                                

In [84]:
# ============================================================================
# PASO 4: CORRELACI√ìN EFICIENTE (SIN EXPANDIR COLUMNAS)
# ============================================================================

print("="*80)
print("PASO 4: AN√ÅLISIS DE CORRELACI√ìN EFICIENTE")
print("="*80 + "\n")

print("üìä M√©todo: Calcular correlaci√≥n usando vectores completos")
print("   (Evita expandir 100 columnas ‚Üí mucho m√°s r√°pido)\n")

# Crear dataset con features + target
df_with_target = df_scaled.withColumn(
    "target_vector",
    F.array(F.col("valor_contrato"))
)

# Calcular correlaci√≥n solo de variables categ√≥ricas (r√°pido)
print("‚è≥ Correlaci√≥n de variables categ√≥ricas...")

cat_correlations = {}

for var in te_columns + ["duracion_dias"]:
    assembler_pair = VectorAssembler(
        inputCols=[var, "valor_contrato"],
        outputCol="features_pair",
        handleInvalid="skip"
    )
    
    df_pair = assembler_pair.transform(df_scaled)
    corr_matrix = Correlation.corr(df_pair, "features_pair", "pearson").collect()[0][0]
    corr_value = corr_matrix.toArray()[0, 1]
    
    cat_correlations[var] = corr_value
    print(f"  {var:<25} ‚Üí {corr_value:>7.4f}")

print("\n‚úì Correlaciones categ√≥ricas¬†calculadas\n")


PASO 4: AN√ÅLISIS DE CORRELACI√ìN EFICIENTE

üìä M√©todo: Calcular correlaci√≥n usando vectores completos
   (Evita expandir 100 columnas ‚Üí mucho m√°s r√°pido)

‚è≥ Correlaci√≥n de variables categ√≥ricas...


                                                                                

  entidad_te                ‚Üí  0.0977


                                                                                

  codigo_unspsc_te          ‚Üí     nan


                                                                                

  departamento_te           ‚Üí     nan


                                                                                

  modalidad_te              ‚Üí  0.3198




  duracion_dias             ‚Üí     nan

‚úì Correlaciones categ√≥ricas¬†calculadas



                                                                                

In [85]:
# ============================================================================
# PASO 5: CORRELACI√ìN DEL EMBEDDING (M√âTODO EFICIENTE)
# ============================================================================

print("="*80)
print("PASO 5: CORRELACI√ìN DEL EMBEDDING (M√©todo Muestral)")
print("="*80 + "\n")

print("üìä Estrategia: Calcular correlaci√≥n en una MUESTRA peque√±a")
print("   (Mucho m√°s r√°pido, resultados representativos)\n")

# Tomar muestra del 10% (suficiente para correlaciones)
SAMPLE_FRACTION = 0.1
df_sample = df_scaled.sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=42)

sample_size = df_sample.count()
print(f"‚è≥ Muestra: {sample_size:,} registros ({SAMPLE_FRACTION*100:.0f}%)")

# Convertir a Pandas para c√°lculo eficiente
print("‚è≥ Calculando correlaciones de embedding...\n")

# Extraer embeddings y target a arrays numpy
data_sample = df_sample.select("embedding_raw", "valor_contrato").collect()

embeddings_array = np.array([row["embedding_raw"].toArray() for row in data_sample])
target_array = np.array([row["valor_contrato"] for row in data_sample])

# Calcular correlaci√≥n de cada dimensi√≥n con el target
embedding_correlations = {}

for i in range(100):
    correlation = np.corrcoef(embeddings_array[:, i], target_array)[0, 1]
    embedding_correlations[f"emb_{i}"] = correlation
    
    if (i + 1) % 20 == 0:
        print(f"  Procesadas {i + 1}/100 dimensiones...")

print("\n‚úì Correlaciones de embedding calculadas\n")

# Consolidar todas las correlaciones
all_correlations = {**cat_correlations, **embedding_correlations}


PASO 5: CORRELACI√ìN DEL EMBEDDING (M√©todo Muestral)

üìä Estrategia: Calcular correlaci√≥n en una MUESTRA peque√±a
   (Mucho m√°s r√°pido, resultados representativos)



                                                                                

‚è≥ Muestra: 4,964 registros (10%)
‚è≥ Calculando correlaciones de embedding...



                                                                                

  Procesadas 20/100 dimensiones...
  Procesadas 40/100 dimensiones...
  Procesadas 60/100 dimensiones...
  Procesadas 80/100 dimensiones...
  Procesadas 100/100 dimensiones...

‚úì Correlaciones de embedding calculadas



In [87]:
# ============================================================================
# PASO 6: AN√ÅLISIS Y SELECCI√ìN
# ============================================================================

import builtins  # para asegurar abs() nativo

print("="*80)
print("PASO 6: SELECCI√ìN DE VARIABLES")
print("="*80 + "\n")

# Estad√≠sticas
corr_values = list(all_correlations.values())
print("üìä Estad√≠sticas de correlaciones:")
print(f"  - Min:    {np.nanmin(corr_values):.4f}")
print(f"  - Max:    {np.nanmax(corr_values):.4f}")
print(f"  - Mean:   {np.nanmean(corr_values):.4f}")
print(f"  - Median: {np.nanmedian(corr_values):.4f}")

# Top 15
print("\nüìä Top 15 variables con mayor correlaci√≥n absoluta:")
sorted_all = sorted(
    all_correlations.items(),
    key=lambda x: builtins.abs(x[1]),
    reverse=True
)

for i, (var, corr) in enumerate(sorted_all[:15], 1):
    print(f"  {i:2d}. {var:<25} ‚Üí {corr:>7.4f}")

# Umbral
THRESHOLD = 0.05
print(f"\nüìä Umbral de selecci√≥n: |r| >= {THRESHOLD}\n")

selected_vars = {var: corr for var, corr in all_correlations.items() 
                 if builtins.abs(corr) >= THRESHOLD and not np.isnan(corr)}
rejected_vars = {var: corr for var, corr in all_correlations.items() 
                 if builtins.abs(corr) < THRESHOLD or np.isnan(corr)}

print(f"‚úÖ Variables SELECCIONADAS: {len(selected_vars)}")
print(f"‚ùå Variables RECHAZADAS: {len(rejected_vars)}")

# Desglosar
selected_cat = [v for v in selected_vars.keys() if not v.startswith("emb_")]
selected_emb = [v for v in selected_vars.keys() if v.startswith("emb_")]

print(f"\n  Categ√≥ricas/Num√©ricas: {len(selected_cat)}")
print(f"  Embeddings: {len(selected_emb)}")

print(f"\nüìä Variables categ√≥ricas/num√©ricas:")
for var in selected_cat:
    print(f"  {'‚úì' if var in selected_vars else '‚úó'} {var:<25} ‚Üí {all_correlations[var]:>7.4f}")


PASO 6: SELECCI√ìN DE VARIABLES

üìä Estad√≠sticas de correlaciones:
  - Min:    -0.1043
  - Max:    0.3198
  - Mean:   -0.0034
  - Median: -0.0068

üìä Top 15 variables con mayor correlaci√≥n absoluta:
   1. codigo_unspsc_te          ‚Üí     nan
   2. departamento_te           ‚Üí     nan
   3. modalidad_te              ‚Üí  0.3198
   4. duracion_dias             ‚Üí     nan
   5. emb_1                     ‚Üí  0.1071
   6. emb_36                    ‚Üí -0.1043
   7. emb_43                    ‚Üí -0.1027
   8. entidad_te                ‚Üí  0.0977
   9. emb_90                    ‚Üí -0.0923
  10. emb_95                    ‚Üí -0.0912
  11. emb_59                    ‚Üí  0.0840
  12. emb_83                    ‚Üí  0.0792
  13. emb_81                    ‚Üí  0.0787
  14. emb_11                    ‚Üí  0.0749
  15. emb_15                    ‚Üí -0.0728

üìä Umbral de selecci√≥n: |r| >= 0.05

‚úÖ Variables SELECCIONADAS: 26
‚ùå Variables RECHAZADAS: 79

  Categ√≥ricas/Num√©ricas: 2
  E

In [88]:
# ============================================================================
# PASO 7: FILTRAR EMBEDDING
# ============================================================================

print("\n" + "="*80)
print("PASO 7: FILTRAR FEATURES")
print("="*80 + "\n")

if len(selected_emb) > 0:
    selected_emb_indices = sorted([int(var.split("_")[1]) for var in selected_emb])
    
    print(f"üìä Dimensiones de embedding seleccionadas: {len(selected_emb_indices)}/100")
    
    # UDF para filtrar embedding (eficiente)
    from pyspark.ml.linalg import VectorUDT
    
    def filter_embedding_udf(indices):
        def filter_func(vector):
            if vector is None:
                return Vectors.dense([0.0] * len(indices))
            return Vectors.dense([float(vector[i]) for i in indices])
        return F.udf(filter_func, VectorUDT())
    
    df_filtered = df_scaled.withColumn(
        "embedding_filtered",
        filter_embedding_udf(selected_emb_indices)(F.col("embedding_raw"))
    )
    
    embedding_dim = len(selected_emb_indices)
    input_cols_filtered = ["embedding_filtered"] + selected_cat
else:
    print("‚ö†  Ninguna dimensi√≥n de embedding supera el umbral")
    df_filtered = df_scaled
    embedding_dim = 0
    input_cols_filtered = selected_cat

print(f"\nüìä Features finales:")
print(f"  - Embedding: {embedding_dim} dims")
print(f"  - Otras: {len(selected_cat)} dims")
print(f"  - TOTAL: {embedding_dim + len(selected_cat)} dims")
print(f"  - Reducci√≥n: {(1 - (embedding_dim + len(selected_cat))/feature_dim)*100:.1f}%\n")

# Ensamblar features filtradas
assembler_filtered = VectorAssembler(
    inputCols=input_cols_filtered,
    outputCol="features_selected",
    handleInvalid="skip"
)

df_assembled_filtered = assembler_filtered.transform(df_filtered)
selected_dim = len(df_assembled_filtered.select("features_selected").first()[0])

print("‚úì Features filtradas¬†ensambladas\n")



PASO 7: FILTRAR FEATURES

üìä Dimensiones de embedding seleccionadas: 24/100

üìä Features finales:
  - Embedding: 24 dims
  - Otras: 2 dims
  - TOTAL: 26 dims
  - Reducci√≥n: 75.2%



[Stage 2303:>                                                       (0 + 1) / 1]

‚úì Features filtradas¬†ensambladas



                                                                                

In [89]:
# ============================================================================
# PASO 8: NORMALIZAR FEATURES FILTRADAS
# ============================================================================

print("="*80)
print("PASO 8: NORMALIZAR FEATURES FILTRADAS")
print("="*80 + "\n")

scaler_filtered = StandardScaler(
    inputCol="features_selected",
    outputCol="features_scaled_filtered",
    withStd=True,
    withMean=True
)

scaler_model_filtered = scaler_filtered.fit(df_assembled_filtered)
df_scaled_filtered = scaler_model_filtered.transform(df_assembled_filtered)

print("‚úì Features filtradas normalizadas\n")


PASO 8: NORMALIZAR FEATURES FILTRADAS





‚úì Features filtradas normalizadas



                                                                                

In [90]:
# ============================================================================
# PASO 9: PCA
# ============================================================================

print("="*80)
print("PASO 9: PCA SOBRE FEATURES FILTRADAS")
print("="*80 + "\n")

pca = PCA(
    k=selected_dim,
    inputCol="features_scaled_filtered",
    outputCol="features_pca"
)

print("‚è≥ Entrenando PCA...")
pca_model = pca.fit(df_scaled_filtered)
df_pca = pca_model.transform(df_scaled_filtered)
print("‚úì PCA aplicado\n")

# Analizar varianza
explained_variance = pca_model.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)

n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

print(f"üìä Componentes para 95% varianza: {n_components_95}")

print("\nüìä Varianza explicada:")
for i in range(builtins.min(10, selected_dim)):
    print(f"  - PC{i+1}: {explained_variance[i]:.2%}")

print("\nüìä Varianza acumulada:")
thresholds = [5, 10, 20, 30, 50, selected_dim]
for i in thresholds:
    if i <= len(cumulative_variance):
        print(f"  - {i:3d} componentes: {cumulative_variance[i-1]:.2%}")


PASO 9: PCA SOBRE FEATURES FILTRADAS

‚è≥ Entrenando PCA...




‚úì PCA aplicado

üìä Componentes para 95% varianza: 23

üìä Varianza explicada:
  - PC1: 13.02%
  - PC2: 9.70%
  - PC3: 6.45%
  - PC4: 5.84%
  - PC5: 5.48%
  - PC6: 5.11%
  - PC7: 4.53%
  - PC8: 4.37%
  - PC9: 4.10%
  - PC10: 3.83%

üìä Varianza acumulada:
  -   5 componentes: 40.49%
  -  10 componentes: 62.44%
  -  20 componentes: 90.21%
  -  26 componentes: 100.00%


                                                                                

In [78]:
# ============================================================================
# PASO 8: PCA
# ============================================================================

print("="*80)
print("PASO 8: REDUCCI√ìN DE DIMENSIONALIDAD CON PCA")
print("="*80 + "\n")

# PCA con todos los componentes para an√°lisis
pca = PCA(
    k=feature_dim,
    inputCol="features_scaled",
    outputCol="features_pca"
)

print("üìä Calculando componentes principales...")
pca_model = pca.fit(df_scaled)
df_pca = pca_model.transform(df_scaled)
print("‚úì PCA aplicado\n")

# ============================================================================
# ANALIZAR VARIANZA EXPLICADA
# ============================================================================

explained_variance = pca_model.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)

# Encontrar componentes para 95% varianza
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

print(f"üìä Componentes necesarios para 95% varianza: {n_components_95}")

print("\nüìä Varianza explicada por componentes:")
import builtins  # Usar min de Python, no de PySpark
for i in range(builtins.min(10, feature_dim)):
    print(f"  - PC{i+1}: {explained_variance[i]:.2%}")

print("\nüìä Varianza acumulada:")
thresholds = [5, 10, 20, 30, 50,60,77, feature_dim]
for i in thresholds:
    if i <= len(cumulative_variance):
        print(f"  - {i:3d} componentes: {cumulative_variance[i-1]:.2%}")


PASO 8: REDUCCI√ìN DE DIMENSIONALIDAD CON PCA

üìä Calculando componentes principales...




‚úì PCA aplicado

üìä Componentes necesarios para 95% varianza: 77

üìä Varianza explicada por componentes:
  - PC1: 8.62%
  - PC2: 6.30%
  - PC3: 5.18%
  - PC4: 4.61%
  - PC5: 3.80%
  - PC6: 3.33%
  - PC7: 2.76%
  - PC8: 2.58%
  - PC9: 2.42%
  - PC10: 2.39%

üìä Varianza acumulada:
  -   5 componentes: 28.50%
  -  10 componentes: 41.98%
  -  20 componentes: 58.67%
  -  30 componentes: 69.83%
  -  50 componentes: 84.26%
  -  60 componentes: 89.17%
  -  77 componentes: 95.21%
  - 105 componentes: 100.00%


                                                                                

In [79]:
# ============================================================================
# 9. PREPARAR DATASET FINAL PARA MODELADO
# ============================================================================

print("\n" + "="*80)
print("PASO 9: PREPARAR DATASET FINAL")
print("="*80 + "\n")

# Seleccionar columnas necesarias para el modelo
df_final = df_pca.select(
    "id_contrato",
    "objeto_contrato",
    "entidad",
    "departamento",
    "region",
    "codigo_unspsc",
    "valor_contrato",        # TARGET
    "duracion_dias",
    "fecha_firma",
    "features_pca",          # FEATURES para el modelo
    "features_scaled"        # Backup sin PCA
)

# Cachear para operaciones posteriores
df_final = df_final.cache()
total_final = df_final.count()

print(f"‚úÖ Dataset final preparado: {total_final:,} registros")

print("\nüìã Esquema del dataset final:")
df_final.printSchema()

print("\nüìä Estad√≠sticas del target (valor_contrato):")
df_final.select("valor_contrato").describe().show()


PASO 9: PREPARAR DATASET FINAL





‚úÖ Dataset final preparado: 50,058 registros

üìã Esquema del dataset final:
root
 |-- id_contrato: string (nullable = true)
 |-- objeto_contrato: string (nullable = true)
 |-- entidad: string (nullable = false)
 |-- departamento: string (nullable = false)
 |-- region: string (nullable = true)
 |-- codigo_unspsc: string (nullable = false)
 |-- valor_contrato: double (nullable = true)
 |-- duracion_dias: integer (nullable = false)
 |-- fecha_firma: date (nullable = true)
 |-- features_pca: vector (nullable = true)
 |-- features_scaled: vector (nullable = true)


üìä Estad√≠sticas del target (valor_contrato):
+-------+--------------------+
|summary|      valor_contrato|
+-------+--------------------+
|  count|               50058|
|   mean|1.0030989674523553E8|
| stddev|1.1680313663790998E9|
|    min|                 1.0|
|    max|    1.50838540149E11|
+-------+--------------------+



                                                                                

In [82]:
# ============================================================================
# 10. AN√ÅLISIS DE CORRELACIONES
# ============================================================================

print("\n" + "="*80)
print("PASO 10: AN√ÅLISIS DE CORRELACIONES")
print("="*80 + "\n")

print("üìä Calculando correlaciones de features PCA con valor_contrato...")

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf as spark_udf, col

# Funci√≥n para agregar target al vector PCA
def add_target_to_vector(features, target):
    return Vectors.dense(list(features.toArray()) + [float(target)])

add_target_udf = spark_udf(add_target_to_vector, VectorUDT())

df_corr = df_final.withColumn(
    "features_with_target",
    add_target_udf(col("features_pca"), col("valor_contrato"))
)

# Calcular matriz de correlaci√≥n
print("‚è≥ Calculando matriz de correlaci√≥n...")
correlation_matrix = Correlation.corr(df_corr, "features_with_target", "pearson")

# Extraer matriz como array numpy
corr_array = correlation_matrix.collect()[0][0].toArray()

# Correlaciones del target (√∫ltima fila, excepto √∫ltimo elemento)
target_correlations = corr_array[-1, :-1]

print("\nüìä Correlaciones de PCA con valor_contrato:")
max_idx = np.argmax(np.abs(target_correlations))

print(f"   - Componente m√°s correlacionado: PC{max_idx+1} ({target_correlations[max_idx]:.3f})")
print(f"   - Top 5 componentes:")

top_5_indices = np.argsort(np.abs(target_correlations))[-5:][::-1]
for idx in top_5_indices:
    print(f"     PC{idx+1}: {target_correlations[idx]:.3f}")



PASO 10: AN√ÅLISIS DE CORRELACIONES

üìä Calculando correlaciones de features PCA con valor_contrato...
‚è≥ Calculando matriz de correlaci√≥n...





üìä Correlaciones de PCA con valor_contrato:
   - Componente m√°s correlacionado: PC103 (nan)
   - Top 5 componentes:
     PC105: nan
     PC103: nan
     PC104: nan
     PC24: -0.149
     PC29: -0.120


                                                                                

In [None]:
# ============================================================================
# 11. GUARDAR DATOS PROCESADOS (GOLD LAYER)
# ============================================================================

print("\n" + "="*80)
print("PASO 11: GUARDAR EN DELTA LAKE (GOLD LAYER)")
print("="*80 + "\n")

GOLD_PATH = "/app/notebooks/delta_lake/gold_features"

print(f"üíæ Guardando en: {GOLD_PATH}")

df_final.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(GOLD_PATH)

print("‚úÖ Datos guardados exitosamente")

# Liberar cache
df_silver.unpersist()
df_final.unpersist()

# ============================================================================
# 12. GUARDAR MODELOS DE TRANSFORMACI√ìN
# ============================================================================

print("\n" + "="*80)
print("PASO 12: GUARDAR MODELOS DE TRANSFORMACI√ìN")
print("="*80 + "\n")

MODELS_PATH = "/app/notebooks/models"

print(f"üíæ Guardando modelos en: {MODELS_PATH}")

# Guardar Word2Vec model
word2vec_model.save(f"{MODELS_PATH}/word2vec_model")
print("   ‚úÖ Word2Vec model guardado")

# Guardar PCA model
pca_model.save(f"{MODELS_PATH}/pca_model")
print("   ‚úÖ PCA model guardado")

# Guardar Scaler model
scaler_model.save(f"{MODELS_PATH}/scaler_model")
print("   ‚úÖ StandardScaler model guardado")

# ============================================================================
# 13. REPORTE FINAL
# ============================================================================

print("\n" + "="*80)
print("‚úÖ FASE 3 COMPLETADA - REPORTE FINAL")
print("="*80 + "\n")

print("üìä RESUMEN DEL PROCESAMIENTO:\n")
print(f"   ‚úÖ Registros procesados: {total_final:,}")
print(f"   ‚úÖ Vocabulario Word2Vec: {vocab_size:,} palabras")
print(f"   ‚úÖ Dimensi√≥n embeddings: 100")
print(f"   ‚úÖ Dimensi√≥n features total: {feature_dim}")
print(f"   ‚úÖ Dimensi√≥n despu√©s de PCA: {k_components}")
print(f"   ‚úÖ Varianza explicada: {cumulative_variance[-1]:.2%}")

print("\nüìÅ ARCHIVOS GENERADOS:")
print(f"   - Dataset: {GOLD_PATH}")
print(f"   - Modelos: {MODELS_PATH}/")

print("\nüéØ SIGUIENTE PASO:")
print("   Fase 4: Entrenar modelo de regresi√≥n para predecir valor_contrato")

print("\n" +¬†"="*80¬†+¬†"\n")