In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, broadcast, current_timestamp, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, TimestampType
import sparknlp

# Inicializaci√≥n de SPARK

In [2]:
# --- IMPORTS Y CONFIGURACI√ìN ---
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import sparknlp 
# Importa el resto de las funciones necesarias para tu pipeline de NLP

# --- CONFIGURACI√ìN DE PAQUETES ---
SPARK_MASTER = "spark://0.0.0.0:7077" # Direcci√≥n de tu Spark Master dentro de Docker
SPARK_NLP_VERSION = "5.3.3" 

# Lista de paquetes para Spark (Kafka, Delta, y Spark NLP)
SPARK_PACKAGES = (
    "io.delta:delta-spark_2.12:3.1.0,"
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
    f"com.johnsnowlabs.nlp:spark-nlp_2.12:{SPARK_NLP_VERSION}" 
)

# --- INICIALIZACI√ìN DE SPARK CON PAQUETES ---


# --- CREACI√ìN DE LA SPARK SESSION ---
spark = (SparkSession.builder \
    .appName("GoldLayer_BERT_Embeddings") \
    .master(SPARK_MASTER) \
    .config("spark.jars.packages", SPARK_PACKAGES) \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    

    .config("spark.driver.memory", "8g") \
    
    .config("spark.executor.memory", "4g") \
    
    .getOrCreate())

print(f"‚úÖ Sesi√≥n de Spark iniciada y conectada al Master {SPARK_MASTER} con Spark NLP v{SPARK_NLP_VERSION}.")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e1ed9363-d586-4731-984d-0f9245c5e615;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in ce

‚úÖ Sesi√≥n de Spark iniciada y conectada al Master spark://0.0.0.0:7077 con Spark NLP v5.3.3.


# FASE 2

# Ingesta: Leer stream de Kafka

In [3]:
# Esquema de los datos reales del productor (8 campos clave + 6 auxiliares)
CONTRACT_SCHEMA = StructType([
    # Campos clave para ML
    StructField("id_contrato", StringType(), True),
    StructField("objeto_contrato", StringType(), True),
    StructField("entidad", StringType(), True),
    StructField("codigo_unspsc", StringType(), True), 
    StructField("duracion_dias", LongType(), True),
    StructField("valor_contrato", DoubleType(), True),
    StructField("fecha_firma", StringType(), True),
    StructField("departamento", StringType(), True),
    
    # Columnas ruidosas / auxiliares (Ser√°n eliminadas)
    StructField("nit_entidad", StringType(), True), 
    StructField("localizacion", StringType(), True),
    StructField("sector", StringType(), True), 
    StructField("es_pyme", StringType(), True),
    StructField("valor_facturado", StringType(), True), # Aunque es num√©rico, puede venir sucio
    StructField("urlproceso", StringType(), True),
])

# 1. Tarea: Leer el stream de Kafka
kafka_stream = (spark.readStream
    .format("kafka") 
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "contratos-publicos")
    .option("startingOffsets", "earliest") # Para procesar datos desde el inicio (si el productor ya corri√≥)
    .load()
)

print("Stream de Kafka configurado.")

Stream de Kafka configurado.


# Preparaci√≥n de los datos

In [4]:
# Definimos los departamentos que forman parte del Eje Cafetero para el filtro
regiones_eje_cafetero = [
    ("Antioquia", "Eje Cafetero"), ("Caldas", "Eje Cafetero"), 
    ("Quindio", "Eje Cafetero"), ("Risaralda", "Eje Cafetero"), 
    ("Tolima", "Eje Cafetero"), ("Valle del Cauca", "Eje Cafetero")
]
df_regiones = spark.createDataFrame(regiones_eje_cafetero).toDF("departamento_join", "macrorregion_turistica")

# Aplicamos Broadcasting para optimizar el JOIN (Broadcasting Join)
df_regiones_broadcast = broadcast(df_regiones)
print("DataFrame de Regiones (Broadcast) listo para el Join.")

DataFrame de Regiones (Broadcast) listo para el Join.


# Persistencia en Delta-Lake

In [5]:
df_silver = (kafka_stream \
    # 2. Explosi√≥n de Metadatos
    .withColumn("value_content", from_json(col("value").cast("string"), CONTRACT_SCHEMA)) \
    .select(
        col("value_content.*"),
        col("timestamp").alias("kafka_ingestion_time"), # Metadato de Kafka
        col("offset").alias("kafka_offset")             # Metadato de Kafka
    ) \
    
    # 3. Tarea: Limpieza (Eliminaci√≥n de Redundantes)
    .drop("nit_entidad", "localizacion", "sector", "es_pyme", "valor_facturado", "urlproceso") \
    
    # 3. Tarea: Cruce con Regiones (Broadcasting Join)
    .join(
        df_regiones_broadcast,
        on=df_regiones_broadcast.departamento_join == col("departamento"),
        how="inner" # INNER JOIN garantiza que solo pasen los del Eje Cafetero
    ) \
    .drop("departamento_join") \
    
    # Limpieza final de valores (solo si es necesario para asegurar tipos, aunque ya se hizo en el productor)
    .withColumn("departamento", col("departamento").cast(StringType())) \
    .withColumn("processing_time", current_timestamp())
)

In [6]:
# Rutas para el almacenamiento Delta Lake
DELTA_LAKE_PATH = "/opt/spark/data/delta/silver_contracts"
CHECKPOINT_PATH = "/opt/spark/data/checkpoints/silver_contracts"

# Persistencia Delta-Lake

In [7]:
# Rutas para el almacenamiento Delta Lake

# 4. Tarea: Guardar los datos limpios en formato Delta Lake
query = (df_silver.writeStream \
    .format("delta") \
    .outputMode("append") # A√±adir nuevos registros
    .option("checkpointLocation", CHECKPOINT_PATH) # Obligatorio para Spark Streaming
    .option("path", DELTA_LAKE_PATH) 
    .trigger(processingTime='10 seconds') # Procesa nuevos datos cada 10 segundos
    .start()
)

print(f"Escritura del Stream a Delta Lake iniciada. Estado del Query ID: {query.id}")
print("El Job est√° corriendo. Presiona el bot√≥n de 'Stop' o interrupci√≥n del kernel en Jupyter para detenerlo.")

                                                                                

Escritura del Stream a Delta Lake iniciada. Estado del Query ID: b89dcbd6-c9ec-4512-9d9f-a40dcb5eef3c
El Job est√° corriendo. Presiona el bot√≥n de 'Stop' o interrupci√≥n del kernel en Jupyter para detenerlo.


25/12/11 01:33:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


# FASE 3

## Embeddings BERT

In [8]:
# EJECUTA ESTO PRIMERO para limpiar la tabla anterior
import shutil
import os

GOLD_DELTA_PATH = "/opt/spark/data/delta/gold_contracts"
GOLD_CHECKPOINT_PATH = "/opt/spark/data/checkpoints/gold_contracts"

# Detener el query anterior si est√° corriendo
try:
    spark.streams.get("df023a71-e21f-4108-ad66-dada639a0ce4").stop()
except:
    pass

# Eliminar los datos y checkpoints anteriores
if os.path.exists(GOLD_DELTA_PATH):
    shutil.rmtree(GOLD_DELTA_PATH)
    print(f"‚úÖ Eliminado {GOLD_DELTA_PATH}")

if os.path.exists(GOLD_CHECKPOINT_PATH):
    shutil.rmtree(GOLD_CHECKPOINT_PATH)
    print(f"‚úÖ Eliminado {GOLD_CHECKPOINT_PATH}")

print("üîÑ Listo para reiniciar el proceso GOLD con el schema correcto.")

‚úÖ Eliminado /opt/spark/data/delta/gold_contracts
‚úÖ Eliminado /opt/spark/data/checkpoints/gold_contracts
üîÑ Listo para reiniciar el proceso GOLD con el schema correcto.


In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType
import sparknlp 
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import BertSentenceEmbeddings
from pyspark.ml import Pipeline

# --- PIPELINE SIMPLIFICADO (SIN FINISHER) ---

document_assembler = DocumentAssembler() \
    .setInputCol("objeto_contrato") \
    .setOutputCol("document")

bert_embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("bert_vector") \
    .setMaxSentenceLength(128)

nlp_pipeline = Pipeline(stages=[
    document_assembler,
    bert_embeddings
])

print("‚úÖ Pipeline definido.")

# --- PRE-ENTRENAMIENTO ---
print("‚è≥ Cargando modelo BERT...")
SILVER_DELTA_PATH = "/opt/spark/data/delta/silver_contracts"
df_sample = spark.read.format("delta").load(SILVER_DELTA_PATH).limit(1)
pipeline_model = nlp_pipeline.fit(df_sample)
print("‚úÖ Modelo BERT cargado.")

# ==============================================================================
# UDF para extraer embeddings
# ==============================================================================

def extract_embeddings(bert_vector_column):
    if bert_vector_column and len(bert_vector_column) > 0:
        first_annotation = bert_vector_column[0]
        if first_annotation and hasattr(first_annotation, 'embeddings'):
            return first_annotation.embeddings
    return None

extract_embeddings_udf = udf(extract_embeddings, ArrayType(FloatType()))

# ==============================================================================
# PROCESAMIENTO STREAMING
# ==============================================================================

GOLD_DELTA_PATH = "/opt/spark/data/delta/gold_contracts"
GOLD_CHECKPOINT_PATH = "/opt/spark/data/checkpoints/gold_contracts"

df_silver_stream = spark.readStream \
    .format("delta") \
    .load(SILVER_DELTA_PATH)

df_gold = pipeline_model.transform(df_silver_stream)

df_gold_final = df_gold.select(
    col("id_contrato"),
    col("entidad"),
    col("departamento"),
    col("valor_contrato"),
    col("duracion_dias"),
    col("codigo_unspsc"),
    extract_embeddings_udf(col("bert_vector")).alias("objeto_embedding"),
    col("kafka_ingestion_time"),
    col("processing_time")
)

query_gold = (df_gold_final.writeStream \
    .format("delta") \
    .outputMode("append") 
    .option("checkpointLocation", GOLD_CHECKPOINT_PATH)
    .option("path", GOLD_DELTA_PATH) 
    .trigger(processingTime='30 seconds')
    .start())

print(f"‚úÖ Proceso GOLD iniciado. Escribiendo en {GOLD_DELTA_PATH}")
print(f"Query ID: {query_gold.id}")

sent_small_bert_L2_128 download started this may take some time.


25/12/11 01:33:58 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/12/11 01:33:59 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
25/12/11 01:34:00 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 3:>                                                        (0 + 12) / 50]

Approximate size to download 16.1 MB
[ | ]



sent_small_bert_L2_128 download started this may take some time.
Approximate size to download 16.1 MB


                                                                                

Download done! Loading the resource.


[Stage 5:>                                                        (0 + 12) / 12]

[ / ]

                                                                                

[ ‚Äî ]

2025-12-11 01:34:13.380357: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
                                                                                

[OK!]
‚úÖ Pipeline definido.
‚è≥ Cargando modelo BERT...


25/12/11 01:34:17 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 19133 milliseconds


‚úÖ Modelo BERT cargado.
‚úÖ Proceso GOLD iniciado. Escribiendo en /opt/spark/data/delta/gold_contracts
Query ID: aeb15bba-89ce-4ab7-bfa6-7cb2c90be046


25/12/11 01:34:17 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


### Verificaci√≥n GOLD

In [12]:
from pyspark.sql.functions import slice, size

# Cargar GOLD
df_gold_check = spark.read.format("delta").load(GOLD_DELTA_PATH)

print("=== SCHEMA GOLD ===")
df_gold_check.printSchema()

print("\n=== SAMPLE ROWS (10) ===")
df_gold_check.show(10, truncate=False)

print("\n=== VECTOR SIZE (primeras 10 filas) ")
df_gold_check.select(
    "id_contrato",
    size("objeto_embedding").alias("vector_size")
).show(10, truncate=False)

print("\n=== PRIMEROS VALORES DEL VECTOR")
df_gold_check.select(
    "id_contrato",
    slice("objeto_embedding", 1, 10).alias("primeros_valores")
).show(10, truncate=False)


AnalysisException: [DELTA_TABLE_NOT_FOUND] Delta table `/opt/spark/data/delta/gold_contracts` doesn't exist.

                                                                                

## Variables Categ√≥ricas

In [13]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.linalg import DenseVector, VectorUDT

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# 1. Leer GOLD
GOLD_DELTA_PATH = "/opt/spark/data/delta/gold_contracts"
df_gold_base = spark.read.format("delta").load(GOLD_DELTA_PATH)

# =====================================================
# A. Convertir array<float> ‚Üí DenseVector (obligatorio)
# =====================================================

def to_dense_vector(arr):
    if arr is None:
        return None
    return DenseVector(arr)

to_dense_udf = udf(to_dense_vector, VectorUDT())

df_gold_vec = df_gold_base.withColumn(
    "objeto_embedding_vec",
    to_dense_udf(col("objeto_embedding"))
)

# =====================================================
# B. StringIndexer
# =====================================================

indexer_entidad = StringIndexer(
    inputCol="entidad",
    outputCol="entidad_indexed",
    handleInvalid="keep"
)

indexer_unspsc = StringIndexer(
    inputCol="codigo_unspsc",
    outputCol="unspsc_indexed",
    handleInvalid="keep"
)

# =====================================================
# C. OneHotEncoder
# =====================================================

encoder_entidad = OneHotEncoder(
    inputCol="entidad_indexed",
    outputCol="entidad_encoded"
)

encoder_unspsc = OneHotEncoder(
    inputCol="unspsc_indexed",
    outputCol="unspsc_encoded"
)

# =====================================================
# D. VectorAssembler (incluye embeddings + num√©ricas)
# =====================================================

feature_cols = [
    "objeto_embedding_vec",   # ‚Üê EL BUENO
    "entidad_encoded",
    "unspsc_encoded",
    "valor_contrato",
    "duracion_dias"
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_final"
)

# =====================================================
# E. Pipeline final
# =====================================================

ml_pipeline = Pipeline(stages=[
    indexer_entidad,
    encoder_entidad,
    indexer_unspsc,
    encoder_unspsc,
    assembler
])

print("‚úÖ Pipeline ML completo definido correctamente.")


‚úÖ Pipeline ML completo definido correctamente.


In [14]:
ml_pipeline_model = ml_pipeline.fit(df_gold_vec)
df_ml_ready = ml_pipeline_model.transform(df_gold_vec)


# ----------------------------------------------------
# VERIFICACI√ìN DE RESULTADOS
# ----------------------------------------------------
print("\n--- ESQUEMA FINAL CON LAS NUEVAS FEATURES ---")
# Busca las columnas 'entidad_indexed', 'entidad_encoded' y 'features_final'
df_ml_ready.printSchema()

df_ml_ready.select(
    "id_contrato",
    "features_final"
).show(5, truncate=False)


                                                                                


--- ESQUEMA FINAL CON LAS NUEVAS FEATURES ---
root
 |-- id_contrato: string (nullable = true)
 |-- entidad: string (nullable = true)
 |-- departamento: string (nullable = true)
 |-- valor_contrato: double (nullable = true)
 |-- duracion_dias: long (nullable = true)
 |-- codigo_unspsc: string (nullable = true)
 |-- objeto_embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- kafka_ingestion_time: timestamp (nullable = true)
 |-- processing_time: timestamp (nullable = true)
 |-- objeto_embedding_vec: vector (nullable = true)
 |-- entidad_indexed: double (nullable = false)
 |-- entidad_encoded: vector (nullable = true)
 |-- unspsc_indexed: double (nullable = false)
 |-- unspsc_encoded: vector (nullable = true)
 |-- features_final: vector (nullable = true)



[Stage 338:>                                                        (0 + 1) / 1]

+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [18]:
from pyspark.sql.functions import col, when, lit
from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as F

# ================================
# 1. REPARAR NULOS EN NUM√âRICAS
# ================================
# calcular medianas
median_vals = {}
for c in ["valor_contrato", "duracion_dias"]:
    median_vals[c] = df_ml_ready.approxQuantile(c, [0.5], 0.001)[0]

df_clean = df_ml_ready.fillna(median_vals)

# ================================
# 2. REPARAR NULOS EN EMBEDDINGS
# ================================
# convertir embedding a array para revisar
df_clean = df_clean.withColumn("embed_arr_tmp", vector_to_array(col("objeto_embedding_vec")))

# detectar tama√±o del embedding
first = df_clean.select("embed_arr_tmp").filter(col("embed_arr_tmp").isNotNull()).limit(1).collect()
if len(first) == 0:
    raise ValueError("No hay embeddings v√°lidos para detectar longitud")

embed_dim = len(first[0]["embed_arr_tmp"])

# vector de ceros
zeros_array = [0.0] * embed_dim

df_clean = df_clean.withColumn(
    "embed_arr",
    when(col("embed_arr_tmp").isNull(), F.array([lit(0.0) for _ in range(embed_dim)]))
    .otherwise(col("embed_arr_tmp"))
)

# convertir a vector denso otra vez
from pyspark.ml.linalg import DenseVector, VectorUDT
from pyspark.sql.functions import udf

def arr_to_vec(arr):
    return DenseVector(arr)

arr_to_vec_udf = udf(arr_to_vec, VectorUDT())

df_clean = df_clean.withColumn("objeto_embedding_vec_fixed", arr_to_vec_udf(col("embed_arr")))

# eliminamos columnas temporales
df_clean = df_clean.drop("embed_arr_tmp", "objeto_embedding_vec")

df_clean = df_clean.withColumnRenamed("objeto_embedding_vec_fixed", "objeto_embedding_vec")

print("‚úî Embeddings nulos reemplazados por vector de ceros")
print("‚úî Columnas num√©ricas rellenadas con mediana")


                                                                                

‚úî Embeddings nulos reemplazados por vector de ceros
‚úî Columnas num√©ricas rellenadas con mediana


In [21]:
from pyspark.sql.functions import corr

numeric_vars = [
    "duracion_dias",
    "entidad_indexed",
    "unspsc_indexed"
]

print("=== CORRELACI√ìN PEARSON CONTRA valor_contrato ===")

for colname in numeric_vars:
    pear = df_clean.stat.corr(colname, "valor_contrato")   # ‚Üê sin method
    print(f"\nVariable: {colname}")
    print(f"  Pearson : {pear}")


=== CORRELACI√ìN PEARSON CONTRA valor_contrato ===

Variable: duracion_dias
  Pearson : nan

Variable: entidad_indexed
  Pearson : 0.043608346445600195

Variable: unspsc_indexed
  Pearson : -0.003293496670342968


In [22]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def spearman_corr(df, col1, col2):
    w = Window.orderBy(col1)
    df_ranked = df.withColumn("r1", F.rank().over(w))
    
    w2 = Window.orderBy(col2)
    df_ranked = df_ranked.withColumn("r2", F.rank().over(w2))
    
    # correlaci√≥n pearson de los rangos
    return df_ranked.stat.corr("r1", "r2")

print("\n=== SPEARMAN REAL ===")
for colname in numeric_vars:
    spear = spearman_corr(df_clean.select(colname, "valor_contrato"), colname, "valor_contrato")
    print(f"{colname} ‚Üí Spearman = {spear}")



=== SPEARMAN REAL ===


25/12/11 01:46:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 0

duracion_dias ‚Üí Spearman = nan


25/12/11 01:46:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:46:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 0

entidad_indexed ‚Üí Spearman = -0.05191239039789368
unspsc_indexed ‚Üí Spearman = 0.1772676558099022


25/12/11 01:47:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:47:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:47:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:47:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:47:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 01:47:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/11 0

In [23]:
# ======================================================
# 3. VECTOR ASSEMBLER PARA PCA
# ======================================================

assembler_pca = VectorAssembler(
    inputCols=[
        "valor_contrato",
        "duracion_dias",
        "entidad_encoded",
        "unspsc_encoded",
        "objeto_embedding_vec"
    ],
    outputCol="pca_input"
)

df_pca_ready = assembler_pca.transform(df_clean)
print("‚úî VectorAssembler para PCA generado")


# ======================================================
# 4. PCA
# ======================================================

pca = PCA(
    k=50,   # n√∫mero de componentes
    inputCol="pca_input",
    outputCol="pca_features"
)

model_pca = pca.fit(df_pca_ready)
df_pca = model_pca.transform(df_pca_ready)

print("‚úî PCA aplicado correctamente")


# ======================================================
# 5. VARIANZA EXPLICADA
# ======================================================

explained = model_pca.explainedVariance.toArray()

print("\n===== VARIANZA EXPLICADA POR CADA COMPONENTE =====")
for i, v in enumerate(explained):
    print(f"PC{i+1}: {v:.4f}")

print("\nVarianza total explicada:", sum(explained))


# ======================================================
# 6. DATASET FINAL LISTO PARA MODELAR
# ======================================================

df_final = df_pca.select(
    "id_contrato",
    "valor_contrato",
    "pca_features"
)

print("\n===== PREVIEW FINAL =====")
df_final.show(5)


                                                                                

‚úî VectorAssembler para PCA generado


                                                                                

‚úî PCA aplicado correctamente

===== VARIANZA EXPLICADA POR CADA COMPONENTE =====
PC1: 1.0000
PC2: 0.0000
PC3: 0.0000
PC4: 0.0000
PC5: 0.0000
PC6: 0.0000
PC7: 0.0000
PC8: 0.0000
PC9: 0.0000
PC10: 0.0000
PC11: 0.0000
PC12: 0.0000
PC13: 0.0000
PC14: 0.0000
PC15: 0.0000
PC16: 0.0000
PC17: 0.0000
PC18: 0.0000
PC19: 0.0000
PC20: 0.0000
PC21: 0.0000
PC22: 0.0000
PC23: 0.0000
PC24: 0.0000
PC25: 0.0000
PC26: 0.0000
PC27: 0.0000
PC28: 0.0000
PC29: 0.0000
PC30: 0.0000
PC31: 0.0000
PC32: 0.0000
PC33: 0.0000
PC34: 0.0000
PC35: 0.0000
PC36: 0.0000
PC37: 0.0000
PC38: 0.0000
PC39: 0.0000
PC40: 0.0000
PC41: 0.0000
PC42: 0.0000
PC43: 0.0000
PC44: 0.0000
PC45: 0.0000
PC46: 0.0000
PC47: 0.0000
PC48: 0.0000
PC49: 0.0000
PC50: 0.0000

Varianza total explicada: 0.9999999999997213

===== PREVIEW FINAL =====
+------------------+--------------+--------------------+
|       id_contrato|valor_contrato|        pca_features|
+------------------+--------------+--------------------+
|CO1.PCCNTR.4107445|     9772584

In [24]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# ============================================================
# 1. SPLIT Train/Test
# ============================================================

train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)

print("‚úî Split realizado:")
print(f"Train: {train_df.count()} filas")
print(f"Test:  {test_df.count()} filas")


# ============================================================
# 2. MODELO ‚Äî GBTRegressor
# ============================================================

gbt = GBTRegressor(
    featuresCol="pca_features",
    labelCol="valor_contrato",
    maxDepth=8,
    maxIter=100,
    stepSize=0.1
)

pipeline = Pipeline(stages=[gbt])

# Entrenar
model = pipeline.fit(train_df)

print("‚úî Modelo GBT entrenado correctamente")


# ============================================================
# 3. EVALUACI√ìN
# ============================================================

predictions = model.transform(test_df)

# Evaluadores
evaluator_rmse = RegressionEvaluator(
    labelCol="valor_contrato",
    predictionCol="prediction",
    metricName="rmse"
)

evaluator_r2 = RegressionEvaluator(
    labelCol="valor_contrato",
    predictionCol="prediction",
    metricName="r2"
)

rmse = evaluator_rmse.evaluate(predictions)
r2   = evaluator_r2.evaluate(predictions)

print("\n===== RESULTADOS =====")
print(f"RMSE: {rmse}")
print(f"R2  : {r2}")


# ============================================================
# 4. GUARDAR MODELO (persistencia)
# ============================================================

MODEL_PATH = "/opt/spark/models/contratos_gbt_pca"

model.write().overwrite().save(MODEL_PATH)

print(f"‚úî Modelo guardado en: {MODEL_PATH}")


‚úî Split realizado:


                                                                                

Train: 4221 filas


                                                                                

Test:  1014 filas


25/12/11 03:10:36 WARN DAGScheduler: Broadcasting large task binary with size 1003.4 KiB
25/12/11 03:10:38 WARN DAGScheduler: Broadcasting large task binary with size 1004.2 KiB
25/12/11 03:10:39 WARN DAGScheduler: Broadcasting large task binary with size 1004.9 KiB
25/12/11 03:10:39 WARN DAGScheduler: Broadcasting large task binary with size 1005.6 KiB
25/12/11 03:10:39 WARN DAGScheduler: Broadcasting large task binary with size 1007.3 KiB
25/12/11 03:10:39 WARN DAGScheduler: Broadcasting large task binary with size 1010.1 KiB
25/12/11 03:10:39 WARN DAGScheduler: Broadcasting large task binary with size 1014.6 KiB
25/12/11 03:10:39 WARN DAGScheduler: Broadcasting large task binary with size 1022.9 KiB
25/12/11 03:10:40 WARN DAGScheduler: Broadcasting large task binary with size 1028.7 KiB
25/12/11 03:10:40 WARN DAGScheduler: Broadcasting large task binary with size 1029.1 KiB
25/12/11 03:10:40 WARN DAGScheduler: Broadcasting large task binary with size 1029.9 KiB
25/12/11 03:10:40 WAR

‚úî Modelo GBT entrenado correctamente


                                                                                


===== RESULTADOS =====
RMSE: 37538846280.08359
R2  : 0.9808515738734394


[Stage 2058:>                                                       (0 + 1) / 1]

‚úî Modelo guardado en: /opt/spark/models/contratos_gbt_pca


                                                                                

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, abs

# =====================================================
# Evaluaci√≥n
# =====================================================
preds = model.transform(test_df)

# RMSE
rmse = RegressionEvaluator(
    labelCol="valor_contrato",
    predictionCol="prediction",
    metricName="rmse"
).evaluate(preds)

# MAE
mae = RegressionEvaluator(
    labelCol="valor_contrato",
    predictionCol="prediction",
    metricName="mae"
).evaluate(preds)

# R2
r2 = RegressionEvaluator(
    labelCol="valor_contrato",
    predictionCol="prediction",
    metricName="r2"
).evaluate(preds)

# MAPE (no existe en Spark ‚Üí lo calculamos manual)
preds_mape = preds.withColumn(
    "ape",
    abs((col("valor_contrato") - col("prediction")) / col("valor_contrato"))
)

mape = preds_mape.selectExpr("avg(ape)").first()[0]

print("\n===== M√âTRICAS DE REGRESI√ìN =====")
print(f"RMSE : {rmse}")
print(f"MAE  : {mae}")
print(f"R2   : {r2}")
print(f"MAPE : {mape}")





===== M√âTRICAS DE REGRESI√ìN =====
RMSE : 37538846280.08359
MAE  : 1823362335.9936445
R2   : 0.9808515738734394
MAPE : 2.470132240666463


                                                                                