In [1]:
# ============================================================================
# FASE 3: FEATURE ENGINEERING Y EMBEDDINGS
# ============================================================================
# Objetivo: Transformar texto y variables categÃ³ricas en features numÃ©ricas
# para el modelo de ML usando Word2Vec, StringIndexer, OneHotEncoder y PCA

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, Word2Vec, 
    StringIndexer, OneHotEncoder, VectorAssembler,
    StandardScaler, PCA
)
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
import numpy as np

In [4]:
# ============================================================================
# 0. CONFIGURACIÃ“N INICIAL
# ============================================================================

print("="*80)
print("FASE 3: FEATURE ENGINEERING Y EMBEDDINGS")
print("="*80 + "\n")

# Verificar que Spark estÃ© activo o crearlo
try:
    spark.version
    print(f"âœ… Usando SparkSession existente (Spark {spark.version})")
except:
    spark = (
    SparkSession.builder
    .appName("Feature_Engineering")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
            "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

print(f"âœ… SparkSession creada con Delta Lake (Spark {spark.version})")
spark.sparkContext.setLogLevel("ERROR")



FASE 3: FEATURE ENGINEERING Y EMBEDDINGS

âœ… Usando SparkSession existente (Spark 3.5.1)
âœ… SparkSession creada con Delta Lake (Spark 3.5.1)


In [5]:
# ============================================================================
# 1. CARGAR DATOS DESDE SILVER
# ============================================================================

print("\n" + "="*80)
print("PASO 1: CARGAR DATOS DESDE SILVER")
print("="*80 + "\n")

SILVER_PATH = "/app/notebooks/delta_lake/silver_contracts"

print(f"ðŸ“– Cargando datos desde: {SILVER_PATH}")

df_silver = spark.read.format("delta").load(SILVER_PATH)
df_silver = df_silver.cache()

total_records = df_silver.count()
print(f"âœ… Registros cargados: {total_records:,}\n")

print("ðŸ“‹ Esquema de datos:")
df_silver.printSchema()

print("\nðŸ“Š Muestra de datos:")
df_silver.select(
    "id_contrato", "objeto_contrato", "entidad", 
    "codigo_unspsc", "valor_contrato", "duracion_dias"
).show(3, truncate=False)





PASO 1: CARGAR DATOS DESDE SILVER

ðŸ“– Cargando datos desde: /app/notebooks/delta_lake/silver_contracts


Py4JJavaError: An error occurred while calling o34.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:724)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:593)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:526)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 15 more


In [None]:
# ============================================================================
# 2. LIMPIEZA Y PREPARACIÃ“N DE TEXTO
# ============================================================================

print("\n" + "="*80)
print("PASO 2: LIMPIEZA Y PREPARACIÃ“N DE TEXTO")
print("="*80 + "\n")

print("ðŸ§¹ Limpiando columna 'objeto_contrato'...")

# Limpiar y normalizar texto
df_prepared = df_silver.withColumn(
    "objeto_limpio",
    lower(
        regexp_replace(
            regexp_replace(col("objeto_contrato"), "[^a-zA-ZÃ¡Ã©Ã­Ã³ÃºÃ±Ã‘0-9\\s]", " "),
            "\\s+", " "
        )
    )
)

# Filtrar textos muy cortos (menos de 10 caracteres)
df_prepared = df_prepared.filter(length(col("objeto_limpio")) >= 10)

print(f"âœ… Registros despuÃ©s de limpieza: {df_prepared.count():,}")

print("\nðŸ“‹ Ejemplo de texto limpio:")
df_prepared.select("objeto_contrato", "objeto_limpio").show(2, truncate=80)
