In [1]:
# %%
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, PCA, VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import col

# %%
# Create (or reuse) the Spark session for this notebook, pointing at the
# standalone cluster master and requesting 2g of memory per executor.
spark = (
    SparkSession.builder
    .appName("SECOP_Transformaciones")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

print(f"Spark conectado a: {spark.sparkContext.master}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/08 20:26:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark conectado a: spark://spark-master:7077


In [2]:
# Load the transformed data produced by the previous notebook.
df = spark.read.parquet("/opt/spark-data/raw/secop_features.parquet")

# count() triggers a Spark job; the column count is read from the schema.
row_count = df.count()
column_count = len(df.columns)
print(f"Registros: {row_count:,}")
print(f"Columnas: {column_count}")

[Stage 1:>                                                          (0 + 2) / 2]

Registros: 100,000
Columnas: 17


                                                                                

### **Analizar por qué normalizar (examinar escalas)**

 - Tomar una muestra de los vectores
 - Inspeccionar los primeros valores del vector

In [3]:
# Bring five raw feature vectors to the driver to inspect their value scales.
sample = df.select("features_raw").take(5)
for position, record in enumerate(sample, 1):
    # Densify the vector so the leading entries can be sliced and printed.
    dense_values = record["features_raw"].toArray()
    print(f"\nRegistro {position} - Primeros 10 valores del vector:")
    print(dense_values[:10])
    


Registro 1 - Primeros 10 valores del vector:
[4.5588e+06 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00]

Registro 2 - Primeros 10 valores del vector:
[4.08e+06 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00]

Registro 3 - Primeros 10 valores del vector:
[4.59e+06 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00]

Registro 4 - Primeros 10 valores del vector:
[9.792e+06 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00]

Registro 5 - Primeros 10 valores del vector:
[4.08e+06 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00]


### **Comparar antes y después de StandardScaler**

In [4]:
# Configure the scaler: divide each feature by its standard deviation
# (withStd=True) without subtracting the mean (withMean=False).
scaler = StandardScaler(
    withStd=True,
    withMean=False,
    inputCol="features_raw",
    outputCol="features_scaled",
)

In [5]:
print("Aplicando StandardScaler")

# Fit the scaler on the full dataset, then append the "features_scaled" column.
scaler_model = scaler.fit(df)
df_scaled = scaler_model.transform(df)

print(
    "✓ StandardScaler aplicado a toda la base",
    "Columnas disponibles:",
    df_scaled.columns,
    sep="\n",
)

# Preview the resulting frame with show()'s default settings.
df_scaled.show()

Aplicando StandardScaler
✓ StandardScaler aplicado a toda la base
Columnas disponibles:
['referencia_del_contrato', 'valor_del_contrato', 'valor_del_contrato_num', 'departamento', 'tipo_de_contrato', 'fecha_de_firma', 'fecha_de_firma_ts', 'duraci_n_del_contrato', 'proveedor_adjudicado', 'estado_contrato', 'departamento_idx', 'tipo_de_contrato_idx', 'estado_contrato_idx', 'departamento_vec', 'tipo_de_contrato_vec', 'estado_contrato_vec', 'features_raw', 'features_scaled']


[Stage 8:>                                                          (0 + 1) / 1]

+-----------------------+------------------+----------------------+------------+--------------------+--------------------+-------------------+---------------------+--------------------+---------------+----------------+--------------------+-------------------+----------------+--------------------+-------------------+--------------------+--------------------+
|referencia_del_contrato|valor_del_contrato|valor_del_contrato_num|departamento|    tipo_de_contrato|      fecha_de_firma|  fecha_de_firma_ts|duraci_n_del_contrato|proveedor_adjudicado|estado_contrato|departamento_idx|tipo_de_contrato_idx|estado_contrato_idx|departamento_vec|tipo_de_contrato_vec|estado_contrato_vec|        features_raw|     features_scaled|
+-----------------------+------------------+----------------------+------------+--------------------+--------------------+-------------------+---------------------+--------------------+---------------+----------------+--------------------+-------------------+----------------+----

                                                                                

In [6]:
import numpy as np
import pandas as pd

# Pull a small, seeded sample of both vector columns into a pandas frame.
sample_df = (
    df_scaled
    .select("features_raw", "features_scaled")
    .sample(fraction=0.01, seed=42)
    .limit(1000)
    .toPandas()
)

# Densify every sampled vector so each one becomes a row of a NumPy array.
raw_matrix = np.array([vec.toArray() for vec in sample_df["features_raw"]])
scaled_matrix = np.array([vec.toArray() for vec in sample_df["features_scaled"]])

# The statistics below pool every entry of each matrix together
# (all sampled rows and all features), not one value per feature.
for header, matrix in (
    ("ANTES (features_raw):", raw_matrix),
    ("\nDESPUÉS (features_scaled):", scaled_matrix),
):
    print(header)
    print(f"Min:  {matrix.min():.2f}")
    print(f"Max:  {matrix.max():.2f}")
    print(f"Mean: {matrix.mean():.2f}")
    print(f"Std:  {matrix.std():.2f}")


ANTES (features_raw):
Min:  0.00
Max:  75952011382.00
Mean: 4756215.14
Std:  368580743.71

DESPUÉS (features_scaled):
Min:  0.00
Max:  69.01
Mean: 0.18
Std:  1.12


In [7]:
# Display the scaled feature vector of the first row.
df_scaled.select("features_scaled").first()["features_scaled"]

SparseVector(62, {0: 0.001, 5: 4.5961, 35: 2.2189, 57: 3.9607})

### **Configurar PCA y elegir número de componentes**

In [8]:
# Read one scaled vector and use its length as the feature dimensionality.
sample_vec = df_scaled.select("features_scaled").first()["features_scaled"]
num_features = len(sample_vec)

print(f"Número total de features: {num_features}")


Número total de features: 62


In [9]:
# Cap the number of principal components at 30, or at the vector length
# when the feature vector is shorter than that.
k = min(30, num_features)
print(f"Usando k = {k} componentes para PCA")

# PCA reads the scaled vectors and writes the projection to "features_pca".
pca = PCA(k=k, inputCol="features_scaled", outputCol="features_pca")
print("✓ PCA configurado correctamente")

Usando k = 30 componentes para PCA
✓ PCA configurado correctamente


26/02/08 20:26:43 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [10]:
print("Aplicando PCA")

# Fit PCA on the scaled vectors and append the projected "features_pca" column.
pca_model = pca.fit(df_scaled)
df_pca = pca_model.transform(df_scaled)

print(
    f"Dimensión original: {num_features}",
    f"Dimensión reducida: {k}",
    sep="\n",
)

# Display the projected vector of the first row.
df_pca.select("features_pca").first()["features_pca"]

Aplicando PCA


26/02/08 20:26:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Dimensión original: 62
Dimensión reducida: 30


DenseVector([0.4892, -1.1825, 1.2005, 1.7628, 0.3237, -0.0634, -0.4746, 1.0248, 1.6538, 0.553, 0.4223, 0.8691, 0.1912, 1.1301, 0.8073, -0.3011, 2.3324, -1.0573, -0.8781, -0.792, 0.3256, 0.2192, 0.9891, 0.381, 0.2194, -0.3203, 0.0276, -0.0529, 0.2866, 0.1348])

### **Analizar varianza explicada por componente**

In [11]:
# Explained-variance values reported by the fitted PCA model, one per component.
explained_variance = pca_model.explainedVariance

print("\nVARIANZA EXPLICADA POR COMPONENTE")

# Keep a running total so each line shows both the individual and the
# accumulated share as percentages.
cumulative_variance = 0
for component, ratio in enumerate(explained_variance, start=1):
    cumulative_variance += ratio
    print(f"Componente {component}: {ratio*100:.2f}% | Acumulada: {cumulative_variance*100:.2f}%")


VARIANZA EXPLICADA POR COMPONENTE
Componente 1: 3.75% | Acumulada: 3.75%
Componente 2: 2.67% | Acumulada: 6.42%
Componente 3: 2.27% | Acumulada: 8.69%
Componente 4: 2.14% | Acumulada: 10.83%
Componente 5: 2.06% | Acumulada: 12.89%
Componente 6: 1.96% | Acumulada: 14.85%
Componente 7: 1.93% | Acumulada: 16.78%
Componente 8: 1.85% | Acumulada: 18.64%
Componente 9: 1.84% | Acumulada: 20.47%
Componente 10: 1.80% | Acumulada: 22.27%
Componente 11: 1.74% | Acumulada: 24.00%
Componente 12: 1.72% | Acumulada: 25.73%
Componente 13: 1.71% | Acumulada: 27.44%
Componente 14: 1.71% | Acumulada: 29.14%
Componente 15: 1.70% | Acumulada: 30.84%
Componente 16: 1.69% | Acumulada: 32.52%
Componente 17: 1.68% | Acumulada: 34.21%
Componente 18: 1.68% | Acumulada: 35.88%
Componente 19: 1.67% | Acumulada: 37.55%
Componente 20: 1.66% | Acumulada: 39.22%
Componente 21: 1.66% | Acumulada: 40.88%
Componente 22: 1.66% | Acumulada: 42.53%
Componente 23: 1.66% | Acumulada: 44.19%
Componente 24: 1.65% | Acumulada: 

### **Integrar todo en un Pipeline completo**

In [12]:
# Chain the scaler and PCA estimators into one pipeline and fit it on the
# original (unscaled) frame, so both stages are learned in a single pass.
pipeline_transform = Pipeline(stages=[scaler, pca])
pipeline_transform_model = pipeline_transform.fit(df)
df_final = pipeline_transform_model.transform(df)

# Keep only the PCA vector and the numeric contract value column.
df_ml = df_final.select(
    "features_pca",
    "valor_del_contrato_num",
)



### **Guardar rutas**

In [13]:
# Destination paths for the fitted pipeline and the ML-ready dataset.
pipeline_path = "/opt/spark-data/raw/transformation_pipeline"  # full fitted pipeline
output_path = "/opt/spark-data/raw/secop_ml_ready.parquet"

# Write the dataset, replacing any output left by a previous run.
df_ml.write.mode("overwrite").parquet(output_path)

# Save the fitted pipeline in overwrite mode as well. A plain
# `.save(path)` raises when the path already exists, which would make
# this cell fail the second time the notebook is run end to end.
pipeline_transform_model.write().overwrite().save(pipeline_path)

                                                                                

In [14]:
# Shut down the Spark session used by this notebook.
spark.stop()