In [1]:
# %%
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, PCA, VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# %%
# Create (or reuse) the Spark session for this notebook, pointing at the
# cluster master and giving each executor 2 GB of memory.
spark = (
    SparkSession.builder
    .appName("SECOP_Transformaciones")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Echo the master URL the session is attached to as a quick sanity check.
print(f"Spark conectado a: {spark.sparkContext.master}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/13 15:09:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark conectado a: spark://spark-master:7077


In [2]:
# Load the transformed data produced by the previous notebook.
df = spark.read.parquet("/opt/spark-data/raw/secop_features_1.parquet")

# Report the table size; the row count triggers a Spark job.
row_total = df.count()
column_total = len(df.columns)
print(f"Registros: {row_total:,}")
print(f"Columnas: {column_total}")

26/02/13 15:09:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
[Stage 1:>                                                          (0 + 2) / 2]

Registros: 100,000
Columnas: 18


                                                                                

In [5]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- referencia_del_contrato: string (nullable = true)
 |-- valor_del_contrato: string (nullable = true)
 |-- valor_del_contrato_num: double (nullable = true)
 |-- departamento: string (nullable = true)
 |-- tipo_de_contrato: string (nullable = true)
 |-- fecha_de_firma: string (nullable = true)
 |-- fecha_de_firma_ts: timestamp (nullable = true)
 |-- duraci_n_del_contrato: string (nullable = true)
 |-- proveedor_adjudicado: string (nullable = true)
 |-- estado_contrato: string (nullable = true)
 |-- departamento_idx: double (nullable = true)
 |-- tipo_de_contrato_idx: double (nullable = true)
 |-- estado_contrato_idx: double (nullable = true)
 |-- departamento_vec: vector (nullable = true)
 |-- tipo_de_contrato_vec: vector (nullable = true)
 |-- estado_contrato_vec: vector (nullable = true)
 |-- features_raw: vector (nullable = true)
 |-- valor_del_contrato_log: double (nullable = true)



### **Analizar por qué normalizar (examinar escalas)**

 - Tomar una muestra de los vectores
 - Inspeccionar los primeros valores del vector

In [3]:
# Collect five rows on the driver and print the leading entries of each raw
# feature vector so the scale of the individual features can be inspected.
sample = df.select("features_raw").limit(5).collect()
for idx in range(len(sample)):
    leading_values = sample[idx]["features_raw"].toArray()[:10]
    print(f"\nRegistro {idx + 1} - Primeros 10 valores del vector:")
    print(leading_values)
    


Registro 1 - Primeros 10 valores del vector:
[15.33257021  0.          0.          0.          0.          1.
  0.          0.          0.          0.        ]

Registro 2 - Primeros 10 valores del vector:
[15.22160779  0.          0.          0.          0.          1.
  0.          0.          0.          0.        ]

Registro 3 - Primeros 10 valores del vector:
[15.3393908  0.         0.         0.         0.         1.
  0.         0.         0.         0.       ]

Registro 4 - Primeros 10 valores del vector:
[16.09707639  0.          0.          0.          0.          1.
  0.          0.          0.          0.        ]

Registro 5 - Primeros 10 valores del vector:
[15.22160779  0.          0.          0.          0.          1.
  0.          0.          0.          0.        ]


### **Comparar antes y después de StandardScaler**

In [5]:
# Unfitted scaler: divides each feature by its standard deviation
# (withStd=True) without subtracting the mean (withMean=False), reading
# features_raw and writing features_scaled.
scaler = StandardScaler(
    withStd=True,
    withMean=False,
    inputCol="features_raw",
    outputCol="features_scaled",
)

In [6]:
# Fit the scaler on the full DataFrame, then append the scaled column.
print("Aplicando StandardScaler")
scaler_model = scaler.fit(df)
df_scaled = scaler_model.transform(df)

# Confirm the new column exists and preview the transformed rows.
print("✓ StandardScaler aplicado a toda la base")
print("Columnas disponibles:", df_scaled.columns, sep="\n")
df_scaled.show()

Aplicando StandardScaler


                                                                                

✓ StandardScaler aplicado a toda la base
Columnas disponibles:
['referencia_del_contrato', 'valor_del_contrato', 'valor_del_contrato_num', 'departamento', 'tipo_de_contrato', 'fecha_de_firma', 'fecha_de_firma_ts', 'duraci_n_del_contrato', 'proveedor_adjudicado', 'estado_contrato', 'valor_del_contrato_log', 'departamento_idx', 'tipo_de_contrato_idx', 'estado_contrato_idx', 'departamento_vec', 'tipo_de_contrato_vec', 'estado_contrato_vec', 'features_raw', 'features_scaled']


[Stage 8:>                                                          (0 + 1) / 1]

+-----------------------+------------------+----------------------+------------+--------------------+--------------------+-------------------+---------------------+--------------------+---------------+----------------------+----------------+--------------------+-------------------+----------------+--------------------+-------------------+--------------------+--------------------+
|referencia_del_contrato|valor_del_contrato|valor_del_contrato_num|departamento|    tipo_de_contrato|      fecha_de_firma|  fecha_de_firma_ts|duraci_n_del_contrato|proveedor_adjudicado|estado_contrato|valor_del_contrato_log|departamento_idx|tipo_de_contrato_idx|estado_contrato_idx|departamento_vec|tipo_de_contrato_vec|estado_contrato_vec|        features_raw|     features_scaled|
+-----------------------+------------------+----------------------+------------+--------------------+--------------------+-------------------+---------------------+--------------------+---------------+----------------------+----------

                                                                                

In [7]:
import numpy as np
import pandas as pd

# Bring a seeded 1% sample (capped at 1000 rows) of both vector columns
# to the driver as a pandas DataFrame.
sample_df = (
    df_scaled
    .select("features_raw", "features_scaled")
    .sample(fraction=0.01, seed=42)
    .limit(1000)
    .toPandas()
)

# Densify every vector so each column becomes a 2-D NumPy matrix
# with one row per sampled record.
raw_matrix = np.array([vec.toArray() for vec in sample_df["features_raw"]])
scaled_matrix = np.array([vec.toArray() for vec in sample_df["features_scaled"]])

# Print the same summary for both matrices. Each statistic is taken over
# ALL entries of the matrix (every feature pooled together), not per feature.
for heading, matrix in (
    ("ANTES (features_raw):", raw_matrix),
    ("\nDESPUÉS (features_scaled):", scaled_matrix),
):
    print(heading)
    print(f"Min:  {matrix.min():.2f}")
    print(f"Max:  {matrix.max():.2f}")
    print(f"Mean: {matrix.mean():.2f}")
    print(f"Std:  {matrix.std():.2f}")


ANTES (features_raw):
Min:  0.00
Max:  25.05
Mean: 0.31
Std:  2.08

DESPUÉS (features_scaled):
Min:  0.00
Max:  69.01
Mean: 0.28
Std:  1.36


In [8]:
# Display the scaled feature vector of the first row as the cell output.
df_scaled.select("features_scaled").first()[0]

                                                                                

SparseVector(62, {0: 5.9713, 5: 4.5961, 35: 2.2189, 57: 3.9607})

### **Configurar PCA y elegir número de componentes**

In [9]:
# Fetch one scaled vector and use its length as the total feature count.
sample_vec = df_scaled.select("features_scaled").first()["features_scaled"]
num_features = len(sample_vec)

print(f"Número total de features: {num_features}")


Número total de features: 62


In [10]:
# Use 30 principal components, or the feature count when it is smaller.
k = num_features if num_features < 30 else 30
print(f"Usando k = {k} componentes para PCA")

# Unfitted PCA estimator: reads features_scaled and writes features_pca.
pca = PCA(
    inputCol="features_scaled",
    outputCol="features_pca",
    k=k,
)
print("✓ PCA configurado correctamente")

Usando k = 30 componentes para PCA
✓ PCA configurado correctamente


In [12]:
print("Aplicando PCA")

# Fit PCA on the scaled features, then project every row onto the components.
pca_model = pca.fit(df_scaled)
df_pca = pca_model.transform(df_scaled)

print(f"Dimensión original: {num_features}", f"Dimensión reducida: {k}", sep="\n")

# Show the projected vector of the first row as the cell output.
df_pca.select("features_pca").first()["features_pca"]

Aplicando PCA


26/02/13 15:13:26 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Dimensión original: 62
Dimensión reducida: 30


DenseVector([-1.9254, 0.9076, 1.9437, -1.3567, 1.789, -1.5397, 0.6821, 0.383, 2.7299, 0.4, 0.5641, -1.5713, 1.1841, -0.4824, 0.4183, 1.4407, 1.6349, -1.3505, 0.1382, 0.7225, -0.3971, -0.7564, 0.2081, 0.2441, -0.0689, 0.443, -0.2102, -0.5027, 0.3227, 0.0605])

### **Analizar varianza explicada por componente**

In [13]:
# Per-component share of variance reported by the fitted PCA model.
explained_variance = pca_model.explainedVariance

print("\nVARIANZA EXPLICADA POR COMPONENTE")

# Walk the components in order, keeping a running total of the variance share.
cumulative_variance = 0
for component_number, share in enumerate(explained_variance, start=1):
    cumulative_variance = cumulative_variance + share
    report_line = (
        f"Componente {component_number}: "
        f"{share*100:.2f}% | "
        f"Acumulada: {cumulative_variance*100:.2f}%"
    )
    print(report_line)


VARIANZA EXPLICADA POR COMPONENTE
Componente 1: 3.89% | Acumulada: 3.89%
Componente 2: 2.68% | Acumulada: 6.56%
Componente 3: 2.37% | Acumulada: 8.93%
Componente 4: 2.26% | Acumulada: 11.19%
Componente 5: 2.09% | Acumulada: 13.27%
Componente 6: 2.04% | Acumulada: 15.31%
Componente 7: 1.94% | Acumulada: 17.25%
Componente 8: 1.91% | Acumulada: 19.16%
Componente 9: 1.81% | Acumulada: 20.97%
Componente 10: 1.80% | Acumulada: 22.77%
Componente 11: 1.73% | Acumulada: 24.50%
Componente 12: 1.72% | Acumulada: 26.22%
Componente 13: 1.71% | Acumulada: 27.93%
Componente 14: 1.70% | Acumulada: 29.63%
Componente 15: 1.69% | Acumulada: 31.32%
Componente 16: 1.69% | Acumulada: 33.01%
Componente 17: 1.68% | Acumulada: 34.69%
Componente 18: 1.67% | Acumulada: 36.36%
Componente 19: 1.67% | Acumulada: 38.03%
Componente 20: 1.67% | Acumulada: 39.70%
Componente 21: 1.66% | Acumulada: 41.37%
Componente 22: 1.66% | Acumulada: 43.02%
Componente 23: 1.65% | Acumulada: 44.68%
Componente 24: 1.65% | Acumulada: 

### **Integrar todo en un Pipeline completo**

In [14]:
# Chain the scaler and PCA estimators into a single pipeline, fit it on the
# loaded DataFrame and apply the fitted pipeline to the same data.
pipeline_transform = Pipeline(stages=[scaler, pca])
pipeline_transform_model = pipeline_transform.fit(df)
df_final = pipeline_transform_model.transform(df)

# Keep only the PCA features and the log-valued contract column
# (presumably the features/label pair for a downstream model).
# NOTE(review): the first entry of features_raw printed earlier (~15.3) looks
# like a log-scaled contract value; if it is valor_del_contrato_log, the label
# is leaking into features_pca -- confirm against the upstream notebook.
ml_columns = ["features_pca", "valor_del_contrato_log"]
df_ml = df_final.select(*ml_columns)



                                                                                

### **Guardar el dataset transformado y el pipeline**

In [18]:
# Destination paths for the fitted pipeline and the reduced dataset.
pipeline_path = "/opt/spark-data/raw/transformation_pipeline_1"  # full fitted pipeline
output_path = "/opt/spark-data/raw/secop_ml_ready1.parquet"

# Persist both artifacts in overwrite mode. A plain `model.save(path)` raises
# when the path already exists, so the cell would fail on any re-run even
# though the parquet write succeeds; going through the writer with
# `.overwrite()` makes both writes consistent and the cell re-runnable.
df_ml.write.mode("overwrite").parquet(output_path)
pipeline_transform_model.write().overwrite().save(pipeline_path)

                                                                                

In [20]:
# Stop the Spark session; no Spark operations can run after this cell.
spark.stop()