In [1]:
# %%
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, PCA, VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# %%
spark = SparkSession.builder \
    .appName("SECOP_Transformaciones") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()

print(f"Spark conectado a: {spark.sparkContext.master}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/14 03:15:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark conectado a: spark://spark-master:7077


In [2]:
# Cargar datos transformados del notebook anterior
df = spark.read.parquet("/opt/spark-data/raw/secop_features.parquet")
print(f"Registros: {df.count():,}")
print(f"Columnas: {len(df.columns)}")

                                                                                

Registros: 100,000
Columnas: 18


In [3]:
df.printSchema()

root
 |-- referencia_del_contrato: string (nullable = true)
 |-- valor_del_contrato: string (nullable = true)
 |-- valor_del_contrato_num: double (nullable = true)
 |-- departamento: string (nullable = true)
 |-- tipo_de_contrato: string (nullable = true)
 |-- fecha_de_firma: string (nullable = true)
 |-- fecha_de_firma_ts: timestamp (nullable = true)
 |-- duraci_n_del_contrato: string (nullable = true)
 |-- proveedor_adjudicado: string (nullable = true)
 |-- estado_contrato: string (nullable = true)
 |-- valor_del_contrato_log: double (nullable = true)
 |-- departamento_idx: double (nullable = true)
 |-- tipo_de_contrato_idx: double (nullable = true)
 |-- estado_contrato_idx: double (nullable = true)
 |-- departamento_vec: vector (nullable = true)
 |-- tipo_de_contrato_vec: vector (nullable = true)
 |-- estado_contrato_vec: vector (nullable = true)
 |-- features_raw: vector (nullable = true)



### **Analizar por que normalizar (examinar escalas)**

 - Tomar una muestra de los vectores
 - Inspeccionar los primeros valores del vector

In [4]:
sample = df.select("features_raw").limit(5).collect()
for i, row in enumerate(sample, start=1):
    features_array = row["features_raw"].toArray()
    print(f"\nRegistro {i} - Primeros 10 valores del vector:")
    print(features_array[:10])
    


Registro 1 - Primeros 10 valores del vector:
[16.92902591  0.          0.          0.          0.          1.
  0.          0.          0.          0.        ]

Registro 2 - Primeros 10 valores del vector:
[17.12945439  0.          0.          0.          0.          0.
  0.          0.          0.          1.        ]

Registro 3 - Primeros 10 valores del vector:
[17.90985514  0.          0.          1.          0.          0.
  0.          0.          0.          0.        ]

Registro 4 - Primeros 10 valores del vector:
[17.67360461  0.          0.          0.          0.          0.
  0.          0.          0.          0.        ]

Registro 5 - Primeros 10 valores del vector:
[17.12455614  0.          0.          0.          1.          0.
  0.          0.          0.          0.        ]


26/02/14 03:15:36 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


### **Comparar antes y despues de StandardScaler**

In [7]:
scaler = StandardScaler(inputCol="features_raw",
                       outputCol = "features_scaled",
                       withMean=False,
                       withStd=True)

In [8]:
print("Aplicando StandardScaler")
scaler_model = scaler.fit(df)
df_scaled=scaler_model.transform(df)
print("✓ StandardScaler aplicado a toda la base")
print("Columnas disponibles:")
print(df_scaled.columns)
df_scaled.show()

Aplicando StandardScaler
✓ StandardScaler aplicado a toda la base
Columnas disponibles:
['referencia_del_contrato', 'valor_del_contrato', 'valor_del_contrato_num', 'departamento', 'tipo_de_contrato', 'fecha_de_firma', 'fecha_de_firma_ts', 'duraci_n_del_contrato', 'proveedor_adjudicado', 'estado_contrato', 'valor_del_contrato_log', 'departamento_idx', 'tipo_de_contrato_idx', 'estado_contrato_idx', 'departamento_vec', 'tipo_de_contrato_vec', 'estado_contrato_vec', 'features_raw', 'features_scaled']


[Stage 8:>                                                          (0 + 1) / 1]

+-----------------------+------------------+----------------------+--------------------+--------------------+--------------------+-------------------+---------------------+--------------------+---------------+----------------------+----------------+--------------------+-------------------+----------------+--------------------+-------------------+--------------------+--------------------+
|referencia_del_contrato|valor_del_contrato|valor_del_contrato_num|        departamento|    tipo_de_contrato|      fecha_de_firma|  fecha_de_firma_ts|duraci_n_del_contrato|proveedor_adjudicado|estado_contrato|valor_del_contrato_log|departamento_idx|tipo_de_contrato_idx|estado_contrato_idx|departamento_vec|tipo_de_contrato_vec|estado_contrato_vec|        features_raw|     features_scaled|
+-----------------------+------------------+----------------------+--------------------+--------------------+--------------------+-------------------+---------------------+--------------------+---------------+---------

                                                                                

In [10]:
import numpy as np
import pandas as pd

sample_df = (
    df_scaled
    .select("features_raw", "features_scaled")
    .sample(fraction=0.01, seed=42)
    .limit(1000)
    .toPandas()
)

raw_matrix = np.array([
    row['features_raw'].toArray() for _, row in sample_df.iterrows()
])

scaled_matrix = np.array([
    row['features_scaled'].toArray() for _, row in sample_df.iterrows()
])


print("ANTES (features_raw):")
print(f"Min:  {raw_matrix.min():.2f}")
print(f"Max:  {raw_matrix.max():.2f}")
print(f"Mean: {raw_matrix.mean():.2f}")
print(f"Std:  {raw_matrix.std():.2f}")
print("\nDESPUÉS (features_scaled):")
print(f"Min:  {scaled_matrix.min():.2f}")
print(f"Max:  {scaled_matrix.max():.2f}")
print(f"Mean: {scaled_matrix.mean():.2f}")
print(f"Std:  {scaled_matrix.std():.2f}")


ANTES (features_raw):
Min:  0.00
Max:  24.27
Mean: 0.34
Std:  2.23

DESPUÉS (features_scaled):
Min:  0.00
Max:  72.55
Mean: 0.35
Std:  1.57


In [11]:
df_scaled.select("features_scaled").first()[0]

SparseVector(58, {0: 9.0989, 5: 4.424, 35: 3.2482, 52: 2.0109})

### **Configurar PCA y elegir numero de componentes**

In [12]:
sample_vec = df_scaled.select("features_scaled").first()[0]
num_features = len(sample_vec)

print(f"Número total de features: {num_features}")


Número total de features: 58


In [13]:
k = min(30, num_features)
print(f"Usando k = {k} componentes para PCA")


pca= PCA( k=k,
         inputCol="features_scaled",
         outputCol="features_pca")
print("✓ PCA configurado correctamente")

Usando k = 30 componentes para PCA
✓ PCA configurado correctamente


In [14]:
print("Aplicando PCA")

pca_model = pca.fit(df_scaled)
df_pca = pca_model.transform(df_scaled)

print(f"Dimensión original: {num_features}")
print(f"Dimensión reducida: {k}")

df_pca.select("features_pca").first()[0]

Aplicando PCA


26/02/14 03:18:26 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Dimensión original: 58
Dimensión reducida: 30


DenseVector([-2.8319, -0.895, -5.5373, 2.6422, -0.2096, 0.3457, -1.139, -0.3654, -2.3931, 1.068, -0.7456, -1.1586, -0.948, -1.9727, 0.0143, 1.1813, -0.331, 0.0509, -0.5205, 0.0523, 0.0682, 0.0423, -0.1576, 0.3574, -0.0888, 0.1018, 0.2162, 0.1634, -0.1822, -0.4387])

### **Analizar varianza explicada por componente**

In [15]:
explained_variance = pca_model.explainedVariance

print("\nVARIANZA EXPLICADA POR COMPONENTE")
cumulative_variance = 0

for i, var in enumerate(explained_variance):
    cumulative_variance += var
    print(
        f"Componente {i+1}: "
        f"{var*100:.2f}% | "
        f"Acumulada: {cumulative_variance*100:.2f}%"
    )


VARIANZA EXPLICADA POR COMPONENTE
Componente 1: 3.88% | Acumulada: 3.88%
Componente 2: 3.24% | Acumulada: 7.12%
Componente 3: 2.66% | Acumulada: 9.78%
Componente 4: 2.19% | Acumulada: 11.97%
Componente 5: 2.12% | Acumulada: 14.09%
Componente 6: 1.92% | Acumulada: 16.01%
Componente 7: 1.91% | Acumulada: 17.91%
Componente 8: 1.88% | Acumulada: 19.79%
Componente 9: 1.85% | Acumulada: 21.64%
Componente 10: 1.83% | Acumulada: 23.47%
Componente 11: 1.81% | Acumulada: 25.28%
Componente 12: 1.81% | Acumulada: 27.09%
Componente 13: 1.80% | Acumulada: 28.89%
Componente 14: 1.80% | Acumulada: 30.68%
Componente 15: 1.79% | Acumulada: 32.47%
Componente 16: 1.78% | Acumulada: 34.25%
Componente 17: 1.77% | Acumulada: 36.02%
Componente 18: 1.77% | Acumulada: 37.80%
Componente 19: 1.77% | Acumulada: 39.56%
Componente 20: 1.77% | Acumulada: 41.33%
Componente 21: 1.76% | Acumulada: 43.10%
Componente 22: 1.76% | Acumulada: 44.86%
Componente 23: 1.76% | Acumulada: 46.62%
Componente 24: 1.76% | Acumulada: 

### **Integrar todo en un Pipeline completo**

In [17]:
pipeline_transform = Pipeline(stages=[scaler, pca])
pipeline_transform_model=pipeline_transform.fit(df)
df_final=pipeline_transform_model.transform(df)

df_ml=df_final.select("features_pca","valor_del_contrato_log")



### **Guardar rutas**

In [18]:
pipeline_path= "/opt/spark-data/raw/transformation_pipeline" # pipeline completo
output_path= "/opt/spark-data/raw/secop_ml_ready.parquet"
df_ml.write.mode("overwrite").parquet(output_path)
pipeline_transform_model.save(pipeline_path)

                                                                                

In [19]:
spark.stop()