In [3]:
import logging
from data_loader import load_data
from data_cleaning import clean_data
from data_transformation import transform_data
from feature_engineering import transform_and_split_data
import pandas as pd
import scipy.sparse
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Default destination for the processed splits: two levels above the current
# working directory, under data/processed.
# NOTE(review): resolved from the working directory at the time this cell
# runs, so it depends on where the kernel was started.
OUTPUT_DIR = Path().resolve().parent.parent.joinpath("data", "processed")


def _save_feature_matrix(X, output_dir, split_name):
    """Write one feature matrix as ``X_<split_name>.npz`` or ``.parquet``."""
    if scipy.sparse.issparse(X):
        # issparse covers both sparse matrices and sparse arrays.
        scipy.sparse.save_npz(str(output_dir / f"X_{split_name}.npz"), X)
        return

    # Column labels are converted to strings because a bare ndarray yields
    # integer labels, which some pandas/engine combinations refuse to write.
    # rename() returns a new frame, so a DataFrame passed in is not mutated.
    frame = pd.DataFrame(X).rename(columns=str)
    frame.to_parquet(output_dir / f"X_{split_name}.parquet")


def save_data(X_train, X_val, X_test, y_train, y_val, y_test, output_dir=OUTPUT_DIR):
    """Persist the train/validation/test splits to ``output_dir``.

    Targets are written as ``y_<split>.csv`` (without the index). Each feature
    matrix is written as ``X_<split>.npz`` when it is sparse and as
    ``X_<split>.parquet`` otherwise.

    Args:
        X_train, X_val, X_test: Feature matrices (SciPy sparse, or anything
            ``pandas.DataFrame`` accepts).
        y_train, y_val, y_test: Targets exposing ``to_csv`` (pandas objects).
        output_dir: Destination directory (``str`` or ``Path``). It is created,
            including missing parents, when it does not exist yet.
    """
    output_dir = Path(output_dir)
    # Without this the first write fails on a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)

    targets = (("train", y_train), ("val", y_val), ("test", y_test))
    for split_name, target in targets:
        target.to_csv(output_dir / f"y_{split_name}.csv", index=False)

    features = (("train", X_train), ("val", X_val), ("test", X_test))
    for split_name, matrix in features:
        _save_feature_matrix(matrix, output_dir, split_name)

    logging.info(f"✅ Datos guardados en {output_dir}")


# def run_data_preprocessing_pipeline()

# Announce the start of the preprocessing run.
logging.info("Iniciando pipeline de preprocesamiento...")

# Step 1: read the raw CSV through the project loader and log its dimensions.
logging.info("📊 1. Carga de datos.")
data = load_data("clasificacion_siniestros.csv")
logging.info("✅ Datos cargados: %s filas y %s columnas.", data.shape[0], data.shape[1])

# Step 2: clean a copy of the loaded frame; `data` is rebound to the result.
logging.info("🧹 2. Limpieza de datos.")
data = clean_data(data.copy())
logging.info("✅ Proceso de limpieza finalizado.")


# Step 3: transform a copy of the cleaned frame; `data` is rebound again, so
# later cells see only the transformed frame under this name.
logging.info("🔄 3. Transformación de datos.")
data = transform_data(data.copy())
logging.info("✅ Proceso de transformación finalizado.")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

2025-03-02 12:59:52,882 - INFO - PyTorch version 2.4.1+cu124 available.
2025-03-02 12:59:52,882 - INFO - TensorFlow version 2.10.1 available.
2025-03-02 12:59:54,776 - INFO - Iniciando pipeline de preprocesamiento...
2025-03-02 12:59:54,776 - INFO - 📊 1. Carga de datos.
2025-03-02 12:59:54,781 - INFO - 	Cargando datos desde: D:\OneDrive - Universidad de La Salle\Maestría IA\S4\Desarrollo de soluciones\Proyecto\Repo desarrollo soluciones\data\raw\clasificacion_siniestros.csv
2025-03-02 12:59:55,550 - INFO - ✅ Datos cargados: 63164 filas y 66 columnas.
2025-03-02 12:59:55,560 - INFO - 🧹 2. Limpieza de datos.
2025-03-02 12:59:55,571 - INFO - 	Iniciando limpieza: (63164, 66) registros.
2025-03-02 12:59:55,904 - INFO - 	Columnas eliminadas por alto porcentaje de nulos (>50%): ['id_act_economica_igdacmlmasolicitudes', 'fecha_muerte_igatepmafurat', 'muerte_posterior_igatepmafurat', 'fecha_aviso_muerte_igatepmafurat']
2025-03-02 12:59:55,992 - INFO - 	Total registros eliminados por outliers: 2

In [4]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import torch

class HighCardinalityEncoder(BaseEstimator, TransformerMixin):
    """Frequency-encode the configured categorical columns.

    ``fit`` records, per column, the relative frequency of every value found
    in the training frame. ``transform`` appends a ``<col>_freq`` column with
    those frequencies and drops the original columns.

    Args:
        high_cardinality_cols: Names of the columns to encode.

    Attributes:
        mappings: ``{column: {value: relative_frequency}}`` learned by ``fit``;
            an empty dict before fitting.
    """

    def __init__(self, high_cardinality_cols):
        self.high_cardinality_cols = high_cardinality_cols
        self.mappings = {}

    def fit(self, X, y=None):
        """Learn the value frequencies of each configured column of ``X``."""
        # Build a fresh dict so a refit never keeps entries for columns that
        # are no longer configured.
        self.mappings = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in self.high_cardinality_cols
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns frequency-encoded.

        Values that were not seen during ``fit`` (missing values included,
        since ``value_counts`` skips them by default) are encoded as 0.
        """
        X = X.copy()
        for col in self.high_cardinality_cols:
            frequencies = self.mappings[col]
            # Casting to object first keeps categorical inputs from producing
            # a categorical result; the final cast guarantees a float column.
            X[col + '_freq'] = (
                X[col].astype(object).map(frequencies).fillna(0.0).astype(float)
            )
        return X.drop(columns=self.high_cardinality_cols)

class TextEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Turn a single text column into sentence embeddings.

    The SentenceTransformer model is loaded on first use rather than in
    ``__init__``. scikit-learn clones estimators by calling the constructor
    again, so loading eagerly made the model load once per clone.

    Args:
        model_name: Name of the SentenceTransformer model to load.

    Attributes:
        device: ``'cuda'`` when ``torch.cuda.is_available()``, else ``'cpu'``.
        model: The SentenceTransformer instance, created on first access.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self._model = None

    @property
    def model(self):
        """Return the SentenceTransformer, loading it on first access."""
        if getattr(self, '_model', None) is None:
            self._model = SentenceTransformer(self.model_name, device=self.device)
        return self._model

    @model.setter
    def model(self, value):
        # Keeps `transformer.model = ...` working as it did when `model` was a
        # plain attribute.
        self._model = value

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Encode the texts in ``X`` and return the embeddings as a NumPy array.

        ``X`` may be a DataFrame (its first column is used), a Series, or any
        array-like of texts (for 2-D input the first column is used). Missing
        values are replaced by the literal ``'missing'`` before encoding.

        If encoding raises, the error is logged with its traceback and an
        all-zero matrix of the model's embedding dimension is returned
        instead, so the calling pipeline keeps running.
        """
        # Reduce the input to a single Series of texts.
        if isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]
        elif not isinstance(X, pd.Series):
            values = np.asarray(X, dtype=object)
            if values.ndim > 1:
                values = values[:, 0]
            X = pd.Series(values)

        X = X.fillna('missing').astype(str)

        try:
            embeddings = self.model.encode(
                X.tolist(),
                convert_to_numpy=True,
                device=self.device,
                batch_size=32  # Adjust the batch size if needed
            )
        except Exception as e:
            # Local import: this cell does not import logging itself.
            import logging

            # NOTE(review): the zero fallback hides the failure from
            # downstream steps; confirm this best-effort behaviour is wanted.
            logging.exception(f"Error al procesar embeddings: {e}")
            embeddings = np.zeros((len(X), self.model.get_sentence_embedding_dimension()))

        return embeddings

def detect_column_types(df, target_col, high_cardinality_threshold=20,
                        text_col="descripcion_at_igatepmafurat"):
    """Classify the columns of ``df`` by the kind of preprocessing they need.

    Args:
        df: Input DataFrame.
        target_col: Name of the target column; never reported as a feature,
            whatever its dtype.
        high_cardinality_threshold: An object/category column with more than
            this many distinct values is reported as high-cardinality.
        text_col: Name of the free-text column; excluded from every other
            list and returned unchanged.

    Returns:
        Tuple ``(numerical_cols, categorical_cols, high_cardinality_cols,
        text_col)``. The first three are lists of column names in the order
        they appear in ``df``.
    """
    excluded = {text_col, target_col}

    # The exclusion applies to numeric columns too: a numeric target must not
    # end up among the features.
    numerical_cols = [
        col for col in df.select_dtypes(include=['number']).columns
        if col not in excluded
    ]
    candidate_cols = [
        col for col in df.select_dtypes(include=['object', 'category']).columns
        if col not in excluded
    ]

    high_cardinality_cols = [
        col for col in candidate_cols
        if df[col].nunique() > high_cardinality_threshold
    ]
    high_cardinality = set(high_cardinality_cols)
    categorical_cols = [col for col in candidate_cols if col not in high_cardinality]

    return numerical_cols, categorical_cols, high_cardinality_cols, text_col

def create_feature_engineering_pipeline(df):
    """Build the (unfitted) preprocessing pipeline for the columns of ``df``.

    Column groups come from ``detect_column_types``. Each group gets its own
    branch of a ``ColumnTransformer``; columns outside every group are dropped:

    * ``num``: standard scaling.
    * ``cat``: constant imputation with ``'missing'`` followed by one-hot
      encoding that ignores unknown categories.
    * ``high_card``: ``HighCardinalityEncoder``.
    * ``text``: ``TextEmbeddingTransformer`` on the text column.

    Returns:
        A ``Pipeline`` whose single step, ``'features'``, is that
        ``ColumnTransformer``.
    """
    num_cols, cat_cols, high_card_cols, text_column = detect_column_types(
        df, target_col='origen_igdactmlmacalificacionorigen'
    )

    scaling_branch = Pipeline([('scaler', StandardScaler())])
    one_hot_branch = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    frequency_branch = Pipeline([
        ('high_cardinality', HighCardinalityEncoder(high_card_cols)),
    ])
    embedding_branch = Pipeline([('embedder', TextEmbeddingTransformer())])

    # (name, transformer, columns) triples, in the order the outputs are stacked.
    branches = [
        ('num', scaling_branch, num_cols),
        ('cat', one_hot_branch, cat_cols),
        ('high_card', frequency_branch, high_card_cols),
        ('text', embedding_branch, [text_column]),
    ]
    column_transformer = ColumnTransformer(transformers=branches, remainder='drop')

    return Pipeline([('features', column_transformer)])

def split_data(df, target_column, test_size=0.2, val_size=0.1, random_state=42, stratify=False):
    """Split ``df`` into train, validation and test partitions.

    Args:
        df: DataFrame to split.
        target_column: Name of the target column. Only used when ``stratify``
            is true.
        test_size: Fraction of ``df`` assigned to the test partition.
        val_size: Fraction of ``df`` assigned to the validation partition.
        random_state: Seed passed to both underlying splits.
        stratify: When true, both splits are stratified on ``target_column``.
            Defaults to False, which keeps the previous unstratified behaviour.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.

    Raises:
        ValueError: If the sizes are not fractions in (0, 1) or leave no rows
            for training.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must be fractions in (0, 1) whose sum is "
            f"below 1; got test_size={test_size!r}, val_size={val_size!r}"
        )

    labels = df[target_column] if stratify else None
    remaining_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=labels
    )

    # val_size is a fraction of the full frame, so rescale it to the rows left
    # after the test partition has been removed.
    val_fraction = val_size / (1 - test_size)
    labels = remaining_df[target_column] if stratify else None
    train_df, val_df = train_test_split(
        remaining_df, test_size=val_fraction, random_state=random_state, stratify=labels
    )
    return train_df, val_df, test_df

def transform_and_split_data(df, target_column='origen_igdactmlmacalificacionorigen'):
    """Split ``df`` and run feature engineering on each partition.

    The pipeline is built from and fitted on the training features only, then
    applied to the validation and test features.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where the
        ``X_*`` items are the transformed feature matrices and the ``y_*``
        items are the ``target_column`` values of each partition.
    """
    # Partitions arrive in (train, val, test) order.
    partitions = split_data(df, target_column)
    train_features, val_features, test_features = (
        part.drop(columns=[target_column]) for part in partitions
    )
    train_target, val_target, test_target = (
        part[target_column] for part in partitions
    )

    feature_pipeline = create_feature_engineering_pipeline(train_features)

    train_matrix = feature_pipeline.fit_transform(train_features)
    val_matrix, test_matrix = (
        feature_pipeline.transform(part) for part in (val_features, test_features)
    )

    return train_matrix, train_target, val_matrix, val_target, test_matrix, test_target

# Run the split + feature engineering defined in this cell on a copy of `data`.
# The unpacking order matches transform_and_split_data's return order:
# (X_train, y_train, X_val, y_val, X_test, y_test).
x_train, y_train, x_val, y_val, x_test, y_test = transform_and_split_data(data.copy())


2025-03-02 13:00:01,802 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-03-02 13:00:05,140 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Batches:   0%|          | 0/1196 [00:00<?, ?it/s]

Batches:   0%|          | 0/171 [00:00<?, ?it/s]

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

In [6]:
# Sanity check: shape of the transformed training feature matrix.
x_train.shape


(38246, 487)

In [None]:

# Step 4: split the data and build the feature matrices.
# NOTE(review): the same call is already made in an earlier cell, so running
# both cells recomputes the text embeddings; consider keeping only one.
logging.info("🔧 4. Ingeniería de características y partición de datos")
x_train, y_train, x_val, y_val, x_test, y_test = transform_and_split_data(data.copy())
logging.info("✅ Proceso de ingeniería de características y partición de datos finalizado.")

# Step 5: persist the splits. Keywords make explicit that save_data takes all
# X matrices first and then all y targets, unlike the unpacking order above.
logging.info("📦 5. Guardando datos procesados.")
save_data(
    X_train=x_train,
    X_val=x_val,
    X_test=x_test,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
)

