# Construcción de Pipeline del Modelo

## Importaciones

In [1]:
# Manejo de datos y operaciones
import numpy as np
import pandas as pd

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesamiento de datos
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE


# Modelos de machine learning
from sklearn.ensemble import RandomForestClassifier

# Métricas
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Manejo de archivos
import os
from pathlib import Path

# Miscellaneous
import warnings
warnings.filterwarnings('ignore') 



## Lectura de los Datos

In [2]:
# Read the staged training set into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
train_csv = "../stage/train.csv"
df = pd.read_csv(train_csv)

In [3]:
# The label (y) is the 'target' column; the features (X) are every other column.
y = df['target']
X = df.drop(columns=['target'])

# First split: hold out 20% of all rows as the test set, stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: take 20% of the remaining rows as the validation set (again
# stratified), which leaves roughly 64% / 16% / 20% for train / validation / test.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42, stratify=y_trainval
)

In [4]:
# --- Random Forest inputs (no scaling; SMOTE on the training split only) ---

# Validation and test sets are used exactly as split: no scaling, no resampling.
x_rf_val = X_val
y_rf_val = y_val
x_rf_test = X_test
y_rf_test = y_test

# Oversample the training split with a seeded SMOTE instance.
smote = SMOTE(random_state=42)
x_rf_train, y_rf_train = smote.fit_resample(X_train, y_train)

## Construcción del Pipeline

In [5]:
class LabelEncoderWrapper(BaseEstimator, TransformerMixin):
    """Label-encode every column of a DataFrame with its own ``LabelEncoder``.

    ``LabelEncoder`` works on a single 1-D sequence, so this transformer keeps
    one fitted encoder per column. That lets it be used as a step inside a
    ``Pipeline`` / ``ColumnTransformer``.

    Attributes
    ----------
    encoders : dict
        Maps each column name seen in the most recent ``fit`` call to its
        fitted ``LabelEncoder``.
    """

    def __init__(self):
        # The attribute name is kept as-is (no trailing underscore) so that
        # pipelines pickled with the earlier definition still expose it.
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per column of ``X``.

        Parameters
        ----------
        X : DataFrame-like
            Must expose ``.columns`` and support ``X[col]`` lookup.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        LabelEncoderWrapper
            ``self``, to allow call chaining.
        """
        # Rebuild the mapping from scratch: a refit on a different set of
        # columns must not leave encoders from a previous fit behind.
        self.encoders = {col: LabelEncoder().fit(X[col]) for col in X.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column replaced by its integer codes.

        Parameters
        ----------
        X : DataFrame-like
            Columns must be a subset of the columns seen during ``fit``.

        Returns
        -------
        Same type as ``X.copy()``
            A copy of ``X``; the input itself is not modified.

        Raises
        ------
        KeyError
            If ``X`` contains a column that has no fitted encoder.
        ValueError
            If a column contains a label its encoder did not see during
            ``fit``; the message names the offending column.
        """
        unfitted = [col for col in X.columns if col not in self.encoders]
        if unfitted:
            raise KeyError(f"No fitted encoder for column(s): {unfitted}")

        X_copy = X.copy()
        for col in X.columns:
            try:
                X_copy[col] = self.encoders[col].transform(X[col])
            except ValueError as exc:
                # Same exception type as before, but say which column failed.
                raise ValueError(f"Column {col!r}: {exc}") from exc
        return X_copy

In [6]:
# Split the feature columns by dtype: 'object' columns are treated as
# categorical, every other column as numerical.
categorical_features = X.select_dtypes(include='object').columns
numerical_features = X.select_dtypes(exclude='object').columns

# Step 1: categorical columns go through the per-column label-encoder wrapper.
categorical_transformer = Pipeline([('label_encoder', LabelEncoderWrapper())])

# Step 2: route the categorical columns through that transformer; all remaining
# columns are passed through unchanged.
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)

# Step 3: full pipeline = preprocessing followed by a seeded random forest.
rf_classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

# Fit on the SMOTE-resampled training data.
pipeline.fit(x_rf_train, y_rf_train)

# Predict on the held-out test split and report the metrics.
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

Accuracy: 0.8433001107419712
Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.50      0.46       246
           1       0.92      0.90      0.91      1560

    accuracy                           0.84      1806
   macro avg       0.68      0.70      0.69      1806
weighted avg       0.85      0.84      0.85      1806



In [7]:
# Persist the fitted pipeline so it can be deployed later.
# NOTE(review): the pipeline contains LabelEncoderWrapper, which is defined in
# this notebook. Pickle-based serialization stores classes by reference, so the
# loading environment presumably needs the same class definition available
# under the same module path - confirm before deploying.
joblib.dump(value=pipeline, filename='../../deploy/rf_pipeline.joblib')

['../../deploy/rf_pipeline.joblib']