### Pipelines de operaciones

Agrupar múltiples operaciones en un mismo objeto

* Imputar nulos
* Codificación de categóricos
* Escalado de datos
* Modelado

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


## Carga de datos

In [18]:
# Load seaborn's "penguins" example dataset and show how many missing
# values each column contains.
df = sns.load_dataset("penguins")
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

## Nulos en la columna de salida

In [None]:
# Replace the NaN entries of the target column with that column's median.
df['body_mass_g'] = (
    SimpleImputer(strategy='median', missing_values=np.nan)
    .fit_transform(df[['body_mass_g']])
)

## Preparar datos (X, y)

In [23]:
# Split the frame into the target to predict (y) and the model inputs (X):
# y is the body_mass_g column, X is every other column.
y = df['body_mass_g']
X = df.drop(columns='body_mass_g')


### Pipeline de columnas numéricas

In [24]:
# Numeric features: fill missing values with the median, then rescale them
# with MinMaxScaler.
pipeline_numeric = Pipeline(steps=[
    ('impute_median', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Names of the numeric columns of X; the last line displays them.
numeric_col_names = X.select_dtypes(include=np.number).columns.tolist()
numeric_col_names

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']

### Pipeline de columnas categóricas

In [25]:
# Categorical features: fill missing values with the most frequent category,
# then one-hot encode, dropping the first level of each feature.
pipeline_categorical = Pipeline([
    ('impute_mode', SimpleImputer(strategy='most_frequent')),
    # `sparse_output` replaces the `sparse` keyword (deprecated in
    # scikit-learn 1.2, removed in 1.4); False keeps the output dense.
    ('encoder', OneHotEncoder(drop='first', sparse_output=False)),
])

# Names of the object-dtype columns of X; the last line displays them.
categorical_col_names = X.select_dtypes(include='object').columns.to_list()
categorical_col_names

['species', 'island', 'sex']

### Pipeline numéricas + categóricas

In [27]:

# Route each group of columns through its own sub-pipeline.
# The variable name (spelled "prepocessor") is kept as-is because later
# cells reference it.
prepocessor = ColumnTransformer(transformers=[
    ('numerical', pipeline_numeric, numeric_col_names),
    ('categorical', pipeline_categorical, categorical_col_names),
])
prepocessor


### Pipeline numéricas + categóricas + modelado

In [29]:
# Full pipeline: column preprocessing followed by a linear regression.
pipeline = Pipeline(steps=[
    ('prepocessor', prepocessor),
    ('linear_regression', LinearRegression()),
])
pipeline

### Ejecución pipeline

In [34]:


# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Fit preprocessing and model together on the training rows only.
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Root mean squared error on the held-out rows. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, y_pred))



318.79717144386433

### Guardar pipeline

SE USA JOBLIB PARA GUARDAR MODELOS O PIPELINES

* `joblib.dump()`: ***guardar*** el objeto en disco
* `joblib.load()`: ***cargar*** el objeto desde disco

In [37]:
import joblib

# Persist the pipeline under two file names; the following cells load each
# of them back.
joblib.dump(value=pipeline, filename='pipeline.pkl')
joblib.dump(value=pipeline, filename='pipeline.joblib')


['pipeline.joblib']

### Cargar pipeline

In [36]:
# Restore the pipeline from the .pkl file and show its first ten
# predictions on the test rows.
pipeline = joblib.load(filename='pipeline.pkl')
pipeline.predict(X_test)[:10]

array([4006.44704878, 3446.65045285, 4652.08396283, 3267.32367752,
       4623.83146849, 5152.74949404, 5560.00223302, 4544.23407991,
       3655.96209525, 4644.37891153])

In [38]:
# Restore the pipeline from the .joblib file and show its first ten
# predictions on the test rows.
pipeline = joblib.load(filename='pipeline.joblib')
pipeline.predict(X_test)[:10]

array([4006.44704878, 3446.65045285, 4652.08396283, 3267.32367752,
       4623.83146849, 5152.74949404, 5560.00223302, 4544.23407991,
       3655.96209525, 4644.37891153])

### Crear transformador personalizado (avanzado)

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin

class Debugger(BaseEstimator, TransformerMixin):
    """Pass-through transformer that prints a trace line from fit and transform.

    The data is returned unchanged; the only effects are the printed lines
    and the ``param1`` attribute set during ``fit``.

    Parameters
    ----------
    message : str
        Text placed at the start of every printed trace line. With
        ``message='Hola'`` the output is ``'Hola desde fit'`` and
        ``'Hola desde transform'``.
    """

    def __init__(self, message):
        self.message = message

    def fit(self, X, y=None):
        """Print the fit trace line, set ``param1`` and return ``self``."""
        # Use the configured message instead of a hard-coded prefix so the
        # constructor argument is no longer silently ignored.
        print(f'{self.message} desde fit')
        self.param1 = 'Hola'
        return self

    def transform(self, X, y=None):
        """Print the transform trace line and return ``X`` untouched."""
        print(f'{self.message} desde transform')
        return X
       
    

### Usar transformador personalizado

In [46]:

# Same pipeline as before, with a Debugger step in front of the
# preprocessing step.
pipeline = Pipeline(steps=[
    ('Debugger1', Debugger('Hola')),
    ('prepocessor', prepocessor),
    ('linear_regression', LinearRegression()),
])
pipeline


### Transformador personalizado OutliersRemover

In [47]:

from sklearn.base import BaseEstimator, TransformerMixin

class OutliersRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that fall outside per-column IQR fences.

    ``fit`` learns the 25th and 75th percentiles of every column. ``transform``
    keeps only the rows whose values all lie within
    ``[Q1 - factor * IQR, Q3 + factor * IQR]``.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Notes
    -----
    NOTE(review): ``transform`` changes the number of rows. scikit-learn's
    ``Pipeline`` does not filter ``y`` alongside ``X``, so using this as a step
    ahead of a supervised estimator (or inside a ``ColumnTransformer``) will
    likely fail on mismatched lengths -- confirm the intended usage.
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def fit(self, X, y=None):
        """Learn per-column quartiles and the IQR from ``X``; return ``self``."""
        self.Q1 = np.percentile(X, 25, axis=0)
        self.Q3 = np.percentile(X, 75, axis=0)
        self.IQR = self.Q3 - self.Q1
        return self

    def transform(self, X, y=None):
        """Return the rows of ``X`` whose values all lie inside the fences."""
        inferior_limit = self.Q1 - self.factor * self.IQR
        superior_limit = self.Q3 + self.factor * self.IQR

        values = np.asarray(X)
        in_range = (values >= inferior_limit) & (values <= superior_limit)
        # Collapse the element-wise mask to one flag per row. Indexing with
        # the 2-D mask would flatten an ndarray (or NaN-mask a DataFrame)
        # instead of removing whole rows.
        rows_to_keep = in_range.all(axis=1) if in_range.ndim > 1 else in_range

        # Boolean indexing already yields a new object, so no extra copy.
        X_copy = X[rows_to_keep]

        print(f'X.shape: {X.shape}')
        print(f'X_copy.shape: {X_copy.shape}')
        return X_copy
    
    

In [None]:
# Variant of the numeric pipeline: median imputation followed by outlier
# removal, with no scaling step. This rebinds the notebook-level names
# `pipeline_numeric` and `numeric_col_names`.
pipeline_numeric = Pipeline(steps=[
    ('impute_median', SimpleImputer(strategy='median')),
    ('outliers_remove', OutliersRemover()),
])

# Names of the numeric columns of X; the last line displays them.
numeric_col_names = X.select_dtypes(include=np.number).columns.tolist()
numeric_col_names