In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
# Toy training set: two numeric predictors (each with one missing value),
# one categorical predictor and a continuous response.
data_df = pd.DataFrame(
    dict(
        age=[25, 32, np.nan, 29, 45, 23],
        salary=[50000, 60000, 80000, 45000, np.nan, 30000],
        department=['HR', 'Finance', 'HR', 'Finance', 'HR', 'IT'],
        target=[3000, 4000, 5000, 3500, 4500, 2500],  # continuous target variable
    )
)

# Hold-out set with the same four columns; one salary is missing.
test_df = pd.DataFrame(
    dict(
        age=[30, 43],
        salary=[np.nan, 35000],
        department=['HR', 'IT'],
        target=[4500, 2500],
    )
)

In [None]:
# Preview the first five rows of the training frame.
data_df.head(n=5)

Unnamed: 0,age,salary,department,target
0,25.0,50000.0,HR,3000
1,32.0,60000.0,Finance,4000
2,,80000.0,HR,5000
3,29.0,45000.0,Finance,3500
4,45.0,,HR,4500


In [None]:
# Separate predictors from the response, for the training frame ...
X, y = data_df.drop(columns=['target']), data_df['target']

# ... and for the hold-out frame.
X_test, y_test = test_df.drop(columns=['target']), test_df['target']

In [None]:
# Shapes of the predictor matrix and the response vector, one per line.
print(X.shape, y.shape, sep='\n')

(6, 3)
(6,)


# Pipeline

In [None]:
# Collect the names of all numeric predictors.
# 'number' matches every numeric dtype regardless of width. The previous
# ['int', 'float'] filter resolves 'int' to the platform's default integer,
# which is 32-bit on some platforms/NumPy versions and would then skip
# int64 columns silently.
variables_numericas = X.select_dtypes(include='number').columns.tolist()
print(variables_numericas)

['age', 'salary']


In [None]:
# Names of the object-typed (categorical) predictors.
variables_categoricas = list(X.select_dtypes(include='object').columns)
print(variables_categoricas)

['department']


## Pipeline de preprocesamiento

### Variables numéricas

In [None]:
from sklearn.pipeline import Pipeline

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_preprocessor = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

#### Ejemplo de transformación en data de entrenamiento

In [None]:
# Keep only the numeric predictors of the training set.
X_numeric = X.loc[:, variables_numericas]

In [None]:
# Fit the numeric pipeline (imputer + scaler) on the training columns and
# transform them in the same call; the fitted pipeline is reused on test data.
X_numeric_transf = numeric_preprocessor.fit_transform(X_numeric)
# Show the transformed training array.
X_numeric_transf

array([[-0.81915239, -0.19781414],
       [ 0.1694798 ,  0.46156633],
       [ 0.        ,  1.78032728],
       [-0.25421971, -0.52750438],
       [ 2.00551102,  0.        ],
       [-1.10161873, -1.51657509]])

#### Ejemplo en data de prueba

In [None]:
# Numeric columns of the hold-out frame.
X_test_numeric = test_df.loc[:, variables_numericas]
# transform only (no fit): reuse whatever numeric_preprocessor learned earlier.
X_test_numeric_transf = numeric_preprocessor.transform(X_test_numeric)

In [None]:
# Display the transformed numeric test features.
X_test_numeric_transf

array([[-0.11298654,  0.        ],
       [ 1.72304468, -1.18688485]])

### Variables categóricas

In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as an all-zero row at transform time instead of raising, so the pipeline
# keeps working on new data; output for known categories is unchanged.
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#### Ejemplo de transformación

In [None]:
# Categorical predictors of the training set.
X_train_categ = X.loc[:, variables_categoricas]

# Fit + transform, then densify the result with .toarray() for display.
X_train_cat_tran = (
    categorical_preprocessor
    .fit_transform(X_train_categ)
    .toarray()
)
X_train_cat_tran

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [None]:
# Categorical columns of the hold-out frame.
X_test_cat = test_df.loc[:, variables_categoricas]
# transform only (no fit), then densify for display.
X_test_cat_transf = (
    categorical_preprocessor
    .transform(X_test_cat)
    .toarray()
)
X_test_cat_transf

array([[0., 1., 0.],
       [0., 0., 1.]])

## Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Route each column group through its own sub-pipeline; the outputs are
# concatenated in the order listed (numeric block first, then one-hot block).
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric-transformer', numeric_preprocessor, variables_numericas),
        ('categ-transformer', categorical_preprocessor, variables_categoricas),
    ],
    remainder='drop',  # the default, spelled out: unlisted columns are discarded
)

In [None]:
# Fit the column transformer on the training predictors and transform them
# in the same call.
X_train_transformed = preprocessor.fit_transform(X)

In [None]:
# Display the transformed training matrix.
X_train_transformed

array([[-0.81915239, -0.19781414,  0.        ,  1.        ,  0.        ],
       [ 0.1694798 ,  0.46156633,  1.        ,  0.        ,  0.        ],
       [ 0.        ,  1.78032728,  0.        ,  1.        ,  0.        ],
       [-0.25421971, -0.52750438,  1.        ,  0.        ,  0.        ],
       [ 2.00551102,  0.        ,  0.        ,  1.        ,  0.        ],
       [-1.10161873, -1.51657509,  0.        ,  0.        ,  1.        ]])

In [None]:
# Apply the preprocessing learned from the training data to the test data.
# Use transform(), never fit_transform(), on test data: refitting here would
# recompute imputation values, scaling statistics and one-hot categories from
# the test set itself (data leakage) and can yield a different number of
# columns than the training matrix.
X_test_transformed = preprocessor.transform(X_test)
X_test_transformed

array([[-1.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  1.]])

# Pipeline de entrenamiento

In [None]:
# End-to-end estimator: preprocessing followed by ordinary least squares.
# Fitting the pipeline fits the preprocessor on the data passed to fit().
# Note: LinearRegression() fits an intercept by default; use
# LinearRegression(fit_intercept=False) to try the no-intercept variant.
model_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [None]:
# Fit the whole pipeline (preprocessor + regressor) on the raw training
# predictors and response.
model_pipeline.fit(X, y)

# Ejercicio

1. Investigar más sobre data leakage (fuga de datos, también llamada filtración de datos)