### Análisis de datos - Clase 6

#### Manejo de problemas de drift al codificar y normalizar

In [1091]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

1. Cargamos el dataset de pingüinos

In [1092]:
# Load the penguins dataset and drop every row with a missing value
# to keep the example simple (the original row labels are kept as-is).
pinguinos_df = sns.load_dataset('penguins').dropna()

pinguinos_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [1093]:
# Inspect column dtypes and non-null counts of the cleaned dataset
pinguinos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


In [1094]:
# Count how many rows belong to each island
pinguinos_df['island'].value_counts()

island
Biscoe       163
Dream        123
Torgersen     47
Name: count, dtype: int64

2. Supongamos que queremos predecir la especie (target) a partir de sus características físicas. 

Para el ejemplo, voy a armar un dataset de test que tiene una categoría de isla que no está en train.

* Me quedo con las islas Biscoe y Dream solamente.
* Separo en train y test.
* A test le agrego 3 filas de la isla Torgersen, que no están en train.



In [1096]:
# Columns that are stored as pandas categoricals in both subsets
tipos_categoricos = {columna: 'category' for columna in ('species', 'island', 'sex')}

# Boolean mask marking the rows that belong to Torgersen island
es_torgersen = pinguinos_df['island'] == 'Torgersen'

# Main dataset: every island except Torgersen (astype returns a new frame)
pinguinos_sin_torgersen = pinguinos_df[~es_torgersen].astype(tipos_categoricos)

# Keep the first 3 Torgersen rows aside; they will only be added to test
datos_torgersen = pinguinos_df[es_torgersen].head(3).astype(tipos_categoricos)

# Target and features of the held-out Torgersen rows
y_torgersen = datos_torgersen['species'].copy()
X_torgersen = datos_torgersen.drop(columns=['species'])

# Target and features of the main dataset
y = pinguinos_sin_torgersen['species'].copy()
X = pinguinos_sin_torgersen.drop(columns=['species'])

# Train/test split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Append the Torgersen rows to test so it holds an island never seen in train.
# NOTE(review): the two parts carry different category sets (e.g. 'island'),
# so pandas presumably falls back to object dtype for those columns after
# concat - check X_test.dtypes if the categorical dtype matters downstream.
X_test = pd.concat([X_test, X_torgersen], ignore_index=True)
y_test = pd.concat([y_test, y_torgersen], ignore_index=True)


In [1097]:
# The last rows of test should be the Torgersen rows appended in the previous cell
X_test.tail()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
56,Biscoe,48.7,14.1,210.0,4450.0,Female
57,Dream,49.7,18.6,195.0,3600.0,Male
58,Torgersen,39.1,18.7,181.0,3750.0,Male
59,Torgersen,39.5,17.4,186.0,3800.0,Female
60,Torgersen,40.3,18.0,195.0,3250.0,Female


In [1098]:
# Compare the island values present in train against those present in test
print(f"Categorías en island en el dataset de Train:{X_train['island'].unique().tolist()}")    
print(f"Categorías en island en el dataset de Test:{X_test['island'].unique().tolist()}")

Categorías en island en el dataset de Train:['Dream', 'Biscoe']
Categorías en island en el dataset de Test:['Dream', 'Biscoe', 'Torgersen']


In [1099]:
# List the feature columns available in train
X_train.columns

Index(['island', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm',
       'body_mass_g', 'sex'],
      dtype='object')

In [1100]:
# Column groups: the categorical features are the ones passed to the encoder,
# the numeric features are the ones passed to the scalers.
categoricas = [
    'island',
    'sex',
]
numericas = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

In [1101]:

# One-hot encoder that returns a dense array instead of a sparse matrix
ohe = OneHotEncoder(sparse_output=False)

# Learn the categories from train and encode train in a single step
encoded_categoricas_train_array = ohe.fit_transform(X=X_train[categoricas])

# Names of the columns generated by the encoder
columnas_ohe_train = ohe.get_feature_names_out(categoricas)

# Wrap the encoded array in a DataFrame (it gets a fresh 0..n-1 index)
encoded_categoricas_train = pd.DataFrame(
    data=encoded_categoricas_train_array,
    columns=columnas_ohe_train,
)

# Numeric features re-indexed from 0 so they line up row by row with the encoded frame
numericas_train = X_train[numericas].reset_index(drop=True)

# Numeric columns first, followed by the one-hot columns
encoded_X_train = pd.concat([numericas_train, encoded_categoricas_train], axis=1)

encoded_X_train.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,sex_Female,sex_Male
0,49.2,18.2,195.0,4400.0,0.0,1.0,0.0,1.0
1,46.6,17.8,193.0,3800.0,0.0,1.0,1.0,0.0
2,41.0,20.0,203.0,4725.0,1.0,0.0,0.0,1.0
3,50.0,15.9,224.0,5350.0,1.0,0.0,0.0,1.0
4,58.0,17.8,181.0,3700.0,0.0,1.0,1.0,0.0


Ahora codificamos el dataset de test:

In [None]:
# Encode test with the encoder fitted on train.
# The ValueError is caught on purpose: the failure itself is the demonstration,
# so the message is printed instead of stopping the notebook.
try:
    encoded_categoricas_test_array = ohe.transform(X=X_test[categoricas]) # Careful: transform only, never fit on test!
except ValueError as e:
    print(f"Se rompió la codificación en test!\nDescripción del error: {e}")


Se rompió la codificación en test!
Descripción del error: Found unknown categories ['Torgersen'] in column 0 during transform


---
#### Posible solución - ignorar categorías desconocidas
---

In [1103]:
# Same encoder as before, now told to ignore categories it did not see during fit
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from train and encode train in a single step
encoded_categoricas_train_array = ohe.fit_transform(X=X_train[categoricas])

# Names of the columns generated by the encoder
columnas_ohe_train = ohe.get_feature_names_out(categoricas)

# Wrap the encoded array in a DataFrame (it gets a fresh 0..n-1 index)
encoded_categoricas_train = pd.DataFrame(
    data=encoded_categoricas_train_array,
    columns=columnas_ohe_train,
)

# Numeric features re-indexed from 0 so they line up row by row with the encoded frame
numericas_train = X_train[numericas].reset_index(drop=True)

# Numeric columns first, followed by the one-hot columns
encoded_X_train = pd.concat([numericas_train, encoded_categoricas_train], axis=1)

encoded_X_train.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,sex_Female,sex_Male
0,49.2,18.2,195.0,4400.0,0.0,1.0,0.0,1.0
1,46.6,17.8,193.0,3800.0,0.0,1.0,1.0,0.0
2,41.0,20.0,203.0,4725.0,1.0,0.0,0.0,1.0
3,50.0,15.9,224.0,5350.0,1.0,0.0,0.0,1.0
4,58.0,17.8,181.0,3700.0,0.0,1.0,1.0,0.0


In [None]:
# Encode test with the encoder fitted on train; the try/except mirrors the
# earlier failing attempt so both outcomes can be compared.
try:
    encoded_categoricas_test_array = ohe.transform(X=X_test[categoricas]) # Careful: transform only, never fit on test!
    print("Codificación exitosa!")
except ValueError as e:
    print(f"Se rompió la codificación en test!\nDescripción del error: {e}")


Codificación exitosa!


Reconstruyamos el dataset de test para ver qué hizo:

In [1105]:
# Column names produced by the encoder (the same ones learned from train)
columnas_ohe_test = ohe.get_feature_names_out(categoricas)

# Wrap the encoded test array in a DataFrame (it gets a fresh 0..n-1 index)
encoded_categoricas_test = pd.DataFrame(
    data=encoded_categoricas_test_array,
    columns=columnas_ohe_test,
)

# Numeric features re-indexed from 0 so they line up row by row with the encoded frame
numericas_test = X_test[numericas].reset_index(drop=True)

# Numeric columns first, followed by the one-hot columns
encoded_X_test = pd.concat([numericas_test, encoded_categoricas_test], axis=1)

encoded_X_test.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,sex_Female,sex_Male
0,38.3,19.2,189.0,3950.0,0.0,1.0,0.0,1.0
1,46.4,15.6,221.0,5000.0,1.0,0.0,0.0,1.0
2,47.5,15.0,218.0,4950.0,1.0,0.0,1.0,0.0
3,47.3,15.3,222.0,5250.0,1.0,0.0,0.0,1.0
4,38.9,18.8,190.0,3600.0,0.0,1.0,1.0,0.0


In [1106]:
# Look at the last rows, where the Torgersen rows were appended
encoded_X_test.tail()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,sex_Female,sex_Male
56,48.7,14.1,210.0,4450.0,1.0,0.0,1.0,0.0
57,49.7,18.6,195.0,3600.0,0.0,1.0,0.0,1.0
58,39.1,18.7,181.0,3750.0,0.0,0.0,0.0,1.0
59,39.5,17.4,186.0,3800.0,0.0,0.0,1.0,0.0
60,40.3,18.0,195.0,3250.0,0.0,0.0,1.0,0.0


Vemos que en las últimas 3 filas, que corresponden a la isla Torgersen, las columnas island_Biscoe e island_Dream están en cero y no hay una columna para Torgersen (se ignora la categoría desconocida).


---
### Problemas con el escalamiento
---

In [1112]:
# Summary statistics of train, to see the min and max of the numeric variables
X_train.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,228.0,228.0,228.0,228.0
mean,44.838596,16.984649,202.77193,4307.565789
std,5.478392,1.968873,14.647419,837.272091
min,32.1,13.1,172.0,2850.0
25%,40.575,15.3,190.0,3593.75
50%,45.45,17.1,200.5,4150.0
75%,49.425,18.6,215.0,4956.25
max,59.6,21.2,231.0,6300.0


Para el ejemplo, vamos a modificar el valor de la variable body_mass_g (primera fila del dataset del test), para que sea mayor que 6300.

In [1113]:
# Summary statistics of test, to compare its ranges against train
X_test.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,61.0,61.0,61.0,61.0
mean,44.434426,16.890164,200.819672,4185.655738
std,4.870417,1.989364,13.207962,752.917368
min,35.0,13.2,178.0,2700.0
25%,40.2,15.0,190.0,3650.0
50%,46.1,17.3,197.0,4050.0
75%,48.2,18.7,211.0,4625.0
max,52.7,20.3,228.0,5650.0


Vemos que el mínimo de body_mass_g en test (2700) es menor que el de train (2850). Para el ejemplo, ahora voy a modificar también el máximo: cambio el valor de body_mass_g de la primera fila de test para que supere el máximo de train (6300).

In [1114]:
# First row of test before modifying it
X_test.head(1)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Dream,38.3,19.2,189.0,3950.0,Male


In [1115]:
# Inject an artificial out-of-range value: 10000 is above the train maximum of body_mass_g (6300).
# NOTE: this modifies X_test in place, so outputs of X_test shown before this cell no longer match it.
X_test.loc[0, 'body_mass_g'] = 10000

In [1116]:
# First row of test again, to confirm the new body_mass_g value
X_test.head(1)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Dream,38.3,19.2,189.0,10000.0,Male


In [1117]:
# Summary statistics of test after the change; the body_mass_g max should now be 10000
X_test.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,61.0,61.0,61.0,61.0
mean,44.434426,16.890164,200.819672,4284.836066
std,4.870417,1.989364,13.207962,1058.017788
min,35.0,13.2,178.0,2700.0
25%,40.2,15.0,190.0,3650.0
50%,46.1,17.3,197.0,4100.0
75%,48.2,18.7,211.0,4700.0
max,52.7,20.3,228.0,10000.0


In [1118]:
# Min-max scaler: fit and transform on train
scaler = MinMaxScaler()

train_escalado = scaler.fit_transform(X_train[numericas])
X_train_scaled = pd.DataFrame(data=train_escalado, columns=numericas)

X_train_scaled.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,228.0,228.0,228.0,228.0
mean,0.463222,0.479586,0.521558,0.422483
std,0.199214,0.243071,0.248261,0.242688
min,0.0,0.0,0.0,0.0
25%,0.308182,0.271605,0.305085,0.21558
50%,0.485455,0.493827,0.483051,0.376812
75%,0.63,0.679012,0.728814,0.610507
max,1.0,1.0,1.0,1.0


In [1119]:
# Only transform on test: the scaling parameters come from the fit on train
test_escalado = scaler.transform(X_test[numericas])
X_test_scaled = pd.DataFrame(data=test_escalado, columns=numericas)

X_test_scaled.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,61.0,61.0,61.0,61.0
mean,0.448525,0.467921,0.488469,0.415895
std,0.177106,0.2456,0.223864,0.306672
min,0.105455,0.012346,0.101695,-0.043478
25%,0.294545,0.234568,0.305085,0.231884
50%,0.509091,0.518519,0.423729,0.362319
75%,0.585455,0.691358,0.661017,0.536232
max,0.749091,0.888889,0.949153,2.072464


Ahora vemos que al escalar, quedaron dos valores fuera del rango [0,1] en body_mass_g.

#### Posible solución - usar `clip=True`

In [1120]:
# Min-max scaler created with clip=True
scaler = MinMaxScaler(clip=True)

# Fit + transform on train, transform only on test
train_escalado = scaler.fit_transform(X_train[numericas])
test_escalado = scaler.transform(X_test[numericas])

X_train_scaled = pd.DataFrame(data=train_escalado, columns=numericas)
X_test_scaled = pd.DataFrame(data=test_escalado, columns=numericas)

X_test_scaled.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,61.0,61.0,61.0,61.0
mean,0.448525,0.467921,0.488469,0.399026
std,0.177106,0.2456,0.223864,0.230377
min,0.105455,0.012346,0.101695,0.0
25%,0.294545,0.234568,0.305085,0.231884
50%,0.509091,0.518519,0.423729,0.362319
75%,0.585455,0.691358,0.661017,0.536232
max,0.749091,0.888889,0.949153,1.0


Ahora se observa que los valores que salían fuera del rango fueron recortados al máximo (1) y mínimo del rango (0)

Cuando hay drift o distinta distribución en test, también puede ocurrir que el efecto de escalar con StandardScaler no sea el esperado. La distribución resultante podría no tener media 0, no tener desviación estándar 1, o incluso una forma distinta a la normal. 

In [1121]:
# Standard scaler: fit + transform on train, transform only on test
std_scaler = StandardScaler()

train_estandarizado = std_scaler.fit_transform(X_train[numericas])
test_estandarizado = std_scaler.transform(X_test[numericas])

X_train_scaled = pd.DataFrame(data=train_estandarizado, columns=numericas)
X_test_scaled = pd.DataFrame(data=test_estandarizado, columns=numericas)

In [1122]:
# Summary statistics of the standardized train set, rounded to 3 decimals
X_train_scaled.describe().round(3)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,228.0,228.0,228.0,228.0
mean,-0.0,0.0,0.0,0.0
std,1.002,1.002,1.002,1.002
min,-2.33,-1.977,-2.105,-1.745
25%,-0.78,-0.858,-0.874,-0.854
50%,0.112,0.059,-0.155,-0.189
75%,0.839,0.822,0.837,0.776
max,2.7,2.146,1.931,2.385


In [1123]:
# Summary statistics of the standardized test set, to compare against train
X_test_scaled.describe().round(3)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,61.0,61.0,61.0,61.0
mean,-0.074,-0.048,-0.134,-0.027
std,0.891,1.013,0.904,1.266
min,-1.8,-1.926,-1.695,-1.924
25%,-0.849,-1.01,-0.874,-0.787
50%,0.231,0.161,-0.395,-0.248
75%,0.615,0.873,0.563,0.47
max,1.438,1.688,1.726,6.814


#### El tratamiento de outliers puede ayudar

In [1124]:
# Example: outliers are treated only for the 'body_mass_g' variable.
# The limits are computed from train.
masa_train = X_train['body_mass_g']

Q1, Q3 = masa_train.quantile(0.25), masa_train.quantile(0.75)
IQR = Q3 - Q1

# Values further than 1.5 * IQR from the quartiles are treated as outliers
margen = 1.5 * IQR
limite_inferior = Q1 - margen
limite_superior = Q3 + margen

# Small helper to cap outliers
def corregir_outliers_iqr(columna, inferior, superior):
    """Cap the values of ``columna`` to the range [inferior, superior].

    The bounds are used exactly as received; this function does not
    compute the IQR itself.

    Parameters
    ----------
    columna : object exposing ``.clip(lower=..., upper=...)``
        The values to cap (called with a DataFrame column in this notebook).
    inferior : lower bound passed to ``clip``.
    superior : upper bound passed to ``clip``.

    Returns
    -------
    Whatever ``columna.clip`` returns for those bounds.
    """
    limites = {'lower': inferior, 'upper': superior}
    return columna.clip(**limites)

# Cap outliers in train and test (only for 'body_mass_g'), using the same limits for both.
# assign returns a new frame, so X_train and X_test are left untouched.
X_train_sin_outliers = X_train.assign(
    body_mass_g=corregir_outliers_iqr(X_train['body_mass_g'], limite_inferior, limite_superior)
)

X_test_sin_outliers = X_test.assign(
    body_mass_g=corregir_outliers_iqr(X_test['body_mass_g'], limite_inferior, limite_superior)
)

In [1125]:
# Fresh standard scaler, applied again on the outlier-capped data
std_scaler = StandardScaler()

# Fit + transform on train, transform only on test
train_estandarizado = std_scaler.fit_transform(X_train_sin_outliers[numericas])
test_estandarizado = std_scaler.transform(X_test_sin_outliers[numericas])

X_train_sin_outliers_scaled = pd.DataFrame(data=train_estandarizado, columns=numericas)
X_test_sin_outliers_scaled = pd.DataFrame(data=test_estandarizado, columns=numericas)

In [1126]:
# Summary statistics of the standardized test set after capping body_mass_g
X_test_sin_outliers_scaled.describe().round(3)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,61.0,61.0,61.0,61.0
mean,-0.074,-0.048,-0.134,-0.086
std,0.891,1.013,0.904,0.998
min,-1.8,-1.926,-1.695,-1.924
25%,-0.849,-1.01,-0.874,-0.787
50%,0.231,0.161,-0.395,-0.248
75%,0.615,0.873,0.563,0.47
max,1.438,1.688,1.726,3.223
