## Análisis de datos - Clase 5

### Técnicas avanzadas para el tratamiento de outliers

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
# Load the Titanic dataset shipped with seaborn, keep only the two columns
# under study and drop any row where either of them is missing.
titanic_full = sns.load_dataset("titanic")
titanic = titanic_full[['age', 'fare']].dropna()

# 80%/20% split with a fixed seed (no target column is defined here)
split_parts = train_test_split(titanic, test_size=0.2, random_state=42)
df, test_df = split_parts

print(df.shape, test_df.shape)

(571, 2) (143, 2)


In [3]:
# Preview the first ten rows of the training split
df.head(n=10)

Unnamed: 0,age,fare
328,31.0,20.525
73,26.0,14.4542
253,30.0,16.1
719,33.0,7.775
666,25.0,13.0
30,40.0,27.7208
287,22.0,7.8958
217,42.0,27.0
797,31.0,8.6833
371,18.0,6.4958


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training split
df.describe()

Unnamed: 0,age,fare
count,571.0,571.0
mean,30.016935,35.07856
std,14.728887,49.575809
min,0.42,0.0
25%,21.0,8.05
50%,28.5,15.75
75%,39.0,34.375
max,80.0,512.3292


### Detectamos outliers en base al rango intercuartil

Un dato $x$ se considera outlier si $x < Q_1 - 1.5 \cdot IQR$ o $x > Q_3 + 1.5 \cdot IQR$, donde $IQR = Q_3 - Q_1$.

In [5]:
# IQR rule for outlier detection: take the per-column quartiles, place the
# fences 1.5 * IQR beyond them, and flag every cell that falls outside.
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean frame with the same shape as df: True marks an outlier cell
outliers_iqr = df.lt(lower_fence) | df.gt(upper_fence)

# Report the thresholds used for each column
print(f"Outliers\nAge: <{lower_fence['age']:.02f} o >{upper_fence['age']:.02f}")
print(f"Fare: <{lower_fence['fare']:.02f} o >{upper_fence['fare']:.02f}")

Outliers
Age: <-6.00 o >66.00
Fare: <-31.44 o >73.86


### Estrategias de eliminación y tratamiento de outliers

#### 3. Imputación

#### Imputación con KNN

In [6]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [7]:
# New frame where every cell flagged as an outlier is replaced by NaN;
# df itself is left untouched.
df_nan = df.mask(outliers_iqr)

In [8]:
# KNNImputer prerequisite: scaling.
# Fit the scaler on the NaN-marked frame, then transform that same frame.
scaler = MinMaxScaler().fit(df_nan)
titanic_2_escalado = scaler.transform(df_nan)

In [9]:
# Set up the imputer with 5 neighbours
imputer = KNNImputer(n_neighbors=5)

# Run the imputation on the scaled values. The result is rewrapped in a
# DataFrame with explicit index/columns; without the index argument the
# original row labels would be lost.
scaled_imputed_values = imputer.fit_transform(titanic_2_escalado)
df_knn_imputed = pd.DataFrame(
    scaled_imputed_values,
    index=df_nan.index,
    columns=df.columns,
)

# Undo the scaling so the imputed values are back in the original units
unscaled_imputed_values = scaler.inverse_transform(df_knn_imputed)
titanic_imputado_knn = pd.DataFrame(
    unscaled_imputed_values,
    index=df.index,
    columns=df.columns,
)

print(f"Dataset imputado con KNN (postprocesado):\n {titanic_imputado_knn.head(10)}")

# Summary statistics after imputation (cell output)
titanic_imputado_knn.describe()

Dataset imputado con KNN (postprocesado):
       age     fare
328  31.0  20.5250
73   26.0  14.4542
253  30.0  16.1000
719  33.0   7.7750
666  25.0  13.0000
30   40.0  27.7208
287  22.0   7.8958
217  42.0  27.0000
797  31.0   8.6833
371  18.0   6.4958


Unnamed: 0,age,fare
count,571.0,571.0
mean,29.510455,19.92071
std,13.968681,14.52336
min,0.42,0.0
25%,21.0,8.05
50%,28.0,14.5
75%,38.0,26.2875
max,66.0,73.5


In [13]:
# Side-by-side comparison of the original observations and each imputation stage
comparison_frames = [df, df_nan, df_knn_imputed, titanic_imputado_knn]
comparison_labels = [
    'age_original', 'fare_original',
    'age_nan', 'fare_nan',
    'age_imputado_scale', 'fare_imputado_scale',
    'age_imputado', 'fare_imputado',
]

df_compara_knn = pd.concat(comparison_frames, axis=1)
df_compara_knn.columns = comparison_labels

df_compara_knn.head(n=30)  # Fare imputations can be seen here

Unnamed: 0,age_original,fare_original,age_nan,fare_nan,age_imputado_scale,fare_imputado_scale,age_imputado,fare_imputado
328,31.0,20.525,31.0,20.525,0.466301,0.279252,31.0,20.525
73,26.0,14.4542,26.0,14.4542,0.390058,0.196656,26.0,14.4542
253,30.0,16.1,30.0,16.1,0.451052,0.219048,30.0,16.1
719,33.0,7.775,33.0,7.775,0.496798,0.105782,33.0,7.775
666,25.0,13.0,25.0,13.0,0.374809,0.176871,25.0,13.0
30,40.0,27.7208,40.0,27.7208,0.603538,0.377154,40.0,27.7208
287,22.0,7.8958,22.0,7.8958,0.329064,0.107426,22.0,7.8958
217,42.0,27.0,42.0,27.0,0.634035,0.367347,42.0,27.0
797,31.0,8.6833,31.0,8.6833,0.466301,0.11814,31.0,8.6833
371,18.0,6.4958,18.0,6.4958,0.26807,0.088378,18.0,6.4958


In [14]:
# Order by original age to inspect the Age imputations
df_compara_knn.sort_values(by='age_original')

Unnamed: 0,age_original,fare_original,age_nan,fare_nan,age_imputado_scale,fare_imputado_scale,age_imputado,fare_imputado
803,0.42,8.5167,0.42,8.5167,0.000000,0.115873,0.42,8.5167
755,0.67,14.5000,0.67,14.5000,0.003812,0.197279,0.67,14.5000
644,0.75,19.2583,0.75,19.2583,0.005032,0.262018,0.75,19.2583
469,0.75,19.2583,0.75,19.2583,0.005032,0.262018,0.75,19.2583
831,0.83,18.7500,0.83,18.7500,0.006252,0.255102,0.83,18.7500
...,...,...,...,...,...,...,...,...
116,70.50,7.7500,,7.7500,0.609637,0.105442,40.40,7.7500
493,71.00,49.5042,,49.5042,0.554742,0.673527,36.80,49.5042
96,71.00,34.6542,,34.6542,0.554742,0.471486,36.80,34.6542
851,74.00,7.7750,,7.7750,0.304666,0.105782,20.40,7.7750
