## Análisis de datos - Clase 5

### Técnicas avanzadas para el tratamiento de outliers

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer

In [2]:
# Load the Titanic passenger dataset through seaborn's dataset loader
df_titanic = sns.load_dataset("titanic")

# Keep only the two numeric variables under study and drop every row
# in which either of them is missing
df = (
    df_titanic
    .loc[:, ['age', 'fare']]
    .dropna(how='any')
)
df.head(10)

Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05
6,54.0,51.8625
7,2.0,21.075
8,27.0,11.1333
9,14.0,30.0708
10,4.0,16.7


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of age and fare
# after the rows with missing values were dropped.
df.describe()

Unnamed: 0,age,fare
count,714.0,714.0
mean,29.699118,34.694514
std,14.526497,52.91893
min,0.42,0.0
25%,20.125,8.05
50%,28.0,15.7417
75%,38.0,33.375
max,80.0,512.3292


### Detectamos outliers con base en el rango intercuartílico (IQR)

Un dato se considera outlier si es < (Q1 - 1.5 * IQR) o > (Q3 + 1.5 * IQR), donde IQR = Q3 - Q1.

In [4]:
# Statistical outlier detection with the 1.5 * IQR rule.
# Quantiles are computed per column, so Q1, Q3 and IQR are indexed by column name.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)

IQR = Q3 - Q1

# Width of the fences, expressed in IQR units.
IQR_FACTOR = 1.5

# Compute each fence exactly once so that the boolean mask and the printed
# thresholds below can never drift apart.
lower_bound = Q1 - IQR_FACTOR * IQR
upper_bound = Q3 + IQR_FACTOR * IQR

# Boolean frame shaped like df: True where a value falls outside its column's fences.
outliers_iqr = (df < lower_bound) | (df > upper_bound)

print(f"Outliers\nAge: <{lower_bound['age']:.02f} o >{upper_bound['age']:.02f}")
print(f"Fare: <{lower_bound['fare']:.02f} o >{upper_bound['fare']:.02f}")

Outliers
Age: <-6.69 o >64.81
Fare: <-29.94 o >71.36


### Estrategias de eliminación y tratamiento de outliers

#### 3. Imputación

#### Imputación con KNN

In [5]:
# Replace every value flagged as an outlier with NaN so the imputer treats it as missing
df_nan = df.mask(outliers_iqr)

# Impute each missing value from its 5 nearest neighbours
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df_nan)

# Wrap the imputed values in a DataFrame, restoring column names and row labels
df_knn_imputed = pd.DataFrame(
    imputed_values,
    index=df_nan.index,  # without this, the original index is lost
    columns=df.columns,
)

df_knn_imputed.describe()

Unnamed: 0,age,fare
count,714.0,714.0
mean,29.168305,19.606931
std,13.683456,13.939995
min,0.42,0.0
25%,20.125,8.05
50%,28.0,14.4542
75%,38.0,26.2875
max,64.0,71.2833


In [6]:
# Place the original, masked (NaN) and imputed values side by side for comparison
comparison_columns = ['age_original', 'fare_original', 'age_nan', 'fare_nan', 'age_imputado', 'fare_imputado']

df_compara_knn = (
    pd.concat([df, df_nan, df_knn_imputed], axis='columns')
    .set_axis(comparison_columns, axis='columns')
)

df_compara_knn.head(30)  # Fare imputations can be seen in these rows

Unnamed: 0,age_original,fare_original,age_nan,fare_nan,age_imputado,fare_imputado
0,22.0,7.25,22.0,7.25,22.0,7.25
1,38.0,71.2833,38.0,71.2833,38.0,71.2833
2,26.0,7.925,26.0,7.925,26.0,7.925
3,35.0,53.1,35.0,53.1,35.0,53.1
4,35.0,8.05,35.0,8.05,35.0,8.05
6,54.0,51.8625,54.0,51.8625,54.0,51.8625
7,2.0,21.075,2.0,21.075,2.0,21.075
8,27.0,11.1333,27.0,11.1333,27.0,11.1333
9,14.0,30.0708,14.0,30.0708,14.0,30.0708
10,4.0,16.7,4.0,16.7,4.0,16.7


In [7]:
# Sort by the original age in ascending order: the rows whose age was masked
# (NaN in age_nan) end up at the bottom of the table, next to their imputed value.
df_compara_knn.sort_values('age_original')  # to inspect the Age imputations

Unnamed: 0,age_original,fare_original,age_nan,fare_nan,age_imputado,fare_imputado
803,0.42,8.5167,0.42,8.5167,0.42,8.5167
755,0.67,14.5000,0.67,14.5000,0.67,14.5000
644,0.75,19.2583,0.75,19.2583,0.75,19.2583
469,0.75,19.2583,0.75,19.2583,0.75,19.2583
78,0.83,29.0000,0.83,29.0000,0.83,29.0000
...,...,...,...,...,...,...
116,70.50,7.7500,,7.7500,32.20,7.7500
493,71.00,49.5042,,49.5042,36.80,49.5042
96,71.00,34.6542,,34.6542,28.20,34.6542
851,74.00,7.7750,,7.7750,24.60,7.7750
