In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one; the cells below rely on this to display several results at once.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd

# Datos nulos o vacíos

In [3]:
# pandas recognises both None and np.nan as missing values
string_data = pd.Series([None, 'artichoke', np.nan, 'advocado'])
string_data
# Boolean mask answering "is this entry missing?"
string_data.isna()

0         None
1    artichoke
2          NaN
3     advocado
dtype: object

0     True
1    False
2     True
3    False
dtype: bool

## Dropping

In [5]:
from numpy import nan as NA  # short alias for NaN, reused by the cells below

data = pd.Series([1, NA, 3.5, NA, 7])

# Two equivalent ways of removing the missing entries of a Series:
data.dropna()             # the dedicated method
data.loc[data.notnull()]  # boolean indexing with the "not null" mask

0    1.0
2    3.5
4    7.0
dtype: float64

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# Dropping rows of a DataFrame
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
# how='any' (the default): a row is discarded as soon as it holds one NaN
cleaned = data.dropna(how='any')

data
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [None]:
# how='all' restricts the drop to rows whose values are all NaN
data.dropna(axis=0, how='all')

In [7]:
# Columns can be dropped as well (axis=1, spelled 'columns' here)
data[4] = NA  # new column 4 holding only NaN
data
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# A tolerance for NaN values can be set with `thresh`: a row is kept only
# when it has at least `thresh` non-NaN values.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # inject a few missing values...
df.iloc[:2, 2] = NA
df

# Drop rows
df.dropna(how='any')  # any NaN discards the row
df.dropna(thresh=2)   # rows with fewer than 2 valid values are discarded

Unnamed: 0,0,1,2
0,2.52663,,
1,-1.245554,,
2,0.479858,,-1.567112
3,1.062715,,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


Unnamed: 0,0,1,2
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


Unnamed: 0,0,1,2
2,0.479858,,-1.567112
3,1.062715,,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


## Filling

In [9]:
# Fill the missing values instead of dropping them
df.fillna(value=0)

Unnamed: 0,0,1,2
0,2.52663,0.0,0.0
1,-1.245554,0.0,0.0
2,0.479858,0.0,-1.567112
3,1.062715,0.0,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


In [10]:
# A dict gives each column its own fill value (column label -> value)
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,2.52663,0.5,0.0
1,-1.245554,0.5,0.0
2,0.479858,0.5,-1.567112
3,1.062715,0.5,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


In [11]:
# fillna normally returns a new object; with inplace=True the original
# object is modified instead.
# The call's result is bound to `_` -- presumably to keep it out of the
# cell output, since every expression is displayed in this notebook.
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,2.52663,0.0,0.0
1,-1.245554,0.0,0.0
2,0.479858,0.0,-1.567112
3,1.062715,0.0,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


In [13]:
# Fill methods: propagate existing values instead of using a constant
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

# Forward fill: every NaN takes the last non-null value above it.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated
# in recent pandas releases.
df.ffill()
# Same, but at most 2 consecutive NaN values are filled per gap; the rest
# of a longer gap stays NaN.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.81288,-1.162239,1.877579
1,-0.010444,-0.800268,-1.278524
2,0.687791,,-1.103445
3,-2.164964,,-1.189242
4,1.997352,,
5,1.554005,,


Unnamed: 0,0,1,2
0,0.81288,-1.162239,1.877579
1,-0.010444,-0.800268,-1.278524
2,0.687791,-0.800268,-1.103445
3,-2.164964,-0.800268,-1.189242
4,1.997352,-0.800268,-1.189242
5,1.554005,-0.800268,-1.189242


Unnamed: 0,0,1,2
0,0.81288,-1.162239,1.877579
1,-0.010444,-0.800268,-1.278524
2,0.687791,-0.800268,-1.103445
3,-2.164964,-0.800268,-1.189242
4,1.997352,,-1.189242
5,1.554005,,-1.189242


In [14]:
# Important: fill values can be derived from the data (mean, median, mode, ...)
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(value=data.mean())  # here: the mean of the non-missing values

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# Transformaciones

## Removiendo duplicados

In [18]:
# Small frame whose last row repeats the one before it
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [19]:
# Helpers for dealing with duplicated rows
data.duplicated(keep='first')       # flags rows that repeat an earlier row
data.drop_duplicates(keep='first')  # keeps only the first occurrence of each row

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [20]:
# De-duplicate looking at a single column only
data['v1'] = range(7)
data
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


## Mapeo o aplicación de funciones

In [21]:
# The values of one column will decide which values go into a new column
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [22]:
# Conversion table: lower-cased food name -> animal it comes from
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [23]:
# Normalise the input: lower-case the 'food' column so that values such as
# 'Bacon' and 'Pastrami' match the lower-case keys of meat_to_animal
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [25]:
# Apply the mapping and store the result in the new column 'animal'
# (Series.map looks each lower-cased food name up in meat_to_animal)
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [26]:
# The same result in a single line, by mapping a function instead of a dict
# (the resulting Series would then be assigned to a column)
data['food'].map(lambda food: meat_to_animal[food.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## Reemplazo

In [28]:
# Series holding -999 and -1000 among ordinary values
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [30]:
# Plain replacement (-999 is considered an outlier here)
data.replace(to_replace=-999, value=np.nan)
# Several values replaced by the same substitute
data.replace(to_replace=[-999, -1000], value=np.nan)
# A different substitute for each replaced value
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])  # paired lists
data.replace(to_replace={-999: np.nan, -1000: 0})          # dict: old -> new

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## Detectando y Filtrando Outliers

In [40]:
# 1000 x 4 frame of standard-normal draws, followed by its summary statistics
data = pd.DataFrame(data=np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.01864,0.007664,0.03075,0.015872
std,0.987666,1.006687,0.968746,0.967084
min,-3.391541,-3.05774,-2.967053,-3.42367
25%,-0.623282,-0.67501,-0.631652,-0.671377
50%,0.012595,0.006456,0.051739,0.034311
75%,0.71032,0.679429,0.701487,0.667633
max,3.367593,3.336998,3.181069,2.705749


In [47]:
# Select the rows that have at least one value outside [-3, 3]

np.abs(data) > 3  # cell-wise mask: True where the value is out of range
# Row-wise mask: True when any cell of the row is out of range.
# The axis is passed by keyword because pandas 2.x no longer accepts it
# positionally (`.any(1)` raises a TypeError there).
(np.abs(data) > 3).any(axis=1)
data[(np.abs(data) > 3).any(axis=1)]  # keep only those rows

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,True,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


0      False
1      False
2      False
3      False
4      False
       ...  
995     True
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

Unnamed: 0,0,1,2,3
10,3.316858,0.887786,1.159457,0.661911
39,-3.391541,-2.671317,-0.903599,0.351872
295,0.7159,0.569964,-2.198252,-3.164736
362,-2.382441,-0.727567,3.181069,0.975532
416,1.957634,3.336998,-1.546276,0.201048
620,0.00255,-3.05774,0.545851,-0.882427
705,3.367593,-1.178893,-0.241417,1.171733
808,-0.536735,-0.383568,1.802397,-3.42367
995,3.051939,-0.374209,0.012025,-0.487104


In [48]:
# Cap the data to the range [-3, 3] to soften the effect of the outliers.
# np.sign yields -1 / 0 / +1 per cell, so 3 * sign is the bound on the side
# of the original value.
data[np.abs(data) > 3] = 3 * np.sign(data)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.018295,0.007385,0.030569,0.01646
std,0.984038,1.005453,0.968173,0.965138
min,-3.0,-3.0,-2.967053,-3.0
25%,-0.623282,-0.67501,-0.631652,-0.671377
50%,0.012595,0.006456,0.051739,0.034311
75%,0.71032,0.679429,0.701487,0.667633
max,3.0,3.0,3.0,2.705749


# Lidiando con atributos categóricos

## One Hot Encoding