In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd

# Datos nulos o vacíos

In [3]:
# None y np.nan son reconocidos por pandas como valores nulos
string_data = pd.Series([None, 'artichoke', np.nan, 'advocado'])
string_data
string_data.isnull() # Máscara booleana ¿es nulo?

0         None
1    artichoke
2          NaN
3     advocado
dtype: object

0     True
1    False
2     True
3    False
dtype: bool

## Dropping

In [5]:
from numpy import nan as NA # Por practicidad

data = pd.Series([1, NA, 3.5, NA, 7])

# Son equivalentes. Eliminando elementos en un Series
data.dropna()
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# Eliminando elementos en un DataFrame
data = pd.DataFrame([[1., 6.5, 3.],[1., NA, NA],
                   [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna() # Elimina filas que tengan al menos un valor NaN

data
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [None]:
# Podemos especificar que se eliminen las filas cuyos valores sean todos NaN
data.dropna(how='all')

In [7]:
# También se puede eliminar columnas (axis=1)
data[4] = NA
data
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# Es posible especificar un rango de tolerancia para valores NaN
# (es decir, eliminar filas cuando tengan n o más valores NaN)
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA # Agregamos unos cuantos valores nulos...
df.iloc[:2, 2] = NA
df

# Eliminamos
df.dropna()
df.dropna(thresh=2)

Unnamed: 0,0,1,2
0,2.52663,,
1,-1.245554,,
2,0.479858,,-1.567112
3,1.062715,,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


Unnamed: 0,0,1,2
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


Unnamed: 0,0,1,2
2,0.479858,,-1.567112
3,1.062715,,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


## Filling

In [9]:
# Rellenar valores nulos en vez de eliminarlos
df.fillna(0)

Unnamed: 0,0,1,2
0,2.52663,0.0,0.0
1,-1.245554,0.0,0.0
2,0.479858,0.0,-1.567112
3,1.062715,0.0,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


In [10]:
# Valores de relleno específicos para cada columna
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,2.52663,0.5,0.0
1,-1.245554,0.5,0.0
2,0.479858,0.5,-1.567112
3,1.062715,0.5,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


In [11]:
# fillna siempre regresa un obj nuevo. Hagamos modificaciones al obj original
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,2.52663,0.0,0.0
1,-1.245554,0.0,0.0
2,0.479858,0.0,-1.567112
3,1.062715,0.0,-0.227244
4,-0.038111,0.064797,0.005509
5,-0.140743,-1.636005,-0.500649
6,1.084303,-0.694526,-0.967199


In [13]:
# Especificar métodos de relleno
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

df.fillna(method='ffill') # Copia el anterior valor no nulo a los valores nulos
df.fillna(method='ffill', limit=2) # Lo mismo, pero sólo a los primeros dos valores no nulos

Unnamed: 0,0,1,2
0,0.81288,-1.162239,1.877579
1,-0.010444,-0.800268,-1.278524
2,0.687791,,-1.103445
3,-2.164964,,-1.189242
4,1.997352,,
5,1.554005,,


Unnamed: 0,0,1,2
0,0.81288,-1.162239,1.877579
1,-0.010444,-0.800268,-1.278524
2,0.687791,-0.800268,-1.103445
3,-2.164964,-0.800268,-1.189242
4,1.997352,-0.800268,-1.189242
5,1.554005,-0.800268,-1.189242


Unnamed: 0,0,1,2
0,0.81288,-1.162239,1.877579
1,-0.010444,-0.800268,-1.278524
2,0.687791,-0.800268,-1.103445
3,-2.164964,-0.800268,-1.189242
4,1.997352,,-1.189242
5,1.554005,,-1.189242


In [14]:
# Importante: Rellenar de forma creativa (con la media, mediana, moda, etc)
data = pd.Series([1. ,NA, 3.5, NA, 7])
data.fillna(data.mean()) # Con la media/promedio

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# Transformaciones

## Removiendo duplicados

## Mapeo o aplicación de funciones

## Reemplazo

## Renombrando índices de ejes

# Lidiando con atributos categóricos

## One Hot Encoding