In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd

# Datos nulos o vacíos

In [3]:
# pandas treats both None and np.nan as missing values
string_data = pd.Series(
    [None, 'artichoke', np.nan, 'advocado'],
)
string_data
pd.isna(string_data)  # Boolean mask: is each element missing?

0         None
1    artichoke
2          NaN
3     advocado
dtype: object

0     True
1    False
2     True
3    False
dtype: bool

## Dropping

In [4]:
from numpy import nan as NA  # short alias, reused by the cells below

data = pd.Series([1.0, NA, 3.5, NA, 7.0])

# Two equivalent ways of dropping the missing elements of a Series
data.dropna()
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

0    1.0
2    3.5
4    7.0
dtype: float64

In [5]:
# Dropping missing values from a DataFrame
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
cleaned = data.dropna(how='any')  # drops every row holding at least one NaN

data
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [6]:
# Drop only the rows whose values are all NaN
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [7]:
# Columns can be dropped as well (axis=1)
data[4] = NA  # add a column made up entirely of NaN
data
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# dropna(thresh=n) keeps only the rows that hold at least n non-NaN values
# (thresh is a minimum count of valid values, not a maximum count of NaN).
rng = np.random.default_rng(0)  # fixed seed so the cell is reproducible
df = pd.DataFrame(rng.standard_normal((7, 3)))
df.iloc[:4, 1] = NA  # inject a few missing values...
df.iloc[:2, 2] = NA
df

# Drop rows
df.dropna()          # any NaN in the row -> row is dropped
df.dropna(thresh=2)  # rows with fewer than 2 non-NaN values are dropped

Unnamed: 0,0,1,2
0,0.223775,,
1,-1.539905,,
2,-0.465759,,0.7551
3,-2.029384,,1.653952
4,-0.635858,0.833078,-1.536475
5,0.111996,1.159778,-1.422962
6,0.091699,-0.071551,-1.469475


Unnamed: 0,0,1,2
4,-0.635858,0.833078,-1.536475
5,0.111996,1.159778,-1.422962
6,0.091699,-0.071551,-1.469475


Unnamed: 0,0,1,2
2,-0.465759,,0.7551
3,-2.029384,,1.653952
4,-0.635858,0.833078,-1.536475
5,0.111996,1.159778,-1.422962
6,0.091699,-0.071551,-1.469475


## Filling

In [9]:
# Fill the missing values instead of dropping them
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.223775,0.0,0.0
1,-1.539905,0.0,0.0
2,-0.465759,0.0,0.7551
3,-2.029384,0.0,1.653952
4,-0.635858,0.833078,-1.536475
5,0.111996,1.159778,-1.422962
6,0.091699,-0.071551,-1.469475


In [10]:
# A specific fill value for each column (keys are column labels)
fill_values = {1: 0.5, 2: 0}
df.fillna(value=fill_values)

Unnamed: 0,0,1,2
0,0.223775,0.5,0.0
1,-1.539905,0.5,0.0
2,-0.465759,0.5,0.7551
3,-2.029384,0.5,1.653952
4,-0.635858,0.833078,-1.536475
5,0.111996,1.159778,-1.422962
6,0.091699,-0.071551,-1.469475


In [11]:
# fillna returns a new object by default; with inplace=True it modifies df
# itself and returns None, so there is nothing to capture or display.
# (No `_ = ...` here: rebinding `_` would shadow IPython's last-output variable.)
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.223775,0.0,0.0
1,-1.539905,0.0,0.0
2,-0.465759,0.0,0.7551
3,-2.029384,0.0,1.653952
4,-0.635858,0.833078,-1.536475
5,0.111996,1.159778,-1.422962
6,0.091699,-0.071551,-1.469475


In [12]:
# Filling strategies: propagate existing values into the gaps
rng = np.random.default_rng(0)  # fixed seed so the cell is reproducible
df = pd.DataFrame(rng.standard_normal((6, 3)))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

# ffill() replaces the deprecated fillna(method='ffill') spelling
df.ffill()         # copy the last non-null value forward into every gap
df.ffill(limit=2)  # same, but fill at most 2 consecutive nulls per gap

Unnamed: 0,0,1,2
0,0.501,1.522149,1.285217
1,-0.237764,-0.541835,0.401977
2,-1.392302,,1.463418
3,-1.513325,,1.696089
4,2.166418,,
5,-2.428624,,


Unnamed: 0,0,1,2
0,0.501,1.522149,1.285217
1,-0.237764,-0.541835,0.401977
2,-1.392302,-0.541835,1.463418
3,-1.513325,-0.541835,1.696089
4,2.166418,-0.541835,1.696089
5,-2.428624,-0.541835,1.696089


Unnamed: 0,0,1,2
0,0.501,1.522149,1.285217
1,-0.237764,-0.541835,0.401977
2,-1.392302,-0.541835,1.463418
3,-1.513325,-0.541835,1.696089
4,2.166418,,1.696089
5,-2.428624,,1.696089


In [13]:
# Important: fill the gaps with a statistic of the data (mean, median, mode, ...)
data = pd.Series([1.0, NA, 3.5, NA, 7.0])
mean_value = data.mean()
data.fillna(mean_value)  # here: fill with the mean

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# Transformaciones

## Removiendo duplicados

In [14]:
# Small frame whose last row repeats the one before it
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [15]:
# Helpers for dealing with duplicate rows
data.duplicated(keep='first')       # flags whether each row is a duplicate
data.drop_duplicates(keep='first')  # removes the duplicated rows

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [16]:
# Drop duplicates looking at a single column only
data['v1'] = range(7)
data
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


## Mapeo o aplicación de funciones

In [17]:
# Use the values of one column to decide which values to add in a new one
foods = ['bacon', 'pulled pork', 'bacon',
         'Pastrami', 'corned beef', 'Bacon',
         'pastrami', 'honey ham', 'nova lox']
ounces = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
data = pd.DataFrame({'food': foods, 'ounces': ounces})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [18]:
# Lookup table: kind of meat (lowercase) -> animal it comes from
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [19]:
# Normalize the input: lowercase every food name
lowercased = data.loc[:, 'food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [20]:
# Apply the mapping and store the result in the new 'animal' column
animal_by_row = lowercased.map(meat_to_animal)
data['animal'] = animal_by_row
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [21]:
# The same mapping written as a single expression
# (the result could be assigned to a column)
data['food'].map(lambda food: meat_to_animal[food.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## Reemplazo

In [22]:
# Series containing the special values -999 and -1000
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [23]:
# Plain replacement (we treat -999 as an outlier)
data.replace(to_replace=-999, value=np.nan)
# Replacing several values at once
data.replace(to_replace=[-999, -1000], value=np.nan)
# A different replacement for each of the values being replaced
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])  # as parallel lists
data.replace(to_replace={-999: np.nan, -1000: 0})          # as a dict

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## Detectando y Filtrando Outliers

In [24]:
# 1000 x 4 draws from a standard normal distribution.
rng = np.random.default_rng(0)  # fixed seed so the cell is reproducible
data = pd.DataFrame(rng.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.030187,-0.04274,0.026153,0.021289
std,1.02825,0.945414,1.016455,1.00668
min,-2.872714,-2.951274,-3.494109,-3.33685
25%,-0.666329,-0.744675,-0.633581,-0.670821
50%,0.044648,-0.009294,0.012257,0.013161
75%,0.73725,0.615389,0.67273,0.699884
max,3.751839,2.856424,3.050829,3.234423


In [25]:
# Select the rows that hold at least one value outside [-3, 3]

outlier_cells = np.abs(data) > 3  # per-cell mask: is the value out of range?
outlier_cells
# axis must be passed by keyword: recent pandas rejects the positional any(1)
outlier_rows = outlier_cells.any(axis=1)  # per-row mask: any out-of-range value?
outlier_rows
data[outlier_rows]  # keep only those rows

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

Unnamed: 0,0,1,2,3
53,-1.214796,0.242562,-1.112239,-3.023339
172,-0.504874,-0.324471,-3.494109,-0.677867
470,0.11708,-0.895879,-3.192702,0.533929
478,-0.964123,-0.480623,0.709658,-3.178859
505,0.195308,-0.240443,0.56133,3.234423
527,3.127758,-0.061255,-1.505391,0.417659
560,-0.646045,-1.385536,3.050829,1.352743
657,-0.512358,0.477491,-0.455459,-3.33685
790,1.141617,-0.359831,-0.093765,-3.185131
884,3.751839,1.143279,-0.269726,-1.053186


In [26]:
# Cap the values to the range [-3, 3] to soften the effect of the outliers.
# clip maps values above 3 to 3 and values below -3 to -3, which is what the
# manual "sign(data) * 3 where |data| > 3" assignment did, without mutating in place.
data = data.clip(lower=-3, upper=3)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.029104,-0.04274,0.026789,0.021779
std,1.024809,0.945414,1.014117,1.003689
min,-2.872714,-2.951274,-3.0,-3.0
25%,-0.666329,-0.744675,-0.633581,-0.670821
50%,0.044648,-0.009294,0.012257,0.013161
75%,0.73725,0.615389,0.67273,0.699884
max,3.0,2.856424,3.0,3.0


# Lidiando con atributos categóricos

## One Hot Encoding

In [34]:
# Categories --> indicator matrix
df = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})
df
key_column = df['key']
pd.get_dummies(key_column)

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [35]:
# Build the indicator columns with a prefix (so they are easy to tell apart)
dummies = pd.get_dummies(df['key'], prefix='key')

# Attach the indicator matrix to the original data
original_part = df[['data1']]
df_with_dummy = original_part.join(dummies)

df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [36]:
# Let's play with a real dataset
mnames = ['movie_id', 'title', 'genres']

movies = pd.read_table(
    '../datasets/movielens/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
movies.iloc[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [37]:
# One-hot encode the movie genres
# 1) Collect the distinct categories found in the column
all_genres = [
    genre
    for genre_field in movies.genres
    for genre in genre_field.split('|')
]

# Wrap the list in a Series before taking the unique values: recent pandas
# deprecates calling pd.unique on a plain list. dtype=object keeps the result
# a NumPy object array, in order of first appearance.
genres = pd.Series(all_genres, dtype=object).unique()
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [38]:
# 2) Build an all-zeros matrix (one row per movie, one column per genre)
matrix_shape = (len(movies), len(genres))
zero_matrix = np.zeros(matrix_shape)

dummies = pd.DataFrame(data=zero_matrix, columns=genres)

In [39]:
# 3) Go through each movie and find its genres (tried here on the first one)
gen = movies['genres'][0]
gen.split(sep='|')

['Animation', "Children's", 'Comedy']

In [40]:
# (Aside) get_indexer returns the positions of the columns with the given names
genre_names = gen.split('|')
dummies.columns.get_indexer(genre_names)

array([0, 1, 2])

In [41]:
# 4) Put a 1 in the columns of the genres each movie belongs to
genre_columns = dummies.columns
for i, gen in enumerate(movies['genres']):
    indices = genre_columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [43]:
# 5) Combine the one-hot encoding with the original data
prefixed_dummies = dummies.add_prefix('Genre')  # prefix added to avoid confusion
movies_windic = movies.join(prefixed_dummies)

In [44]:
# Inspect the data for the first movie
first_movie = movies_windic.iloc[0]
first_movie

movie_id                                      1
title                          Toy Story (1995)
genres              Animation|Children's|Comedy
GenreAnimation                                1
GenreChildren's                               1
GenreComedy                                   1
GenreAdventure                                0
GenreFantasy                                  0
GenreRomance                                  0
GenreDrama                                    0
GenreAction                                   0
GenreCrime                                    0
GenreThriller                                 0
GenreHorror                                   0
GenreSci-Fi                                   0
GenreDocumentary                              0
GenreWar                                      0
GenreMusical                                  0
GenreMystery                                  0
GenreFilm-Noir                                0
GenreWestern                            

Nota: Las codificaciones One-Hot (OneHotEncoding) deberían crearse en NumPy para mejorar el rendimiento. Después pueden guardarse en un DataFrame.