# Chapter 4 - Building Good Training Datasets – Data Preprocessing

## Dealing with missing data



### Identifying missing values in tabular data

In [1]:
## Build a small example dataset that contains
## missing (null) values.

import pandas as pd
from io import StringIO
import sys

## Two cells are deliberately left empty: column C in the second data row
## and column D in the third one.
csv_data = (
    'A,B,C,D\n'
    '1.0,2.0,3.0,4.0\n'
    '5.0,6.0,,8.0\n'
    '10.0,11.0,12.0,'
)

## StringIO wraps the string in a file-like object so read_csv can parse it.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [2]:
## Count the missing values in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [3]:
## Access the NumPy array that backs the dataframe.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

### Eliminating training examples or features with missing values


In [4]:
## Drop every row that contains
## at least one missing value.

df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
## Drop every column that contains
## at least one missing value.

df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [6]:
## Drop a row only when every value
## in that row is missing (how='all');
## rows that still hold at least one value are kept.

csv_data = \
'''A,B,C,D
,,,
5.0,,,8.0
10.0,,12.0,'''


df1 = pd.read_csv(StringIO(csv_data))

## A notebook cell renders only its final expression, so the dropna result
## is the single trailing expression here.
df1.dropna(how='all')

Unnamed: 0,A,B,C,D
1,5.0,,,8.0
2,10.0,,12.0,


In [16]:
## Keep only the rows that have
## at least four non-missing values.

df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [22]:
## Drop a row only when its value
## in column C is missing; gaps in
## other columns are ignored.

df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


### Imputing missing values

In [28]:
from sklearn.impute import SimpleImputer
import numpy as np

## Mean imputation: fit the imputer on the data, then replace each NaN
## with the mean of its column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df.values)

imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [29]:
## Reproduce the result of the previous
## cell using pandas only.

df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0
