### Pré-processamento

In [1]:
import numpy as np
import pandas as pd

##### Importando a base

In [2]:
# Load the header-less, comma-separated file and discard the first column
# (label 0) in a single expression.
df = (
    pd.read_csv("data/breast-cancer-problemas.data", sep=',', header=None)
    .drop(columns=[0])
)

#### Simples exploração dos dados

In [3]:
# Show the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,-,2,3,1,1,2
3,6,8,8,1,3,4,3,-,1,2
4,4,1,1,3,2,1,3,1,1,2


## Iniciando Pré-processamento

### Transformando todos os '-' e '?' em NaN

In [4]:
# Both '-' and '?' are treated as missing-value placeholders: map them to NaN
# in one in-place pass.
df.replace(to_replace=['-', '?'], value=np.nan, inplace=True)

In [5]:
# Show the first five rows again to confirm the placeholders became NaN.
df.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2.0,1,3,1.0,1,2
1,5,4,4,5,7.0,10,3,2.0,1,2
2,3,1,1,1,,2,3,1.0,1,2
3,6,8,8,1,3.0,4,3,,1,2
4,4,1,1,3,2.0,1,3,1.0,1,2


### Transformando todos os atributos em números (a coluna de Ids já foi removida na importação)

In [6]:
# Convert every column to a numeric dtype; `apply` forwards `downcast` to
# `pd.to_numeric`, which shrinks columns to the smallest integer type when
# their values allow it.
df = df.apply(pd.to_numeric, downcast='integer')

In [7]:
# Per-column summary statistics. `count` excludes NaN, so differing counts
# across columns show which columns have missing values.
df.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
count,694.0,692.0,689.0,691.0,698.0,683.0,699.0,689.0,696.0,699.0
mean,4.412104,3.135838,3.224964,2.804631,3.217765,3.544656,3.437768,2.87373,1.591954,2.689557
std,2.812603,3.048387,2.984001,2.853075,2.215408,3.643857,2.438364,3.056164,1.718337,0.951273
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,3.5,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [8]:
# Inspect the dtype each column ended up with after the numeric conversion.
df.dtypes

1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7        int8
8     float64
9     float64
10       int8
dtype: object

### Tratando todos os valores ausentes (missing)

In [9]:
# Three alternative treatments of the missing values. Each one produces a new
# frame and leaves `df` untouched.

# Mean imputation: every NaN is replaced by the mean of its own column.
df_mean = df.fillna(df.mean())

# Mode imputation: every NaN is replaced by the most frequent value of its
# column. `df.mode()` returns one row per tied mode, so `.squeeze()` only
# collapses to a Series when no column has a tie; otherwise it stays a
# DataFrame and `fillna` aligns it by row label, leaving most NaNs unfilled.
# Taking the first row always yields exactly one fill value per column.
df_mode = df.fillna(df.mode().iloc[0])

# Listwise deletion: discard every row that contains at least one NaN.
df_dropna = df.dropna(axis='index', how='any')

## Salvando dataframes

In [10]:
# Write each variant as comma-separated text without a header row. `index` is
# left at its default, so the row index is written as the first column.
for frame, target in (
    (df_mean, "preprocessamento/data_mean.data"),
    (df_mode, "preprocessamento/data_mode.data"),
    (df_dropna, "preprocessamento/data_dropna.data"),
):
    frame.to_csv(target, sep=',', header=None)
