<a href="https://colab.research.google.com/github/Dexduo/Imersao-em-Ciencia-de-Dados/blob/main/Notebooks/Pre_Processamento_de_Dados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Lidando com dados faltantes**

In [2]:
import pandas as pd

In [7]:
# Read the sample CSV, which has a few empty cells, into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/teste.CSV')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [8]:
# Count the missing values per column (summing the boolean mask down the rows).
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

## **Eliminando padrões ou colunas com valores faltantes**

In [12]:
# Drop every row that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [13]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [15]:
# Drop a ROW only when every value in it is missing. With the default axis
# (rows) no column is ever removed by this call; here no row is entirely
# empty, so the frame comes back unchanged.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [17]:
# Keep only the rows that have at least 4 non-missing values; rows with fewer
# are dropped.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [18]:
# Only look at column 'C': a row is dropped when its value in that column is
# missing, regardless of the other columns.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


## **Imputando valores faltantes**

In [20]:
from sklearn.impute import SimpleImputer
import numpy as np

In [23]:
# Mean imputation: fitting learns one mean per column, and transform replaces
# each NaN with the mean of its column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df.values)

# transform returns a plain NumPy array, not a DataFrame.
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [24]:
# pandas-only version of the same idea: fill each column's NaNs with that
# column's mean (df.mean() yields one value per column).
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


## **Lidando com dados categóricos**

In [44]:
# Load the categorical example data. Note that this rebinds `df`, replacing the
# missing-values frame used in the cells above.
df = pd.read_csv(filepath_or_buffer='/content/categorical.csv')
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [45]:
# Sizes are ordered (M < L < XL), so encode them as increasing integers.
size_mapping = dict(XL=3, L=2, M=1)
# Overwrites the 'size' column in place with the integer codes.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [46]:
# Swap keys and values to get the code -> size-label lookup.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
# Displayed only; df['size'] itself keeps the integer codes.
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

## **Codificando rótulos de classe**

In [47]:
import numpy as np

In [48]:
# Give each distinct class label an integer id, numbered in the sorted order
# that np.unique returns.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [49]:
# Replace the string class labels with their integer ids.
encoded_labels = df['classlabel'].map(class_mapping)
df['classlabel'] = encoded_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [54]:
# Invert the label -> id dictionary and use it to restore the original
# string labels in the 'classlabel' column.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [55]:
from sklearn.preprocessing import LabelEncoder

In [56]:
# Same encoding as the manual class_mapping, done with scikit-learn:
# fit learns the distinct labels, transform turns them into integer codes.
class_le = LabelEncoder().fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([1, 0, 1])

In [57]:
# Map the integer codes in y back to the original string class labels.
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

## **One-hot encoding**

In [58]:
# Show the current frame: 'size' holds integer codes, 'color' and 'classlabel'
# are still strings.
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [59]:
# Pull the feature columns out as a NumPy object array.
feature_columns = ['color', 'size', 'price']
X = df[feature_columns].values
# Replace the color strings (column 0) with integer codes. The codes follow the
# sorted label order (blue=0, green=1, red=2), which is an arbitrary order for a
# nominal feature; the one-hot cells below avoid that.
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [60]:
from sklearn.preprocessing import OneHotEncoder

In [61]:
# Rebuild X from the frame so column 0 holds the original color strings again.
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
# The encoder expects 2-D input, so keep the color column as shape (n_rows, 1).
color_column = X[:, [0]]
# fit_transform gives a sparse matrix; toarray() makes it dense for display.
color_ohe.fit_transform(color_column).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [62]:
# One-hot encode with pandas: only the string column ('color') is expanded into
# indicator columns; the numeric 'size' and 'price' columns pass through as-is.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise return
# True/False, which no longer matches the output recorded for this cell.
pd.get_dummies(df[['color', 'size', 'price']], dtype=int)

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0
