In [1]:
#medium article: https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d?gi=c5101f25cd16

In [133]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [134]:
dataset = pd.read_csv('fake_data_2.csv')

In [135]:
dataset.head(30)

Unnamed: 0,cargo,idade,salario,bonus,sócio
0,Diretor,45,24000.0,10000.0,sim
1,Analista,22,8000.0,2000.0,não
2,Programador,30,,1000.0,não
3,Gerente,24,15100.0,,não
4,Gerente,30,35000.0,6000.0,sim
5,Programador,22,5300.0,2000.0,não
6,Analista,20,,1200.0,não
7,Diretor,50,18000.0,8000.0,sim
8,Fundador,65,38000.0,28000.0,sim
9,Analista,32,7300.0,4000.0,não


In [136]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values

In [137]:
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) #This will handle missing data with the mean of the column



In [138]:
X[:, 2:4]

array([[24000.0, 10000.0],
       [8000.0, 2000.0],
       [nan, 1000.0],
       [15100.0, nan],
       [35000.0, 6000.0],
       [5300.0, 2000.0],
       [nan, 1200.0],
       [18000.0, 8000.0],
       [38000.0, 28000.0],
       [7300.0, 4000.0],
       [2344.0, nan],
       [4500.0, 2200.0],
       [30000.0, 12000.0],
       [14000.0, 10000.0]], dtype=object)

In [139]:
imputer = imputer.fit(X[:, 2:4]) #put only the columns with possible missing data

In [140]:
X[:, 2:4] = imputer.transform(X[:, 2:4]) #put only the columns with possible missing data

In [141]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [142]:
labelencoder_X = LabelEncoder() #labelEnconder will transform categorical values in numbers
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [143]:
X

array([[1, 45, 24000.0, 10000.0],
       [0, 22, 8000.0, 2000.0],
       [4, 30, 16795.333333333332, 1000.0],
       [3, 24, 15100.0, 7200.0],
       [3, 30, 35000.0, 6000.0],
       [4, 22, 5300.0, 2000.0],
       [0, 20, 16795.333333333332, 1200.0],
       [1, 50, 18000.0, 8000.0],
       [2, 65, 38000.0, 28000.0],
       [0, 32, 7300.0, 4000.0],
       [4, 35, 2344.0, 7200.0],
       [4, 28, 4500.0, 2200.0],
       [2, 28, 30000.0, 12000.0],
       [4, 30, 14000.0, 10000.0]], dtype=object)

In [144]:
onehotencoder = OneHotEncoder(categorical_features = [0]) #this wiil transform this values in columns

In [145]:
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [109]:
X

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.50000000e+01, 2.40000000e+04, 1.00000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.20000000e+01, 8.00000000e+03, 2.00000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 3.00000000e+01, 1.67953333e+04, 1.00000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 2.40000000e+01, 1.51000000e+04, 7.20000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 3.00000000e+01, 3.50000000e+04, 6.00000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 2.20000000e+01, 5.30000000e+03, 2.00000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.00000000e+01, 1.67953333e+04, 1.

In [154]:
X_copy = pd.DataFrame({'cargo_diretor':X[:,0], 'cargo_analista':X[:,1], 'cargo_programador':X[:,2], 
                      'cargo_gerente':X[:,3], 'cargo_fundador':X[:,4], 'idade':X[:,5], 'salario':X[:,6], 
                       'bonus':X[:,7]})

In [155]:
labelencoder_y = LabelEncoder()
Y = labelencoder_y.fit_transform(Y) #lets go transform yes and no now

In [156]:
Y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1])

In [157]:
X_copy

Unnamed: 0,cargo_diretor,cargo_analista,cargo_programador,cargo_gerente,cargo_fundador,idade,salario,bonus
0,0.0,1.0,0.0,0.0,0.0,45.0,24000.0,10000.0
1,1.0,0.0,0.0,0.0,0.0,22.0,8000.0,2000.0
2,0.0,0.0,0.0,0.0,1.0,30.0,16795.333333,1000.0
3,0.0,0.0,0.0,1.0,0.0,24.0,15100.0,7200.0
4,0.0,0.0,0.0,1.0,0.0,30.0,35000.0,6000.0
5,0.0,0.0,0.0,0.0,1.0,22.0,5300.0,2000.0
6,1.0,0.0,0.0,0.0,0.0,20.0,16795.333333,1200.0
7,0.0,1.0,0.0,0.0,0.0,50.0,18000.0,8000.0
8,0.0,0.0,1.0,0.0,0.0,65.0,38000.0,28000.0
9,1.0,0.0,0.0,0.0,0.0,32.0,7300.0,4000.0
