In [1]:
#medium article: https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d?gi=c5101f25cd16

In [60]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

In [61]:
dataset = pd.read_csv('fake_data_2.csv')

In [62]:
dataset.head()

Unnamed: 0,cargo,idade,salario,bonus,sócio
0,Diretor,45,24000.0,10000.0,sim
1,Analista,22,8000.0,2000.0,não
2,Programador,30,,1000.0,não
3,Gerente,24,15100.0,,não
4,Gerente,30,35000.0,6000.0,sim


In [63]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values

In [64]:
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) #This will handle missing data with the mean of the column



In [65]:
X[:, 2:4]

array([[24000.0, 10000.0],
       [8000.0, 2000.0],
       [nan, 1000.0],
       [15100.0, nan],
       [35000.0, 6000.0],
       [5300.0, 2000.0],
       [nan, 1200.0],
       [18000.0, 8000.0],
       [38000.0, 28000.0],
       [7300.0, 4000.0],
       [2344.0, nan],
       [4500.0, 2200.0],
       [30000.0, 12000.0],
       [14000.0, 10000.0]], dtype=object)

In [66]:
imputer = imputer.fit(X[:, 2:4]) #put only the columns with possible missing data

In [67]:
X[:, 2:4] = imputer.transform(X[:, 2:4]) #put only the columns with possible missing data

In [68]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [69]:
X_cargo = pd.DataFrame({'cargo':X[:,0]})

In [70]:
X_cargo

Unnamed: 0,cargo
0,Diretor
1,Analista
2,Programador
3,Gerente
4,Gerente
5,Programador
6,Analista
7,Diretor
8,Fundador
9,Analista


In [71]:
X_cargo = pd.get_dummies(X_cargo)

In [72]:
X_cargo.head()

Unnamed: 0,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,1,0


In [73]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [74]:
X = X[:, 1:]

In [75]:
X

array([[45, 24000.0, 10000.0],
       [22, 8000.0, 2000.0],
       [30, 16795.333333333332, 1000.0],
       [24, 15100.0, 7200.0],
       [30, 35000.0, 6000.0],
       [22, 5300.0, 2000.0],
       [20, 16795.333333333332, 1200.0],
       [50, 18000.0, 8000.0],
       [65, 38000.0, 28000.0],
       [32, 7300.0, 4000.0],
       [35, 2344.0, 7200.0],
       [28, 4500.0, 2200.0],
       [28, 30000.0, 12000.0],
       [30, 14000.0, 10000.0]], dtype=object)

In [76]:
X = pd.DataFrame({'idade':X[:,0], 'salario':X[:,1], 'bonus':X[:,2]})

In [77]:
X.head()

Unnamed: 0,idade,salario,bonus
0,45,24000.0,10000
1,22,8000.0,2000
2,30,16795.3,1000
3,24,15100.0,7200
4,30,35000.0,6000


In [78]:
X = X.join(X_cargo)

In [79]:
X

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,45,24000.0,10000,0,1,0,0,0
1,22,8000.0,2000,1,0,0,0,0
2,30,16795.3,1000,0,0,0,0,1
3,24,15100.0,7200,0,0,0,1,0
4,30,35000.0,6000,0,0,0,1,0
5,22,5300.0,2000,0,0,0,0,1
6,20,16795.3,1200,1,0,0,0,0
7,50,18000.0,8000,0,1,0,0,0
8,65,38000.0,28000,0,0,1,0,0
9,32,7300.0,4000,1,0,0,0,0


In [20]:
Y

array(['sim', 'não', 'não', 'não', 'sim', 'não', 'não', 'sim', 'sim',
       'não', 'não', 'não', 'sim', 'sim'], dtype=object)

In [21]:
Y = pd.get_dummies(Y)

In [22]:
Y

Unnamed: 0,não,sim
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
5,1,0
6,1,0
7,0,1
8,0,1
9,1,0


In [23]:
Y = Y['sim'].values

In [24]:
Y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=uint8)

In [80]:
X_normalize = X.copy()
X_normalize[['idade', 'salario', 'bonus']] = Normalizer().fit_transform(X[['idade', 'salario', 'bonus']])
X_normalize.head()

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.001731,0.923076,0.384615,0,1,0,0,0
1,0.002668,0.970139,0.242535,1,0,0,0,0
2,0.001783,0.998231,0.059435,0,0,0,0,1
3,0.001435,0.902638,0.430397,0,0,0,1,0
4,0.000845,0.985622,0.168964,0,0,0,1,0


In [81]:
X_minMax = X.copy()
X_minMax[['idade', 'salario', 'bonus']] = MinMaxScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_minMax.head()

  return self.partial_fit(X, y)


Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.555556,0.607359,0.333333,0,1,0,0,0
1,0.044444,0.158627,0.037037,1,0,0,0,0
2,0.222222,0.405299,0.0,0,0,0,0,1
3,0.088889,0.357752,0.22963,0,0,0,1,0
4,0.222222,0.915863,0.185185,0,0,0,1,0


In [82]:
X_standard = X.copy()
X_standard[['idade', 'salario', 'bonus']] = StandardScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_standard.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,1.002248,0.657862,0.415034,0,1,0,0,0
1,-0.907361,-0.803107,-0.770778,1,0,0,0,0
2,-0.243149,0.0,-0.919005,0,0,0,0,1
3,-0.741308,-0.154802,0.0,0,0,0,1,0
4,-0.243149,1.662279,-0.177872,0,0,0,1,0


In [83]:
X_robust = X.copy()
X_robust[['idade', 'salario', 'bonus']] = RobustScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_robust.head()

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,1.621622,0.535929,0.456376,0,1,0,0,0
1,-0.864865,-0.528963,-0.61745,1,0,0,0,0
2,0.0,0.056417,-0.751678,0,0,0,0,1
3,-0.648649,-0.056417,0.080537,0,0,0,1,0
4,0.0,1.268042,-0.080537,0,0,0,1,0


In [84]:
X_quantile = X.copy()
X_quantile[['idade', 'salario', 'bonus']] = QuantileTransformer().fit_transform(X[['idade', 'salario', 'bonus']])
X_quantile.head()

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.846317,0.769231,0.8078078,0,1,0,0,0
1,0.115115,0.307375,0.1921922,1,0,0,0,0
2,0.538539,0.576577,1e-07,0,0,0,0,1
3,0.230599,0.461513,0.5765766,0,0,0,1,0
4,0.538539,0.923156,0.4615835,0,0,0,1,0


In [85]:
X_power = X.copy()
X_power[['idade', 'salario', 'bonus']] = PowerTransformer().fit_transform(X[['idade', 'salario', 'bonus']])
X_power.head()

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,1.14323,0.768685,0.775384,0,1,0,0,0
1,-1.075613,-0.711676,-0.943097,1,0,0,0,0
2,-0.120103,0.22211,-1.639884,0,0,0,0,1
3,-0.808699,0.072326,0.412848,0,0,0,1,0
4,-0.120103,1.428193,0.214291,0,0,0,1,0


In [33]:
X_new = SelectKBest(chi2, k=6).fit_transform(X_minMax, Y)

In [34]:
X_new

array([[0.55555556, 0.60735921, 0.33333333, 0.        , 1.        ,
        0.        ],
       [0.04444444, 0.15862688, 0.03703704, 1.        , 0.        ,
        0.        ],
       [0.22222222, 0.40529878, 0.        , 0.        , 0.        ,
        0.        ],
       [0.08888889, 0.35775185, 0.22962963, 0.        , 0.        ,
        0.        ],
       [0.22222222, 0.91586269, 0.18518519, 0.        , 0.        ,
        0.        ],
       [0.04444444, 0.0829033 , 0.03703704, 0.        , 0.        ,
        0.        ],
       [0.        , 0.40529878, 0.00740741, 1.        , 0.        ,
        0.        ],
       [0.66666667, 0.43908459, 0.25925926, 0.        , 1.        ,
        0.        ],
       [1.        , 1.        , 1.        , 0.        , 0.        ,
        1.        ],
       [0.26666667, 0.13899484, 0.11111111, 1.        , 0.        ,
        0.        ],
       [0.33333333, 0.        , 0.22962963, 0.        , 0.        ,
        0.        ],
       [0.17777778, 0

In [35]:
clf = ExtraTreesClassifier()

In [36]:
clf = clf.fit(X_minMax, Y)



In [37]:
clf.feature_importances_ 

array([0.12823864, 0.24412879, 0.27216225, 0.07169613, 0.11600694,
       0.11073338, 0.02269676, 0.03433712])

In [38]:
model = SelectFromModel(clf, prefit=True, threshold="median")
X_new = model.transform(X_minMax)

In [39]:
X_new

array([[0.55555556, 0.60735921, 0.33333333, 1.        ],
       [0.04444444, 0.15862688, 0.03703704, 0.        ],
       [0.22222222, 0.40529878, 0.        , 0.        ],
       [0.08888889, 0.35775185, 0.22962963, 0.        ],
       [0.22222222, 0.91586269, 0.18518519, 0.        ],
       [0.04444444, 0.0829033 , 0.03703704, 0.        ],
       [0.        , 0.40529878, 0.00740741, 0.        ],
       [0.66666667, 0.43908459, 0.25925926, 1.        ],
       [1.        , 1.        , 1.        , 0.        ],
       [0.26666667, 0.13899484, 0.11111111, 0.        ],
       [0.33333333, 0.        , 0.22962963, 0.        ],
       [0.17777778, 0.06046668, 0.04444444, 0.        ],
       [0.17777778, 0.77563383, 0.40740741, 0.        ],
       [0.22222222, 0.3269015 , 0.33333333, 0.        ]])

In [86]:
pca = PCA(n_components='mle')
X_new = pca.fit_transform(X_minMax) 

In [88]:
X_new

array([[ 0.53866608, -0.13267348, -0.3357473 ,  0.79791559,  0.08466453,
        -0.05714549,  0.07021413],
       [-0.1368266 ,  0.93283828, -0.21270972, -0.16271694, -0.0028759 ,
         0.04533437,  0.01861439],
       [-0.60363558, -0.37644469,  0.01636196, -0.04850587,  0.07254725,
        -0.23334703, -0.09461641],
       [ 0.20186796,  0.18208778,  0.90457672,  0.15443852, -0.06257746,
         0.26362392,  0.03431419],
       [ 0.4695611 ,  0.08126605,  0.95831876,  0.12838664, -0.03621429,
        -0.22627592, -0.0374659 ],
       [-0.78274982, -0.29609104,  0.01031809, -0.04706371,  0.11946656,
         0.05719386,  0.01365582],
       [-0.0501463 ,  0.91612012, -0.16114226, -0.18467334,  0.08231014,
        -0.16703018,  0.03887834],
       [ 0.47351528, -0.11871599, -0.38245359,  0.83764906,  0.01517078,
         0.06756885, -0.06232372],
       [ 1.21379905, -0.62653624, -0.30107854, -0.5921759 , -0.31002738,
        -0.02640987, -0.0082365 ],
       [-0.0615711 ,  0.8632

In [89]:
X_train, X_test, Y_train, Y_test = train_test_split(X_minMax, Y, test_size = 0.2, random_state = 0)

In [90]:
X_train

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
11,0.177778,0.060467,0.044444,0,0,0,0,1
2,0.222222,0.405299,0.0,0,0,0,0,1
13,0.222222,0.326902,0.333333,0,0,0,0,1
9,0.266667,0.138995,0.111111,1,0,0,0,0
1,0.044444,0.158627,0.037037,1,0,0,0,0
7,0.666667,0.439085,0.259259,0,1,0,0,0
10,0.333333,0.0,0.22963,0,0,0,0,1
3,0.088889,0.357752,0.22963,0,0,0,1,0
0,0.555556,0.607359,0.333333,0,1,0,0,0
5,0.044444,0.082903,0.037037,0,0,0,0,1


In [91]:
X_test

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
8,1.0,1.0,1.0,0,0,1,0,0
6,0.0,0.405299,0.007407,1,0,0,0,0
4,0.222222,0.915863,0.185185,0,0,0,1,0


In [55]:
X_new = X_minMax.values

In [94]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=2)
time = 1
for train_index, test_index in kf.split(X_new):
    print("K = " + str(time) +  " - TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_new[train_index], X_new[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    time += 1

K = 1 - TRAIN: [ 7  8  9 10 11 12 13] TEST: [0 1 2 3 4 5 6]
K = 2 - TRAIN: [0 1 2 3 4 5 6] TEST: [ 7  8  9 10 11 12 13]
