In [1]:
#medium article: https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d?gi=c5101f25cd16

In [139]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA

In [90]:
dataset = pd.read_csv('fake_data_2.csv')

In [91]:
dataset.head(30)

Unnamed: 0,cargo,idade,salario,bonus,sócio
0,Diretor,45,24000.0,10000.0,sim
1,Analista,22,8000.0,2000.0,não
2,Programador,30,,1000.0,não
3,Gerente,24,15100.0,,não
4,Gerente,30,35000.0,6000.0,sim
5,Programador,22,5300.0,2000.0,não
6,Analista,20,,1200.0,não
7,Diretor,50,18000.0,8000.0,sim
8,Fundador,65,38000.0,28000.0,sim
9,Analista,32,7300.0,4000.0,não


In [92]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values

In [93]:
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) #This will handle missing data with the mean of the column



In [94]:
X[:, 2:4]

array([[24000.0, 10000.0],
       [8000.0, 2000.0],
       [nan, 1000.0],
       [15100.0, nan],
       [35000.0, 6000.0],
       [5300.0, 2000.0],
       [nan, 1200.0],
       [18000.0, 8000.0],
       [38000.0, 28000.0],
       [7300.0, 4000.0],
       [2344.0, nan],
       [4500.0, 2200.0],
       [30000.0, 12000.0],
       [14000.0, 10000.0]], dtype=object)

In [95]:
imputer = imputer.fit(X[:, 2:4]) #put only the columns with possible missing data

In [96]:
X[:, 2:4] = imputer.transform(X[:, 2:4]) #put only the columns with possible missing data

In [97]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [98]:
X_cargo = pd.DataFrame({'cargo':X[:,0]})

In [99]:
X_cargo

Unnamed: 0,cargo
0,Diretor
1,Analista
2,Programador
3,Gerente
4,Gerente
5,Programador
6,Analista
7,Diretor
8,Fundador
9,Analista


In [100]:
X_cargo = pd.get_dummies(X_cargo)

In [101]:
X_cargo.head()

Unnamed: 0,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,1,0


In [102]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [103]:
X = X[:, 1:]

In [104]:
X

array([[45, 24000.0, 10000.0],
       [22, 8000.0, 2000.0],
       [30, 16795.333333333332, 1000.0],
       [24, 15100.0, 7200.0],
       [30, 35000.0, 6000.0],
       [22, 5300.0, 2000.0],
       [20, 16795.333333333332, 1200.0],
       [50, 18000.0, 8000.0],
       [65, 38000.0, 28000.0],
       [32, 7300.0, 4000.0],
       [35, 2344.0, 7200.0],
       [28, 4500.0, 2200.0],
       [28, 30000.0, 12000.0],
       [30, 14000.0, 10000.0]], dtype=object)

In [105]:
X = pd.DataFrame({'idade':X[:,0], 'salario':X[:,1], 'bonus':X[:,2]})

In [106]:
X.head()

Unnamed: 0,idade,salario,bonus
0,45,24000.0,10000
1,22,8000.0,2000
2,30,16795.3,1000
3,24,15100.0,7200
4,30,35000.0,6000


In [107]:
X = X.join(X_cargo)

In [108]:
Y

array(['sim', 'não', 'não', 'não', 'sim', 'não', 'não', 'sim', 'sim',
       'não', 'não', 'não', 'sim', 'sim'], dtype=object)

In [109]:
Y = pd.get_dummies(Y)

In [110]:
Y

Unnamed: 0,não,sim
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
5,1,0
6,1,0
7,0,1
8,0,1
9,1,0


In [111]:
Y = Y['sim'].values

In [112]:
Y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=uint8)

In [113]:
#labelencoder_X = LabelEncoder() #labelEnconder will transform categorical values in numbers
#X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [114]:
X_normalize = X.copy()
X_normalize[['idade', 'salario', 'bonus']] = Normalizer().fit_transform(X[['idade', 'salario', 'bonus']])
X_normalize

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.001731,0.923076,0.384615,0,1,0,0,0
1,0.002668,0.970139,0.242535,1,0,0,0,0
2,0.001783,0.998231,0.059435,0,0,0,0,1
3,0.001435,0.902638,0.430397,0,0,0,1,0
4,0.000845,0.985622,0.168964,0,0,0,1,0
5,0.003884,0.935595,0.353055,0,0,0,0,1
6,0.001188,0.997457,0.071267,1,0,0,0,0
7,0.002538,0.913809,0.406137,0,1,0,0,0
8,0.001377,0.805055,0.593198,0,0,1,0,0
9,0.003844,0.876969,0.480531,1,0,0,0,0


In [115]:
X_minMax = X.copy()
X_minMax[['idade', 'salario', 'bonus']] = MinMaxScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_minMax

  return self.partial_fit(X, y)


Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.555556,0.607359,0.333333,0,1,0,0,0
1,0.044444,0.158627,0.037037,1,0,0,0,0
2,0.222222,0.405299,0.0,0,0,0,0,1
3,0.088889,0.357752,0.22963,0,0,0,1,0
4,0.222222,0.915863,0.185185,0,0,0,1,0
5,0.044444,0.082903,0.037037,0,0,0,0,1
6,0.0,0.405299,0.007407,1,0,0,0,0
7,0.666667,0.439085,0.259259,0,1,0,0,0
8,1.0,1.0,1.0,0,0,1,0,0
9,0.266667,0.138995,0.111111,1,0,0,0,0


In [116]:
X_standard = X.copy()
X_standard[['idade', 'salario', 'bonus']] = StandardScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_standard

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,1.002248,0.657862,0.415034,0,1,0,0,0
1,-0.907361,-0.803107,-0.770778,1,0,0,0,0
2,-0.243149,0.0,-0.919005,0,0,0,0,1
3,-0.741308,-0.154802,0.0,0,0,0,1,0
4,-0.243149,1.662279,-0.177872,0,0,0,1,0
5,-0.907361,-1.049646,-0.770778,0,0,0,0,1
6,-1.073414,0.0,-0.88936,1,0,0,0,0
7,1.417381,0.109999,0.118581,0,1,0,0,0
8,2.662778,1.936211,3.083113,0,0,1,0,0
9,-0.077096,-0.867024,-0.474325,1,0,0,0,0


In [117]:
X_robust = X.copy()
X_robust[['idade', 'salario', 'bonus']] = RobustScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_robust

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,1.621622,0.535929,0.456376,0,1,0,0,0
1,-0.864865,-0.528963,-0.61745,1,0,0,0,0
2,0.0,0.056417,-0.751678,0,0,0,0,1
3,-0.648649,-0.056417,0.080537,0,0,0,1,0
4,0.0,1.268042,-0.080537,0,0,0,1,0
5,-0.864865,-0.708663,-0.61745,0,0,0,0,1
6,-1.081081,0.056417,-0.724832,1,0,0,0,0
7,2.162162,0.136595,0.187919,0,1,0,0,0
8,3.783784,1.467709,2.872483,0,0,1,0,0
9,0.216216,-0.575552,-0.348993,1,0,0,0,0


In [118]:
X_quantile = X.copy()
X_quantile[['idade', 'salario', 'bonus']] = QuantileTransformer().fit_transform(X[['idade', 'salario', 'bonus']])
X_quantile

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.8463169,0.7692308,0.8078078,0,1,0,0,0
1,0.1151151,0.3073753,0.1921922,1,0,0,0,0
2,0.5385385,0.5765766,1e-07,0,0,0,0,1
3,0.230599,0.4615128,0.5765766,0,0,0,1,0
4,0.5385385,0.9231557,0.4615835,0,0,0,1,0
5,0.1151151,0.1536273,0.1921922,0,0,0,0,1
6,1e-07,0.5765766,0.0766556,1,0,0,0,0
7,0.9229801,0.6919351,0.6920823,0,1,0,0,0
8,0.9999999,0.9999999,0.9999999,0,0,1,0,0
9,0.6922083,0.2310002,0.3845972,1,0,0,0,0


In [119]:
X_power = X.copy()
X_power[['idade', 'salario', 'bonus']] = PowerTransformer().fit_transform(X[['idade', 'salario', 'bonus']])
X_power

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,1.14323,0.768685,0.775384,0,1,0,0,0
1,-1.075613,-0.711676,-0.943097,1,0,0,0,0
2,-0.120103,0.22211,-1.639884,0,0,0,0,1
3,-0.808699,0.072326,0.412848,0,0,0,1,0
4,-0.120103,1.428193,0.214291,0,0,0,1,0
5,-1.075613,-1.131902,-0.943097,0,0,0,0,1
6,-1.366824,0.22211,-1.459056,1,0,0,0,0
7,1.473534,0.322753,0.528451,0,1,0,0,0
8,2.298874,1.584242,1.952536,0,0,1,0,0
9,0.080032,-0.810597,-0.220593,1,0,0,0,0


In [87]:
Y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=uint8)

In [127]:
X_new = SelectKBest(chi2, k=6).fit_transform(X_minMax, Y)

In [128]:
X_new

array([[0.55555556, 0.60735921, 0.33333333, 0.        , 1.        ,
        0.        ],
       [0.04444444, 0.15862688, 0.03703704, 1.        , 0.        ,
        0.        ],
       [0.22222222, 0.40529878, 0.        , 0.        , 0.        ,
        0.        ],
       [0.08888889, 0.35775185, 0.22962963, 0.        , 0.        ,
        0.        ],
       [0.22222222, 0.91586269, 0.18518519, 0.        , 0.        ,
        0.        ],
       [0.04444444, 0.0829033 , 0.03703704, 0.        , 0.        ,
        0.        ],
       [0.        , 0.40529878, 0.00740741, 1.        , 0.        ,
        0.        ],
       [0.66666667, 0.43908459, 0.25925926, 0.        , 1.        ,
        0.        ],
       [1.        , 1.        , 1.        , 0.        , 0.        ,
        1.        ],
       [0.26666667, 0.13899484, 0.11111111, 1.        , 0.        ,
        0.        ],
       [0.33333333, 0.        , 0.22962963, 0.        , 0.        ,
        0.        ],
       [0.17777778, 0

In [130]:
clf = ExtraTreesClassifier()

In [131]:
clf = clf.fit(X_minMax, Y)



In [132]:
clf.feature_importances_ 

array([0.07196759, 0.36210826, 0.31178241, 0.01799979, 0.08791278,
       0.09416667, 0.02203704, 0.03202546])

In [137]:
model = SelectFromModel(clf, prefit=True, threshold="median")
X_new = model.transform(X_minMax)

In [144]:
X_new

array([[ 0.53866608, -0.13267348],
       [-0.1368266 ,  0.93283828],
       [-0.60363558, -0.37644469],
       [ 0.20186796,  0.18208778],
       [ 0.4695611 ,  0.08126605],
       [-0.78274982, -0.29609104],
       [-0.0501463 ,  0.91612012],
       [ 0.47351528, -0.11871599],
       [ 1.21379905, -0.62653624],
       [-0.0615711 ,  0.86322067],
       [-0.6810227 , -0.40144957],
       [-0.7534116 , -0.32754108],
       [ 0.70951022, -0.25128851],
       [-0.53755599, -0.4447923 ]])

In [151]:
X_minMax.head()

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.555556,0.607359,0.333333,0,1,0,0,0
1,0.044444,0.158627,0.037037,1,0,0,0,0
2,0.222222,0.405299,0.0,0,0,0,0,1
3,0.088889,0.357752,0.22963,0,0,0,1,0
4,0.222222,0.915863,0.185185,0,0,0,1,0


In [163]:
pca = PCA(n_components='mle')
X_new = pca.fit_transform(X_minMax) 

In [158]:
X_new

array([[ 5.38666083e-01, -1.32673484e-01, -3.35747301e-01,
         7.97915592e-01,  8.46645321e-02, -5.71454886e-02,
         7.02141336e-02,  9.49215691e-17],
       [-1.36826597e-01,  9.32838282e-01, -2.12709717e-01,
        -1.62716937e-01, -2.87589621e-03,  4.53343737e-02,
         1.86143937e-02,  5.22365692e-17],
       [-6.03635582e-01, -3.76444686e-01,  1.63619623e-02,
        -4.85058731e-02,  7.25472512e-02, -2.33347027e-01,
        -9.46164053e-02,  3.74683587e-17],
       [ 2.01867960e-01,  1.82087779e-01,  9.04576719e-01,
         1.54438518e-01, -6.25774591e-02,  2.63623918e-01,
         3.43141853e-02,  3.77796169e-17],
       [ 4.69561100e-01,  8.12660513e-02,  9.58318756e-01,
         1.28386643e-01, -3.62142939e-02, -2.26275919e-01,
        -3.74659035e-02,  1.80514734e-17],
       [-7.82749815e-01, -2.96091038e-01,  1.03180943e-02,
        -4.70637135e-02,  1.19466556e-01,  5.71938586e-02,
         1.36558209e-02,  4.18691106e-17],
       [-5.01462969e-02,  9.161201

In [32]:
# But with this thing above we get a problem, has you can see cat=2 and dog= 0, so we have a hierarchical value, but cat is not necessarily greater than dog

In [33]:
#So we need to create dummy variable, where cat, moose and dogs can be transofrm in columns and we can fill this columns with 0's and 1's

In [34]:
#onehotencoder = OneHotEncoder(categorical_features = [0]) #this wiil transform this values in columns

In [35]:
#X = onehotencoder.fit_transform(X).toarray()

In [36]:
#X

In [37]:
#labelencoder_y = LabelEncoder()
#y = labelencoder_y.fit_transform(y) #lets go transform yes and no now

In [38]:
#y

In [39]:
#Now lets split pur data in train and test

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

NameError: name 'y' is not defined

In [None]:
# Now for feature scaling

In [None]:
# As you can se we have a variable age with values from 4 to 17

In [31]:
# And woth with values from 48.000 to 83.000

In [32]:
# Once upon a time worth is much bigger then age thats mean that Euclidean distance will be docimnated by worth and will wind up up dominating the age

In [35]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform([y_train])