In [1]:
#medium article: https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d?gi=c5101f25cd16

In [108]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [70]:
dataset = pd.read_csv('fake_data_2.csv')

In [71]:
dataset.head(30)

Unnamed: 0,cargo,idade,salario,bonus,sócio
0,Diretor,45,24000.0,10000.0,sim
1,Analista,22,8000.0,2000.0,não
2,Programador,30,,1000.0,não
3,Gerente,24,15100.0,,não
4,Gerente,30,35000.0,6000.0,sim
5,Programador,22,5300.0,2000.0,não
6,Analista,20,,1200.0,não
7,Diretor,50,18000.0,8000.0,sim
8,Fundador,65,38000.0,28000.0,sim
9,Analista,32,7300.0,4000.0,não


In [72]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values

In [73]:
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) #This will handle missing data with the mean of the column



In [74]:
X[:, 2:4]

array([[24000.0, 10000.0],
       [8000.0, 2000.0],
       [nan, 1000.0],
       [15100.0, nan],
       [35000.0, 6000.0],
       [5300.0, 2000.0],
       [nan, 1200.0],
       [18000.0, 8000.0],
       [38000.0, 28000.0],
       [7300.0, 4000.0],
       [2344.0, nan],
       [4500.0, 2200.0],
       [30000.0, 12000.0],
       [14000.0, 10000.0]], dtype=object)

In [75]:
imputer = imputer.fit(X[:, 2:4]) #put only the columns with possible missing data

In [76]:
X[:, 2:4] = imputer.transform(X[:, 2:4]) #put only the columns with possible missing data

In [77]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [78]:
X_cargo = pd.DataFrame({'cargo':X[:,0]})

In [79]:
X_cargo

Unnamed: 0,cargo
0,Diretor
1,Analista
2,Programador
3,Gerente
4,Gerente
5,Programador
6,Analista
7,Diretor
8,Fundador
9,Analista


In [80]:
X_cargo = pd.get_dummies(X_cargo)

In [81]:
X_cargo.head()

Unnamed: 0,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,1,0


In [82]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [83]:
X = X[:, 1:]

In [84]:
X

array([[45, 24000.0, 10000.0],
       [22, 8000.0, 2000.0],
       [30, 16795.333333333332, 1000.0],
       [24, 15100.0, 7200.0],
       [30, 35000.0, 6000.0],
       [22, 5300.0, 2000.0],
       [20, 16795.333333333332, 1200.0],
       [50, 18000.0, 8000.0],
       [65, 38000.0, 28000.0],
       [32, 7300.0, 4000.0],
       [35, 2344.0, 7200.0],
       [28, 4500.0, 2200.0],
       [28, 30000.0, 12000.0],
       [30, 14000.0, 10000.0]], dtype=object)

In [85]:
X = pd.DataFrame({'idade':X[:,0], 'salario':X[:,1], 'bonus':X[:,2]})

In [86]:
X.head()

Unnamed: 0,idade,salario,bonus
0,45,24000.0,10000
1,22,8000.0,2000
2,30,16795.3,1000
3,24,15100.0,7200
4,30,35000.0,6000


In [87]:
X = X.join(X_cargo)

In [88]:
Y

array(['sim', 'não', 'não', 'não', 'sim', 'não', 'não', 'sim', 'sim',
       'não', 'não', 'não', 'sim', 'sim'], dtype=object)

In [89]:
Y = pd.get_dummies(Y)

In [90]:
Y

Unnamed: 0,não,sim
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
5,1,0
6,1,0
7,0,1
8,0,1
9,1,0


In [91]:
Y = Y['sim'].values

In [92]:
Y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=uint8)

In [93]:
#labelencoder_X = LabelEncoder() #labelEnconder will transform categorical values in numbers
#X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [104]:
X_normalize = X
X_normalize[['idade', 'salario', 'bonus']] = Normalizer().fit_transform(X[['idade', 'salario', 'bonus']])
X_normalize

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.190404,0.908494,0.372,0,1,0,0,0
1,0.364689,0.910489,0.194967,1,0,0,0,0
2,0.193973,0.981007,0.0,0,0,0,0,1
3,0.12888,0.892882,0.431454,0,0,0,1,0
4,0.0,0.992258,0.124191,0,0,0,1,0
5,0.552205,0.783841,0.284012,0,0,0,0,1
6,0.072164,0.997305,0.013252,1,0,0,0,0
7,0.348555,0.85688,0.379825,0,1,0,0,0
8,0.118981,0.763186,0.63513,0,0,1,0,0
9,0.554072,0.722192,0.414057,1,0,0,0,0


In [103]:
X_minMax = X
X_minMax[['idade', 'salario', 'bonus']] = MinMaxScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_minMax

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.18671,0.890869,0.364783,0,1,0,0,0
1,0.384204,0.959209,0.205399,1,0,0,0,0
2,0.197729,1.0,0.0,0,0,0,0,1
3,0.124306,0.861193,0.416141,0,0,0,1,0
4,0.0,0.981691,0.122868,0,0,0,1,0
5,0.640412,0.909048,0.329379,0,0,0,0,1
6,0.072277,0.998876,0.013273,1,0,0,0,0
7,0.356907,0.877413,0.388927,0,1,0,0,0
8,0.11217,0.719495,0.59877,0,0,1,0,0
9,0.632119,0.82392,0.472381,1,0,0,0,0


In [105]:
X_standard = X
X_standard[['idade', 'salario', 'bonus']] = StandardScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_standard

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,-0.436693,0.464111,0.096181,0,1,0,0,0
1,0.320588,0.472236,-0.71535,1,0,0,0,0
2,-0.421185,0.759551,-1.609088,0,0,0,0,1
3,-0.704018,0.400499,0.368719,0,0,0,1,0
4,-1.264012,0.805394,-1.039791,0,0,0,1,0
5,1.135357,-0.043772,-0.30716,0,0,0,0,1
6,-0.950455,0.825955,-1.548341,1,0,0,0,0
7,0.250482,0.253814,0.132051,0,1,0,0,0
8,-0.747031,-0.127925,1.302385,0,0,1,0,0
9,1.14347,-0.294951,0.288972,1,0,0,0,0


In [107]:
X_robust = X
X_robust[['idade', 'salario', 'bonus']] = RobustScaler().fit_transform(X[['idade', 'salario', 'bonus']])
X_robust

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,-0.01467,0.206574,0.021806,0,1,0,0,0
1,0.439351,0.21883,-0.821707,1,0,0,0,0
2,-0.005372,0.6522,-1.750666,0,0,0,0,1
3,-0.174942,0.110626,0.305085,0,0,0,1,0
4,-0.510681,0.721346,-1.158933,0,0,0,1,0
5,0.927839,-0.559487,-0.39743,0,0,0,0,1
6,-0.322692,0.752359,-1.687525,1,0,0,0,0
7,0.397319,-0.110626,0.059089,0,1,0,0,0
8,-0.20073,-0.68642,1.275545,0,0,1,0,0
9,0.932702,-0.938352,0.222195,1,0,0,0,0


In [109]:
X_quantile = X
X_quantile[['idade', 'salario', 'bonus']] = QuantileTransformer().fit_transform(X[['idade', 'salario', 'bonus']])
X_quantile

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0.3852232,0.6155787,0.538472,0,1,0,0,0
1,0.6918128,0.6918218,0.2307119,1,0,0,0,0
2,0.4615288,0.8465066,1e-07,0,0,0,0,1
3,0.3073988,0.5385036,0.7688451,0,0,0,1,0
4,1e-07,0.9232117,0.153933,0,0,0,1,0
5,0.7697583,0.3844543,0.3080063,0,0,0,0,1
6,0.153918,0.9999999,0.07647297,1,0,0,0,0
7,0.6155844,0.4616062,0.6150475,0,1,0,0,0
8,0.2310776,0.2311839,0.9229506,0,0,1,0,0
9,0.8458578,0.1540049,0.6924514,1,0,0,0,0


In [110]:
X_power = X
X_power[['idade', 'salario', 'bonus']] = PowerTransformer().fit_transform(X[['idade', 'salario', 'bonus']])
X_power

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,-0.345256,0.394406,0.149909,0,1,0,0,0
1,0.633143,0.633123,-0.859101,1,0,0,0,0
2,-0.096984,1.109134,-1.658362,0,0,0,0,1
3,-0.602097,0.150117,0.871434,0,0,0,1,0
4,-1.65835,1.341341,-1.12046,0,0,0,1,0
5,0.874394,-0.347859,-0.600161,0,0,0,0,1
6,-1.12046,1.571425,-1.38868,1,0,0,0,0
7,0.394444,-0.09677,0.39262,0,1,0,0,0
8,-0.857798,-0.857708,1.340649,0,0,1,0,0
9,1.107327,-1.120575,0.634989,1,0,0,0,0


In [13]:
# But with this thing above we get a problem, has you can see cat=2 and dog= 0, so we have a hierarchical value, but cat is not necessarily greater than dog

In [14]:
#So we need to create dummy variable, where cat, moose and dogs can be transofrm in columns and we can fill this columns with 0's and 1's

In [30]:
#onehotencoder = OneHotEncoder(categorical_features = [0]) #this wiil transform this values in columns

In [31]:
#X = onehotencoder.fit_transform(X).toarray()

In [32]:
#X

In [33]:
#labelencoder_y = LabelEncoder()
#y = labelencoder_y.fit_transform(y) #lets go transform yes and no now

NameError: name 'labelencoder_y' is not defined

In [34]:
#y

In [27]:
#Now lets split pur data in train and test

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [29]:
# Now for feature scaling

In [30]:
# As you can se we have a variable age with values from 4 to 17

In [31]:
# And woth with values from 48.000 to 83.000

In [32]:
# Once upon a time worth is much bigger then age thats mean that Euclidean distance will be docimnated by worth and will wind up up dominating the age

In [35]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform([y_train])