In [1]:
#medium article: https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d?gi=c5101f25cd16

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [37]:
dataset = pd.read_csv('fake_data_2.csv')

In [38]:
dataset.head(30)

Unnamed: 0,cargo,idade,salario,bonus,sócio
0,Diretor,45,24000.0,10000.0,sim
1,Analista,22,8000.0,2000.0,não
2,Programador,30,,1000.0,não
3,Gerente,24,15100.0,,não
4,Gerente,30,35000.0,6000.0,sim
5,Programador,22,5300.0,2000.0,não
6,Analista,20,,1200.0,não
7,Diretor,50,18000.0,8000.0,sim
8,Fundador,65,38000.0,28000.0,sim
9,Analista,32,7300.0,4000.0,não


In [39]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values

In [40]:
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) #This will handle missing data with the mean of the column



In [41]:
X[:, 2:4]

array([[24000.0, 10000.0],
       [8000.0, 2000.0],
       [nan, 1000.0],
       [15100.0, nan],
       [35000.0, 6000.0],
       [5300.0, 2000.0],
       [nan, 1200.0],
       [18000.0, 8000.0],
       [38000.0, 28000.0],
       [7300.0, 4000.0],
       [2344.0, nan],
       [4500.0, 2200.0],
       [30000.0, 12000.0],
       [14000.0, 10000.0]], dtype=object)

In [42]:
imputer = imputer.fit(X[:, 2:4]) #put only the columns with possible missing data

In [43]:
X[:, 2:4] = imputer.transform(X[:, 2:4]) #put only the columns with possible missing data

In [44]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [45]:
labelencoder_X = LabelEncoder() #labelEnconder will transform categorical values in numbers
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [54]:
X

array([[1, 45, 24000.0, 10000.0],
       [0, 22, 8000.0, 2000.0],
       [4, 30, 16795.333333333332, 1000.0],
       [3, 24, 15100.0, 7200.0],
       [3, 30, 35000.0, 6000.0],
       [4, 22, 5300.0, 2000.0],
       [0, 20, 16795.333333333332, 1200.0],
       [1, 50, 18000.0, 8000.0],
       [2, 65, 38000.0, 28000.0],
       [0, 32, 7300.0, 4000.0],
       [4, 35, 2344.0, 7200.0],
       [4, 28, 4500.0, 2200.0],
       [2, 28, 30000.0, 12000.0],
       [4, 30, 14000.0, 10000.0]], dtype=object)

In [47]:
# But with this thing above we get a problem, has you can see cat=2 and dog= 0, so we have a hierarchical value, but cat is not necessarily greater than dog

In [48]:
#So we need to create dummy variable, where cat, moose and dogs can be transofrm in columns and we can fill this columns with 0's and 1's

In [58]:
onehotencoder = OneHotEncoder(categorical_features = [0]) #this wiil transform this values in columns

In [77]:
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [78]:
X

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.50000000e+01, 2.40000000e+04, 1.00000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.20000000e+01, 8.00000000e+03, 2.00000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 3.00000000e+01, 1.67953333e+04, 1.00000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 2.40000000e+01, 1.51000000e+04, 7.20000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 3.00000000e+01, 3.50000000e+04, 6.00000000e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 2.20000000e+01, 5.30000000e+03, 2.00000000e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.00000000e+01, 1.67953333e+04, 1.

In [79]:
X = pd.DataFrame(np.concatenate(X))

In [None]:
# 'cargo_diretor':X[:,0], 'cargo_analista':X[:,1], 'cargo_programador':X[:,2], 
                      'cargo_gerente':X[:,3], 'cargo_fundador':X[:,4], 'idade':X[:,5], 'salario':X[:,6], 
                       'bonus'

In [75]:
df = pd.DataFrame(X_copy, columns=['cargo_diretor', 'cargo_analista', 'cargo_programador', 'cargo_gerente', 'cargo_fundador', 
                                  'idade', 'salario', 'bonus'])

In [76]:
df

Unnamed: 0,cargo_diretor,cargo_analista,cargo_programador,cargo_gerente,cargo_fundador,idade,salario,bonus
0,0.0,1.0,0.0,0.0,0.0,45.0,24000.0,10000.0
1,1.0,0.0,0.0,0.0,0.0,22.0,8000.0,2000.0
2,0.0,0.0,0.0,0.0,1.0,30.0,16795.333333,1000.0
3,0.0,0.0,0.0,1.0,0.0,24.0,15100.0,7200.0
4,0.0,0.0,0.0,1.0,0.0,30.0,35000.0,6000.0
5,0.0,0.0,0.0,0.0,1.0,22.0,5300.0,2000.0
6,1.0,0.0,0.0,0.0,0.0,20.0,16795.333333,1200.0
7,0.0,1.0,0.0,0.0,0.0,50.0,18000.0,8000.0
8,0.0,0.0,1.0,0.0,0.0,65.0,38000.0,28000.0
9,1.0,0.0,0.0,0.0,0.0,32.0,7300.0,4000.0


In [None]:
df

In [71]:
X_2

Unnamed: 0,0
0,0.000000
1,1.000000
2,0.000000
3,0.000000
4,0.000000
5,45.000000
6,24000.000000
7,10000.000000
8,1.000000
9,0.000000


In [67]:
X_copy = pd.DataFrame({'cargo_diretor':X[:,0], 'cargo_analista':X[:,1], 'cargo_programador':X[:,2], 
                      'cargo_gerente':X[:,3], 'cargo_fundador':X[:,4], 'idade':X[:,5], 'salario':X[:,6], 
                       'bonus':X[:,7]})

IndexError: index 4 is out of bounds for axis 1 with size 4

In [81]:
labelencoder_y = LabelEncoder()
Y = labelencoder_y.fit_transform(Y) #lets go transform yes and no now

In [82]:
Y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1])

In [34]:
#y

In [27]:
#Now lets split pur data in train and test

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [29]:
# Now for feature scaling

In [30]:
# As you can se we have a variable age with values from 4 to 17

In [31]:
# And woth with values from 48.000 to 83.000

In [32]:
# Once upon a time worth is much bigger then age thats mean that Euclidean distance will be docimnated by worth and will wind up up dominating the age

In [35]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform([y_train])