In [1]:
#medium article: https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d?gi=c5101f25cd16

In [69]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [70]:
dataset = pd.read_csv('fake_data_2.csv')

In [71]:
dataset.head(30)

Unnamed: 0,cargo,idade,salario,bonus,sócio
0,Diretor,45,24000.0,10000.0,sim
1,Analista,22,8000.0,2000.0,não
2,Programador,30,,1000.0,não
3,Gerente,24,15100.0,,não
4,Gerente,30,35000.0,6000.0,sim
5,Programador,22,5300.0,2000.0,não
6,Analista,20,,1200.0,não
7,Diretor,50,18000.0,8000.0,sim
8,Fundador,65,38000.0,28000.0,sim
9,Analista,32,7300.0,4000.0,não


In [72]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values

In [73]:
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) #This will handle missing data with the mean of the column



In [74]:
X[:, 2:4]

array([[24000.0, 10000.0],
       [8000.0, 2000.0],
       [nan, 1000.0],
       [15100.0, nan],
       [35000.0, 6000.0],
       [5300.0, 2000.0],
       [nan, 1200.0],
       [18000.0, 8000.0],
       [38000.0, 28000.0],
       [7300.0, 4000.0],
       [2344.0, nan],
       [4500.0, 2200.0],
       [30000.0, 12000.0],
       [14000.0, 10000.0]], dtype=object)

In [75]:
imputer = imputer.fit(X[:, 2:4]) #put only the columns with possible missing data

In [76]:
X[:, 2:4] = imputer.transform(X[:, 2:4]) #put only the columns with possible missing data

In [77]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [78]:
X_cargo = pd.DataFrame({'cargo':X[:,0]})

In [79]:
X_cargo

Unnamed: 0,cargo
0,Diretor
1,Analista
2,Programador
3,Gerente
4,Gerente
5,Programador
6,Analista
7,Diretor
8,Fundador
9,Analista


In [80]:
X_cargo = pd.get_dummies(X_cargo)

In [81]:
X_cargo.head()

Unnamed: 0,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,1,0


In [82]:
#X = pd.get_dummies(X)

In [83]:
X

array([['Diretor', 45, 24000.0, 10000.0],
       ['Analista', 22, 8000.0, 2000.0],
       ['Programador', 30, 16795.333333333332, 1000.0],
       ['Gerente', 24, 15100.0, 7200.0],
       ['Gerente', 30, 35000.0, 6000.0],
       ['Programador', 22, 5300.0, 2000.0],
       ['Analista', 20, 16795.333333333332, 1200.0],
       ['Diretor', 50, 18000.0, 8000.0],
       ['Fundador', 65, 38000.0, 28000.0],
       ['Analista', 32, 7300.0, 4000.0],
       ['Programador', 35, 2344.0, 7200.0],
       ['Programador', 28, 4500.0, 2200.0],
       ['Fundador', 28, 30000.0, 12000.0],
       ['Programador', 30, 14000.0, 10000.0]], dtype=object)

In [84]:
X = X[:, 1:]

In [85]:
X

array([[45, 24000.0, 10000.0],
       [22, 8000.0, 2000.0],
       [30, 16795.333333333332, 1000.0],
       [24, 15100.0, 7200.0],
       [30, 35000.0, 6000.0],
       [22, 5300.0, 2000.0],
       [20, 16795.333333333332, 1200.0],
       [50, 18000.0, 8000.0],
       [65, 38000.0, 28000.0],
       [32, 7300.0, 4000.0],
       [35, 2344.0, 7200.0],
       [28, 4500.0, 2200.0],
       [28, 30000.0, 12000.0],
       [30, 14000.0, 10000.0]], dtype=object)

In [86]:
X = pd.DataFrame({'idade':X[:,0], 'salario':X[:,1], 'bonus':X[:,2]})

In [87]:
X.head()

Unnamed: 0,idade,salario,bonus
0,45,24000.0,10000
1,22,8000.0,2000
2,30,16795.3,1000
3,24,15100.0,7200
4,30,35000.0,6000


In [88]:
X.join(X_cargo)

Unnamed: 0,idade,salario,bonus,cargo_Analista,cargo_Diretor,cargo_Fundador,cargo_Gerente,cargo_Programador
0,45,24000.0,10000,0,1,0,0,0
1,22,8000.0,2000,1,0,0,0,0
2,30,16795.3,1000,0,0,0,0,1
3,24,15100.0,7200,0,0,0,1,0
4,30,35000.0,6000,0,0,0,1,0
5,22,5300.0,2000,0,0,0,0,1
6,20,16795.3,1200,1,0,0,0,0
7,50,18000.0,8000,0,1,0,0,0
8,65,38000.0,28000,0,0,1,0,0
9,32,7300.0,4000,1,0,0,0,0


Y

In [89]:
Y

array(['sim', 'não', 'não', 'não', 'sim', 'não', 'não', 'sim', 'sim',
       'não', 'não', 'não', 'sim', 'sim'], dtype=object)

In [91]:
Y = pd.get_dummies(Y)

In [92]:
Y

Unnamed: 0,não,sim
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
5,1,0
6,1,0
7,0,1
8,0,1
9,1,0


In [93]:
Y = Y['sim'].values

In [94]:
Y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=uint8)

In [68]:
Y.values

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=uint8)

In [64]:
Y.head()

0    1
1    0
2    0
3    0
4    1
Name: socio_sim, dtype: uint8

In [167]:
X['cargo'] = X_cargo['cargo']

KeyError: 'cargo'

In [None]:
X

In [132]:
#labelencoder_X = LabelEncoder() #labelEnconder will transform categorical values in numbers
#X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [27]:
X

array([['Diretor', 45, 45.0, 24000.0],
       ['Analista', 22, 22.0, 8000.0],
       ['Programador', 30, 30.0, 7200.0],
       ['Gerente', 24, 24.0, 15100.0],
       ['Gerente', 30, 30.0, 35000.0],
       ['Programador', 22, 22.0, 5300.0],
       ['Analista', 20, 20.0, 7200.0],
       ['Diretor', 50, 50.0, 18000.0],
       ['Fundador', 65, 65.0, 38000.0],
       ['Analista', 32, 32.0, 7300.0],
       ['Programador', 35, 35.0, 2344.0],
       ['Programador', 28, 28.0, 4500.0],
       ['Fundador', 28, 28.0, 30000.0],
       ['Programador', 30, 30.0, 14000.0]], dtype=object)

In [13]:
# But with this thing above we get a problem, has you can see cat=2 and dog= 0, so we have a hierarchical value, but cat is not necessarily greater than dog

In [14]:
#So we need to create dummy variable, where cat, moose and dogs can be transofrm in columns and we can fill this columns with 0's and 1's

In [30]:
#onehotencoder = OneHotEncoder(categorical_features = [0]) #this wiil transform this values in columns

In [31]:
#X = onehotencoder.fit_transform(X).toarray()

In [32]:
#X

In [33]:
#labelencoder_y = LabelEncoder()
#y = labelencoder_y.fit_transform(y) #lets go transform yes and no now

NameError: name 'labelencoder_y' is not defined

In [34]:
#y

In [27]:
#Now lets split pur data in train and test

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [29]:
# Now for feature scaling

In [30]:
# As you can se we have a variable age with values from 4 to 17

In [31]:
# And woth with values from 48.000 to 83.000

In [32]:
# Once upon a time worth is much bigger then age thats mean that Euclidean distance will be docimnated by worth and will wind up up dominating the age

In [35]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform([y_train])