In [100]:
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 18 07:37:58 2020

@author: hwicaksono
"""


import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler


In [101]:
# Import dataset.
# The CSV location defaults to the original author's local Windows path.
# Set the PREPROCESSING_CSV environment variable to point at the file when
# running the notebook on another machine; the default keeps prior behaviour.
DATASET_PATH = os.environ.get(
    "PREPROCESSING_CSV",
    r"D:\Jacobs\Lecture\PTM\Sources\2021\Exploratory Data Analysis and Preprocessing\datasets\Preprocessing.csv",
)
dataset = pd.read_csv(DATASET_PATH)
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,US,43.0,71000.0,No
1,UK,28.0,48000.0,Yes
2,Japan,30.0,54000.0,No
3,UK,38.0,60000.0,No
4,Japan,40.0,,Yes
5,US,34.0,58000.0,Yes
6,UK,,53000.0,No
7,US,46.0,79000.0,Yes
8,Japan,50.0,83000.0,No
9,US,37.0,66000.0,Yes


In [102]:
# Independent variables: the first three columns (Country, Age, Salary)
# taken positionally and converted to a NumPy matrix.
X = dataset.iloc[:, :3].values
X

array([['US', 43.0, 71000.0],
       ['UK', 28.0, 48000.0],
       ['Japan', 30.0, 54000.0],
       ['UK', 38.0, 60000.0],
       ['Japan', 40.0, nan],
       ['US', 34.0, 58000.0],
       ['UK', nan, 53000.0],
       ['US', 46.0, 79000.0],
       ['Japan', 50.0, 83000.0],
       ['US', 37.0, 66000.0]], dtype=object)

In [103]:
# Dependent variable: the last column (Purchased) as a 1-D vector.
target_position = -1
y = dataset.iloc[:, target_position].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [104]:
# Handling missing data: replace NaNs in the numeric columns of X
# (columns 1 and 2, i.e. Age and Salary) with the per-column median.
# SimpleImputer comes from the notebook's import cell, so it is not
# re-imported here.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# fit_transform learns the medians and fills the gaps in one call; the
# result is written back into the same slice so X is updated in place.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X

array([['US', 43.0, 71000.0],
       ['UK', 28.0, 48000.0],
       ['Japan', 30.0, 54000.0],
       ['UK', 38.0, 60000.0],
       ['Japan', 40.0, 60000.0],
       ['US', 34.0, 58000.0],
       ['UK', 38.0, 53000.0],
       ['US', 46.0, 79000.0],
       ['Japan', 50.0, 83000.0],
       ['US', 37.0, 66000.0]], dtype=object)

In [105]:
# Encoding categorical data using label encoder.
# Column 0 of X (Country) is replaced in place by integer codes; in the
# recorded output Japan, UK and US become 0, 1 and 2 respectively.
# The fitted encoder is kept in labelencoder_X.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X

array([[2, 43.0, 71000.0],
       [1, 28.0, 48000.0],
       [0, 30.0, 54000.0],
       [1, 38.0, 60000.0],
       [0, 40.0, 60000.0],
       [2, 34.0, 58000.0],
       [1, 38.0, 53000.0],
       [2, 46.0, 79000.0],
       [0, 50.0, 83000.0],
       [2, 37.0, 66000.0]], dtype=object)

In [106]:
# Encoding categorical data as one-hot indicator columns.
# NOTE(review): this uses LabelBinarizer, which is designed for target labels;
# per its documentation a two-class input yields a single column, so the
# "first five columns" slice used afterwards relies on Country having exactly
# three categories.
# NOTE: this cell rebinds `dataset` without its Country column, so running it
# a second time raises KeyError on dataset['Country']; re-run the loading cell
# first.
onehotencoder = LabelBinarizer()
transformed = onehotencoder.fit_transform(dataset['Country'])
# Carry over dataset's index: pd.concat(axis=1) aligns rows by index label, so
# a fresh 0..n-1 index would only line up when dataset has the default index.
ohe_df = pd.DataFrame(transformed, index=dataset.index)
dataset = pd.concat([ohe_df, dataset], axis=1).drop(['Country'], axis=1)
dataset

Unnamed: 0,0,1,2,Age,Salary,Purchased
0,0,0,1,43.0,71000.0,No
1,0,1,0,28.0,48000.0,Yes
2,1,0,0,30.0,54000.0,No
3,0,1,0,38.0,60000.0,No
4,1,0,0,40.0,,Yes
5,0,0,1,34.0,58000.0,Yes
6,0,1,0,,53000.0,No
7,0,0,1,46.0,79000.0,Yes
8,1,0,0,50.0,83000.0,No
9,0,0,1,37.0,66000.0,Yes


In [107]:
# Independent variables: the first five columns (the three one-hot Country
# indicators followed by Age and Salary) as a NumPy matrix.
X = dataset.iloc[:, :5].values
X

array([[0.0e+00, 0.0e+00, 1.0e+00, 4.3e+01, 7.1e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.8e+01, 4.8e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.8e+01, 6.0e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+01,     nan],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.4e+01, 5.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00,     nan, 5.3e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.6e+01, 7.9e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 5.0e+01, 8.3e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.7e+01, 6.6e+04]])

In [108]:
# Dependent variable: the last column (Purchased) as a 1-D vector,
# then show the feature matrix and the target one after the other.
y = dataset.iloc[:, -1].values
print(X, y, sep='\n')

[[0.0e+00 0.0e+00 1.0e+00 4.3e+01 7.1e+04]
 [0.0e+00 1.0e+00 0.0e+00 2.8e+01 4.8e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [0.0e+00 1.0e+00 0.0e+00 3.8e+01 6.0e+04]
 [1.0e+00 0.0e+00 0.0e+00 4.0e+01     nan]
 [0.0e+00 0.0e+00 1.0e+00 3.4e+01 5.8e+04]
 [0.0e+00 1.0e+00 0.0e+00     nan 5.3e+04]
 [0.0e+00 0.0e+00 1.0e+00 4.6e+01 7.9e+04]
 [1.0e+00 0.0e+00 0.0e+00 5.0e+01 8.3e+04]
 [0.0e+00 0.0e+00 1.0e+00 3.7e+01 6.6e+04]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [109]:
# Handling missing data: fill every NaN in X with the median of its column.
# NOTE(review): the medians are learned from all rows, i.e. before the
# train/test split done in a later cell, so the held-out rows contribute to
# the imputation statistics.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
# Write the imputed values back into the existing array (in-place update).
X[:] = imputer.fit_transform(X)
X

array([[0.0e+00, 0.0e+00, 1.0e+00, 4.3e+01, 7.1e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.8e+01, 4.8e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.8e+01, 6.0e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+01, 6.0e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.4e+01, 5.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.8e+01, 5.3e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.6e+01, 7.9e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 5.0e+01, 8.3e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.7e+01, 6.6e+04]])

In [110]:
# Label encoding for y.
# The Purchased labels are mapped to integers; in the recorded output
# 'No' becomes 0 and 'Yes' becomes 1. The fitted encoder is kept in
# labelencoder_y.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [111]:
# Splitting into training and test set: 20% of the rows are held out for
# testing, and random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(X_train, y_train, sep='\n')

[[1.0e+00 0.0e+00 0.0e+00 4.0e+01 6.0e+04]
 [0.0e+00 0.0e+00 1.0e+00 3.7e+01 6.6e+04]
 [0.0e+00 1.0e+00 0.0e+00 2.8e+01 4.8e+04]
 [0.0e+00 1.0e+00 0.0e+00 3.8e+01 5.3e+04]
 [0.0e+00 0.0e+00 1.0e+00 4.6e+01 7.9e+04]
 [0.0e+00 1.0e+00 0.0e+00 3.8e+01 6.0e+04]
 [0.0e+00 0.0e+00 1.0e+00 4.3e+01 7.1e+04]
 [0.0e+00 0.0e+00 1.0e+00 3.4e+01 5.8e+04]]
[1 1 1 0 1 0 0 1]


In [112]:
# Feature scaling: the same training matrix is rescaled with three different
# scalers so their outputs can be compared. Each scaler is fitted on X_train
# only; X_test is not transformed in this cell.

# One scaler of each kind.
mmsc_X = MinMaxScaler()      # MinMax Scaler
rs_X = RobustScaler()        # RobustScaler
sc_X = StandardScaler()      # StandardScaler

# Fit each scaler on the training data and keep the scaled copies.
X_train1 = mmsc_X.fit_transform(X_train)
X_train2 = rs_X.fit_transform(X_train)
X_train3 = sc_X.fit_transform(X_train)

# Print in the order MinMax, Robust, Standard.
print(X_train1, X_train2, X_train3, sep='\n')


[[1.         0.         0.         0.66666667 0.38709677]
 [0.         0.         1.         0.5        0.58064516]
 [0.         1.         0.         0.         0.        ]
 [0.         1.         0.         0.55555556 0.16129032]
 [0.         0.         1.         1.         1.        ]
 [0.         1.         0.         0.55555556 0.38709677]
 [0.         0.         1.         0.83333333 0.74193548]
 [0.         0.         1.         0.33333333 0.32258065]]
[[ 1.          0.         -0.5         0.44444444  0.        ]
 [ 0.          0.          0.5        -0.22222222  0.57142857]
 [ 0.          1.         -0.5        -2.22222222 -1.14285714]
 [ 0.          1.         -0.5         0.         -0.66666667]
 [ 0.          0.          0.5         1.77777778  1.80952381]
 [ 0.          1.         -0.5         0.          0.        ]
 [ 0.          0.          0.5         1.11111111  1.04761905]
 [ 0.          0.          0.5        -0.88888889 -0.19047619]]
[[ 2.64575131 -0.77459667 -1. 