In [100]:
# Source: https://codesource.io/data-preprocessing-for-machine-learning/
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [101]:
# Load the sample dataset from a CSV file located in the working directory
# and print the whole frame for a first look at the raw rows.
dataset = pd.read_csv('sample_data.csv')
print(dataset)

    Country   Age   Salary Purchased
0    France  44.0  72000.0        No
1     Spain  27.0  48000.0       Yes
2   Germany  30.0  54000.0        No
3     Spain  38.0  61000.0        No
4   Nigeria  18.0  15000.0        No
5   Germany  40.0      NaN       Yes
6    France  35.0  58000.0       Yes
7     Spain   NaN  52000.0        No
8    France  48.0  79000.0       Yes
9   Germany  50.0  83000.0        No
10   France  37.0  67000.0       Yes
11  Nigeria  50.0  60000.0       Yes
12   France  22.0  30000.0        No
13      NaN  44.0  45000.0       Yes
14   France  47.0  78000.0       NaN
15  Nigeria  35.0  43000.0       Yes
16    Spain  34.0  44000.0       Yes
17    Spain  27.0  48000.0       Yes
18    Spain  33.0  48000.0       Yes
19  Nigeria  29.0  77000.0       Yes
20    Spain   NaN  57000.0       Yes
21   France  44.0  48000.0       Yes
22  Germany  50.0  83000.0        No
23   France  37.0  67000.0       Yes
24   France  37.0  23000.0       Yes
25  Germany  45.0  50000.0        No
2

In [102]:
# Preview the first rows of the dataset
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Nigeria,18.0,15000.0,No


In [103]:
# Descriptive statistics (count, mean, std, quartiles) of the numeric columns
dataset.describe()

Unnamed: 0,Age,Salary
count,27.0,28.0
mean,36.925926,53642.857143
std,8.757089,19216.532785
min,18.0,15000.0
25%,30.0,44750.0
50%,37.0,53000.0
75%,44.0,67000.0
max,50.0,83000.0


In [104]:
# Count how often each Country value occurs; missing values are tallied under nan
from collections import Counter
Counter(dataset["Country"])

Counter({'France': 10, 'Germany': 5, 'Nigeria': 6, 'Spain': 7, nan: 1})

In [105]:
# Count how often each Purchased value occurs; missing values are tallied under nan
Counter(dataset["Purchased"])

Counter({'No': 10, 'Yes': 18, nan: 1})

In [106]:
# Remove exact duplicate rows, then renumber the index from 0 so that the
# row labels are contiguous again (the old index is discarded, not kept as a column).
dataset = dataset.drop_duplicates().reset_index(drop=True)

In [107]:
# Re-check the descriptive statistics now that duplicate rows have been removed
dataset.describe()

Unnamed: 0,Age,Salary
count,23.0,24.0
mean,36.782609,51541.666667
std,8.852101,19352.517344
min,18.0,15000.0
25%,30.0,43750.0
50%,37.0,51000.0
75%,44.0,62500.0
max,50.0,83000.0


In [108]:
# Check for missing values: boolean frame with the same shape as the dataset,
# where True marks a missing entry
dataset.isnull()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,True,False
6,False,False,False,False
7,False,True,False,False
8,False,False,False,False
9,False,False,False,False


In [109]:
# Show the rows that contain at least one missing value
# (any(axis = 1) reduces across the columns of each row)
dataset[dataset.isnull().any(axis = 1)]

Unnamed: 0,Country,Age,Salary,Purchased
5,Germany,40.0,,Yes
7,Spain,,52000.0,No
13,,44.0,45000.0,Yes
14,France,47.0,78000.0,
19,Spain,,57000.0,Yes


In [110]:
# Count the number of missing values in each column
dataset.isnull().sum()

Country      1
Age          2
Salary       1
Purchased    1
dtype: int64

In [111]:
# Preview the dataset with every row that has any missing value removed.
# The result is only displayed, not assigned, so `dataset` itself is unchanged.
dataset.dropna(how='any')

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Nigeria,18.0,15000.0,No
6,France,35.0,58000.0,Yes
8,France,48.0,79000.0,Yes
9,Germany,50.0,83000.0,No
10,France,37.0,67000.0,Yes
11,Nigeria,50.0,60000.0,Yes


In [112]:
# Preview the dataset with rows removed only when the Purchased column is missing.
# The result is only displayed, not assigned, so `dataset` itself is unchanged.
dataset.dropna(how='any', subset = ["Purchased"])

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Nigeria,18.0,15000.0,No
5,Germany,40.0,,Yes
6,France,35.0,58000.0,Yes
7,Spain,,52000.0,No
8,France,48.0,79000.0,Yes
9,Germany,50.0,83000.0,No


In [113]:
# Mean imputation on a numerical feature: compute the mean of Age once,
# show it, then preview the Age column with missing entries replaced by it.
# The filled column is only printed; `dataset` is not modified here.
age_mean = dataset["Age"].mean()
print(age_mean)
print(dataset["Age"].fillna(age_mean))

36.78260869565217
0     44.000000
1     27.000000
2     30.000000
3     38.000000
4     18.000000
5     40.000000
6     35.000000
7     36.782609
8     48.000000
9     50.000000
10    37.000000
11    50.000000
12    22.000000
13    44.000000
14    47.000000
15    35.000000
16    34.000000
17    33.000000
18    29.000000
19    36.782609
20    44.000000
21    37.000000
22    45.000000
23    30.000000
24    29.000000
Name: Age, dtype: float64


In [114]:
# Median imputation on a numerical feature: compute the median of Salary once,
# show it, then preview the Salary column with missing entries replaced by it.
# The filled column is only printed; `dataset` is not modified here.
salary_median = dataset["Salary"].median()
print(salary_median)
print(dataset["Salary"].fillna(salary_median))

51000.0
0     72000.0
1     48000.0
2     54000.0
3     61000.0
4     15000.0
5     51000.0
6     58000.0
7     52000.0
8     79000.0
9     83000.0
10    67000.0
11    60000.0
12    30000.0
13    45000.0
14    78000.0
15    43000.0
16    44000.0
17    48000.0
18    77000.0
19    57000.0
20    48000.0
21    23000.0
22    50000.0
23    30000.0
24    15000.0
Name: Salary, dtype: float64


In [115]:
# Mode imputation on a categorical feature: mode() returns a Series of the most
# frequent value(s), so its first entry is used as the fill value for Country.
# The filled column is only printed; `dataset` is not modified here.
country_modes = dataset["Country"].mode()
print(country_modes)
print(dataset["Country"].fillna(country_modes[0]))

0    France
dtype: object
0      France
1       Spain
2     Germany
3       Spain
4     Nigeria
5     Germany
6      France
7       Spain
8      France
9     Germany
10     France
11    Nigeria
12     France
13     France
14     France
15    Nigeria
16      Spain
17      Spain
18    Nigeria
19      Spain
20     France
21     France
22    Germany
23    Nigeria
24    Nigeria
Name: Country, dtype: object


In [116]:
# Fill missing values in the numerical columns with each column's mean.
# numeric_only=True restricts the mean to the numeric columns; without it,
# pandas >= 2.0 raises a TypeError because Country and Purchased hold strings.
dataset = dataset.fillna(dataset.mean(numeric_only=True))
# Fill the categorical Country column with its most frequent value (mode).
dataset["Country"] = dataset["Country"].fillna(dataset["Country"].mode()[0])
# Drop the rows whose target (Purchased) is missing. Reassigning instead of
# using inplace=True keeps the step explicit and consistent with the lines above.
dataset = dataset.dropna(subset=["Purchased"])

In [117]:
# Split the dataset into independent variables (X: Country, Age, Salary)
# and the dependent variable (y: Purchased), both as NumPy arrays.
X = dataset[['Country', 'Age', 'Salary']].values
y = dataset['Purchased'].values
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Nigeria' 18.0 15000.0]
 ['Germany' 40.0 51541.666666666664]
 ['France' 35.0 58000.0]
 ['Spain' 36.78260869565217 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]
 ['Nigeria' 50.0 60000.0]
 ['France' 22.0 30000.0]
 ['France' 44.0 45000.0]
 ['Nigeria' 35.0 43000.0]
 ['Spain' 34.0 44000.0]
 ['Spain' 33.0 48000.0]
 ['Nigeria' 29.0 77000.0]
 ['Spain' 36.78260869565217 57000.0]
 ['France' 44.0 48000.0]
 ['France' 37.0 23000.0]
 ['Germany' 45.0 50000.0]
 ['Nigeria' 30.0 30000.0]
 ['Nigeria' 29.0 15000.0]]


In [118]:
# Show the target labels (still the original 'Yes'/'No' strings)
print(y)

['No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No']


In [119]:
# Mean-impute the numeric feature columns of X (index 1 = Age, index 2 = Salary)
# with scikit-learn's SimpleImputer, fitting and applying it in a single step.
# NOTE(review): Age and Salary were already filled on the DataFrame in an earlier
# cell, so no NaN should remain in X here; this mainly demonstrates the API.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [120]:
# Show the Age and Salary columns of X after imputation
print(X[:, 1:3])

[[44.0 72000.0]
 [27.0 48000.0]
 [30.0 54000.0]
 [38.0 61000.0]
 [18.0 15000.0]
 [40.0 51541.666666666664]
 [35.0 58000.0]
 [36.78260869565217 52000.0]
 [48.0 79000.0]
 [50.0 83000.0]
 [37.0 67000.0]
 [50.0 60000.0]
 [22.0 30000.0]
 [44.0 45000.0]
 [35.0 43000.0]
 [34.0 44000.0]
 [33.0 48000.0]
 [29.0 77000.0]
 [36.78260869565217 57000.0]
 [44.0 48000.0]
 [37.0 23000.0]
 [45.0 50000.0]
 [30.0 30000.0]
 [29.0 15000.0]]


In [121]:
# Handling Categorical Data
# One-hot encode column 0 of X (Country) and pass the remaining columns
# (Age, Salary) through unchanged. X is overwritten with the encoded matrix,
# so this cell must not be run twice on the same X.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [122]:
# Show the feature matrix after one-hot encoding
# (Country indicator columns first, then Age and Salary)
print(X)

[[1.0 0.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 30.0 54000.0]
 [0.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 0.0 18.0 15000.0]
 [0.0 1.0 0.0 0.0 40.0 51541.666666666664]
 [1.0 0.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 0.0 1.0 36.78260869565217 52000.0]
 [1.0 0.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 0.0 50.0 60000.0]
 [1.0 0.0 0.0 0.0 22.0 30000.0]
 [1.0 0.0 0.0 0.0 44.0 45000.0]
 [0.0 0.0 1.0 0.0 35.0 43000.0]
 [0.0 0.0 0.0 1.0 34.0 44000.0]
 [0.0 0.0 0.0 1.0 33.0 48000.0]
 [0.0 0.0 1.0 0.0 29.0 77000.0]
 [0.0 0.0 0.0 1.0 36.78260869565217 57000.0]
 [1.0 0.0 0.0 0.0 44.0 48000.0]
 [1.0 0.0 0.0 0.0 37.0 23000.0]
 [0.0 1.0 0.0 0.0 45.0 50000.0]
 [0.0 0.0 1.0 0.0 30.0 30000.0]
 [0.0 0.0 1.0 0.0 29.0 15000.0]]


In [123]:
# Dummies: one-hot encoding done with pandas instead of scikit-learn.
# The target column is dropped first; the result is only displayed, not stored.
pd.get_dummies(dataset.drop(columns = ["Purchased"]))

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Nigeria,Country_Spain
0,44.0,72000.0,1,0,0,0
1,27.0,48000.0,0,0,0,1
2,30.0,54000.0,0,1,0,0
3,38.0,61000.0,0,0,0,1
4,18.0,15000.0,0,0,1,0
5,40.0,51541.666667,0,1,0,0
6,35.0,58000.0,1,0,0,0
7,36.782609,52000.0,0,0,0,1
8,48.0,79000.0,1,0,0,0
9,50.0,83000.0,0,1,0,0


In [124]:
# Show the target labels before they are encoded
print(y)

['No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No']


In [125]:
# Encoding the target variable: map the string labels to integers
# (in this dataset 'No' becomes 0 and 'Yes' becomes 1). y is overwritten.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [126]:
# Show the integer-encoded target labels
print(y)

[0 1 0 0 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0]


In [127]:
# Splitting Dataset into Training and Test Set
# test_size=0.2 holds out 20% of the rows for testing; the fixed random_state
# makes the shuffle, and therefore the split, reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [128]:
# Show the training features
print(X_train)

[[0.0 0.0 0.0 1.0 33.0 48000.0]
 [0.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 0.0 22.0 30000.0]
 [0.0 1.0 0.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 0.0 18.0 15000.0]
 [1.0 0.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 0.0 40.0 51541.666666666664]
 [0.0 0.0 1.0 0.0 29.0 77000.0]
 [0.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 30.0 54000.0]
 [0.0 0.0 0.0 1.0 36.78260869565217 52000.0]
 [1.0 0.0 0.0 0.0 37.0 23000.0]
 [1.0 0.0 0.0 0.0 44.0 48000.0]
 [0.0 0.0 0.0 1.0 36.78260869565217 57000.0]
 [0.0 0.0 1.0 0.0 50.0 60000.0]
 [0.0 0.0 1.0 0.0 29.0 15000.0]
 [1.0 0.0 0.0 0.0 44.0 45000.0]
 [0.0 0.0 0.0 1.0 34.0 44000.0]
 [1.0 0.0 0.0 0.0 48.0 79000.0]]


In [129]:
# Show the test features
print(X_test)

[[0.0 1.0 0.0 0.0 45.0 50000.0]
 [1.0 0.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 0.0 30.0 30000.0]
 [1.0 0.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 0.0 35.0 43000.0]]


In [130]:
# Show the training labels
print(y_train)

[1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1]


In [131]:
# Show the test labels
print(y_test)

[0 0 1 1 1]


In [132]:
# Feature Scaling
# Create both scalers, then preview standardization (zero mean, unit variance
# per column) of the training features.
# NOTE(review): the scaled array is only displayed, not assigned, so X_train
# stays unscaled; every column is scaled, including the one-hot Country
# indicators. X_test is not transformed in this cell.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
sc = StandardScaler()
mm = MinMaxScaler()
sc.fit_transform(X_train)

array([[-0.67936622, -0.4330127 , -0.51639778,  1.47196014, -0.34254227,
        -0.12587571],
       [-0.67936622, -0.4330127 , -0.51639778,  1.47196014,  0.23274971,
         0.55682441],
       [ 1.47196014, -0.4330127 , -0.51639778, -0.67936622, -1.60818464,
        -1.07115281],
       [-0.67936622,  2.30940108, -0.51639778, -0.67936622,  1.61345047,
         1.71216308],
       [-0.67936622, -0.4330127 ,  1.93649167, -0.67936622, -2.06841822,
        -1.85888372],
       [ 1.47196014, -0.4330127 , -0.51639778, -0.67936622,  0.11769131,
         0.87191677],
       [-0.67936622,  2.30940108, -0.51639778, -0.67936622,  0.4628665 ,
         0.06011631],
       [-0.67936622, -0.4330127 ,  1.93649167, -0.67936622, -0.80277586,
         1.39707071],
       [-0.67936622, -0.4330127 , -0.51639778,  1.47196014, -1.03289265,
        -0.12587571],
       [-0.67936622,  2.30940108, -0.51639778, -0.67936622, -0.68771746,
         0.18921665],
       [-0.67936622, -0.4330127 , -0.51639778,  1.

In [133]:
# Preview min-max scaling of the training features (each column rescaled to [0, 1]).
# The result is only displayed, not assigned, so X_train stays unscaled.
mm.fit_transform(X_train)

array([[0.        , 0.        , 0.        , 1.        , 0.46875   ,
        0.48529412],
       [0.        , 0.        , 0.        , 1.        , 0.625     ,
        0.67647059],
       [1.        , 0.        , 0.        , 0.        , 0.125     ,
        0.22058824],
       [0.        , 1.        , 0.        , 0.        , 1.        ,
        1.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.59375   ,
        0.76470588],
       [0.        , 1.        , 0.        , 0.        , 0.6875    ,
        0.53737745],
       [0.        , 0.        , 1.        , 0.        , 0.34375   ,
        0.91176471],
       [0.        , 0.        , 0.        , 1.        , 0.28125   ,
        0.48529412],
       [0.        , 1.        , 0.        , 0.        , 0.375     ,
        0.57352941],
       [0.        , 0.        , 0.        , 1.        , 0.58695652,
        0.54411765],
       [1.        , 0