In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer #Salah satu penanganan missing values
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler #Scaling Data

In [2]:
# Load the semicolon-delimited population dataset and display it.
df = pd.read_csv('data_penduduk.csv', delimiter=';')
df

Unnamed: 0,Province,Age,Wage,Life Insured
0,Banten,24.0,5000000.0,Yes
1,DKI Jakarta,,3400000.0,No
2,Jawa Barat,60.0,7350000.0,No
3,Banten,34.0,3500000.0,No
4,Jawa Barat,58.0,,Yes
5,DKI Jakarta,,8000000.0,No
6,Banten,21.0,5500000.0,No
7,Banten,44.0,10000000.0,Yes
8,Jawa Barat,40.0,9000000.0,Yes
9,DKI Jakarta,51.0,10500000.0,Yes


In [3]:
# Summarize dtypes and non-null counts per column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Province      15 non-null     object 
 1   Age           13 non-null     float64
 2   Wage          12 non-null     float64
 3   Life Insured  15 non-null     object 
dtypes: float64(2), object(2)
memory usage: 608.0+ bytes


# Get independent (X) and dependent (y) columns

In [4]:
# Features: every column except the last one; target: the last column.
# .copy() makes X an independent DataFrame, so the in-place imputation
# performed on X later does not raise SettingWithCopyWarning and can
# never write through to df.
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1]

Contoh Slices

[:, :-1]: pilih semua baris dalam dataset, serta semua kolom kecuali kolom terakhir (ingat negative indexing pada Python).

[:, 0:3]: idem, tanpa negative indexing (ada empat kolom pada dataset, kita memilih tiga kolom: indeks 0, 1, dan 2).

[:, -1]: pilih semua baris, kolom terakhir.

In [5]:
# Inspect the feature frame (Province, Age, Wage) before imputation.
X

Unnamed: 0,Province,Age,Wage
0,Banten,24.0,5000000.0
1,DKI Jakarta,,3400000.0
2,Jawa Barat,60.0,7350000.0
3,Banten,34.0,3500000.0
4,Jawa Barat,58.0,
5,DKI Jakarta,,8000000.0
6,Banten,21.0,5500000.0
7,Banten,44.0,10000000.0
8,Jawa Barat,40.0,9000000.0
9,DKI Jakarta,51.0,10500000.0


# Fix Missing Values

In [6]:
# Replace NaN entries in the numeric columns (positions 1 and 2 of X)
# with the mean of their column.
# NOTE(review): the means are learned from every row, i.e. before the
# train/test split further down — confirm this is acceptable for the use case.
missing_fix = SimpleImputer(missing_values=np.nan, strategy='mean')
X.iloc[:, 1:3] = missing_fix.fit_transform(X.iloc[:, 1:3])

In [7]:
# Display X again: the previously missing Age/Wage entries now hold the column means.
X

Unnamed: 0,Province,Age,Wage
0,Banten,24.0,5000000.0
1,DKI Jakarta,36.0,3400000.0
2,Jawa Barat,60.0,7350000.0
3,Banten,34.0,3500000.0
4,Jawa Barat,58.0,6279167.0
5,DKI Jakarta,36.0,8000000.0
6,Banten,21.0,5500000.0
7,Banten,44.0,10000000.0
8,Jawa Barat,40.0,9000000.0
9,DKI Jakarta,51.0,10500000.0


# OneHotEncoder & LabelEncoder

In [8]:
# One-hot encode column 0 (Province); remainder='passthrough' keeps the
# other columns unchanged and places them after the indicator columns.
encode_province = ColumnTransformer(
    transformers=[('province_name', OneHotEncoder(), [0])],
    remainder='passthrough',
)

# From here on X is a float array rather than a DataFrame.
X = encode_province.fit_transform(X).astype(float)

In [9]:
# First output column: the one-hot indicator for the first province category.
X[:, 0]

array([1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0.])

In [10]:
# Show the categories learned by the fitted encoder; the leading one-hot
# columns of X follow this order.
print(encode_province.named_transformers_['province_name'].categories_)

[array(['Banten', 'DKI Jakarta', 'Jawa Barat'], dtype=object)]


In [11]:
# Encode the target labels as integers (in the output, 'No' -> 0 and 'Yes' -> 1).
label_encoder_target = LabelEncoder()
label_encoder_target.fit(y)
y = label_encoder_target.transform(y)
y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1])

In [12]:
# Encoded target vector (same values the previous cell already displayed).
y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1])

# Split Data

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [14]:
# Same seeded 80/20 split as the previous cell. The held-out labels are bound
# to y_test so that they pair with X_test; re-running this cell is idempotent
# because random_state is fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Scaling

In [15]:
# Learn the mean and standard deviation from the training rows only, then
# apply that same transformation to both splits.
# NOTE(review): every column is standardized, including the one-hot province
# indicators — confirm that is intended.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [16]:
# Standardized test features (scaled with statistics fitted on X_train).
X_test

array([[ 1.73205081, -0.84515425, -0.70710678, -0.21649237, -1.24652737],
       [ 1.73205081, -0.84515425, -0.70710678,  0.57075262,  1.68316407],
       [ 1.73205081, -0.84515425, -0.70710678, -1.23991087, -0.34508385]])

In [17]:
# Standardized training features.
X_train

array([[-0.57735027, -0.84515425,  1.41421356,  1.83034461,  0.48875141],
       [ 1.73205081, -0.84515425, -0.70710678, -0.37394137,  0.00610352],
       [-0.57735027, -0.84515425,  1.41421356,  1.67289561,  0.00610352],
       [-0.57735027,  1.18321596, -0.70710678, -0.05904337, -1.29159954],
       [-0.57735027,  1.18321596, -0.70710678, -1.39735986, -1.83246566],
       [ 1.73205081, -0.84515425, -0.70710678, -1.00373737, -0.57044473],
       [-0.57735027,  1.18321596, -0.70710678, -0.92501287, -0.79580561],
       [-0.57735027,  1.18321596, -0.70710678,  1.12182412,  1.90852495],
       [-0.57735027, -0.84515425,  1.41421356,  0.25585462,  1.23244231],
       [-0.57735027, -0.84515425,  1.41421356, -0.53139037,  0.00610352],
       [ 1.73205081, -0.84515425, -0.70710678, -0.53139037,  0.06056574],
       [-0.57735027,  1.18321596, -0.70710678, -0.05904337,  0.78172055]])