Library numpy (mengelola komputasi matriks), matplotlib (presentasi data menggunakan grafik atau plot), dan pandas (mengambil atau mengimpor data dari luar) digunakan untuk preprocessing data.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import dataset 

- print(x) mencetak nilai atribut, print(y) mencetak nilai kelas.
- nan = nilai yang hilang (missing value)
- [:, :-1] pilih semua baris dan kolom di dataset, kecuali kolom terakhir (negative indexing)
- [:, 0:3] tanpa negative indexing (ada 4 kolom di dataset, kita memilih 3 kolom pertama: 0, 1, 2)
- [:, -1] pilih semua baris, kolom terakhir 


In [2]:
# Read the raw CSV, then separate it into the feature matrix (every column
# except the last) and the target vector (the last column), both as arrays.
dataset = pd.read_csv('Data.csv')
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
x = feature_frame.values
y = target_series.values

In [3]:
# Show the feature matrix as loaded; missing entries are displayed as nan.
print(x)

[['Bali' 44.0 72000.0]
 ['Bali' 27.0 48000.0]
 ['Jawa' 30.0 54000.0]
 ['Lombok' 38.0 61000.0]
 ['Lombok' 40.0 nan]
 ['Jawa' 35.0 58000.0]
 ['Bali' nan 52000.0]
 ['Lombok' 48.0 79000.0]
 ['Bali' 50.0 83000.0]
 ['Bali' 37.0 67000.0]
 ['Jawa' 38.0 nan]
 ['Jawa' nan 78000.0]
 ['Bali' 41.0 nan]
 ['Lombok' 34.0 70000.0]]


In [4]:
# Show the class labels as loaded (still strings at this point).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes'
 'No']


# Menghilangkan Missing Value

- library sklearn
- class SimpleImputer() : mengganti nilai yang kosong dengan mean kolom.
  - missing_values : penanda nilai yang dianggap kosong (di sini NaN -> np.nan)
  - strategy : 'mean' (rata-rata kolom); bisa juga 'median', 'most_frequent', atau 'constant'.
- objek harus di-fit pada kolom yang bersangkutan dengan metode fit()





In [5]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 hold the numeric attributes; every NaN found there is
# replaced by the mean of its own column, learned and applied in one call.
numeric_cols = slice(1, 3)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, numeric_cols] = imputer.fit_transform(x[:, numeric_cols])

In [6]:
# Show the feature matrix after imputation to confirm the nan entries were filled.
print(x)

[['Bali' 44.0 72000.0]
 ['Bali' 27.0 48000.0]
 ['Jawa' 30.0 54000.0]
 ['Lombok' 38.0 61000.0]
 ['Lombok' 40.0 65636.36363636363]
 ['Jawa' 35.0 58000.0]
 ['Bali' 38.5 52000.0]
 ['Lombok' 48.0 79000.0]
 ['Bali' 50.0 83000.0]
 ['Bali' 37.0 67000.0]
 ['Jawa' 38.0 65636.36363636363]
 ['Jawa' 38.5 78000.0]
 ['Bali' 41.0 65636.36363636363]
 ['Lombok' 34.0 70000.0]]


# Encoding data kategori (Atribut)

- pada matrix X sebelumnya, kolom Country bertipe string sehingga perlu diubah ke numerik (int / float)
- menggunakan variabel dummy melalui OneHotEncoder dan ColumnTransformer (mengubah kolom bernilai nominal menjadi beberapa kolom biner 0/1)


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column 0 and pass the remaining columns
# through unchanged.
#
# sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse
# matrix, and ColumnTransformer keeps the stacked output sparse whenever its
# overall density drops below the threshold (0.3 by default), e.g. when the
# categorical column has many distinct values. np.array() on a SciPy sparse
# matrix would wrap it in a 0-d object array instead of a 2-D array, which
# silently breaks the column slicing done in later cells.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))

In [8]:
# Show the feature matrix after one-hot encoding; the first three columns are the dummy variables.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 40.0 65636.36363636363]
 [0.0 1.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 38.5 52000.0]
 [0.0 0.0 1.0 48.0 79000.0]
 [1.0 0.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 38.0 65636.36363636363]
 [0.0 1.0 0.0 38.5 78000.0]
 [1.0 0.0 0.0 41.0 65636.36363636363]
 [0.0 0.0 1.0 34.0 70000.0]]


# Encoding data kategori (Class/ Label)

- matrix Y hanya akan diubah ke numerik (0, 1, dst.) dengan LabelEncoder

In [9]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct class labels first, then map every entry of y to its
# integer code.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [10]:
# Show the class labels after integer encoding.
print(y)

[0 1 0 0 1 1 0 1 0 1 0 1 1 0]


# Membagi dataset ke training set dan test set

- test_size : proporsi test set
- train_size : proporsi training set. Jika tidak diset, nilainya adalah komplemen dari test_size (pada kasus ini 0.8); hal yang sama berlaku sebaliknya.
- random_state : membuat hasil splitting tetap sama antar runtime / antar mesin. nilai bebas

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical from run to run.
split_options = {'test_size': 0.2, 'random_state': 1}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [12]:
# Show the training portion of the feature matrix.
print(x_train)

[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 38.0 65636.36363636363]
 [0.0 0.0 1.0 40.0 65636.36363636363]
 [1.0 0.0 0.0 27.0 48000.0]
 [1.0 0.0 0.0 41.0 65636.36363636363]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 34.0 70000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [1.0 0.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 38.5 78000.0]
 [0.0 1.0 0.0 35.0 58000.0]]


In [13]:
# Show the test portion of the feature matrix.
print(x_test)

[[0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 48.0 79000.0]
 [1.0 0.0 0.0 38.5 52000.0]]


In [14]:
# Show the encoded labels that belong to the training rows.
print(y_train)

[0 0 1 1 1 0 0 1 0 1 1]


In [15]:
# Show the encoded labels that belong to the test rows.
print(y_test)

[0 1 0]


# Feature Scaling

- perlu dilakukan skala kolom-kolom yang dibutuhkan. Perbedaan skala dapat menyebabkan kendala dengan estimator.
- 3 scaler di library scikit-learn yang sering digunakan : 
  - StandardScaler : menghilangkan mean (terpusat di 0) dan menskalakan ke variansi satuan (deviasi standar = 1), dengan asumsi data terdistribusi normal (Gauss) untuk semua fitur.
  - MinMaxScaler :menskalakan nilai data ke dalam suatu *range*. tidak masalah data non-gaussian.
  - RobustScaler : menghilangkan median dan menskalakan data dengan rentang interkuartil (IQR). Scaler ini tahan terhadap outlier.


In [16]:
from sklearn.preprocessing import StandardScaler

# Standardise only the numeric columns (index 3 onward); the one-hot columns
# 0-2 are left as they are. The scaler statistics are learned from the
# training rows alone and then reused for the test rows.
sc = StandardScaler()
sc.fit(x_train[:, 3:])
for split in (x_train, x_test):
    split[:, 3:] = sc.transform(split[:, 3:])

In [17]:
# Show the training features after standardisation of the numeric columns.
print(x_train)

[[0.0 1.0 0.0 -1.268920399678151 -1.2557866877410433]
 [0.0 1.0 0.0 0.052558833122763986 -0.04638336603147457]
 [0.0 0.0 1.0 0.38292864132299276 -0.04638336603147457]
 [1.0 0.0 0.0 -1.7644751119784943 -1.87938527549754]
 [1.0 0.0 0.0 0.5481135454231071 -0.04638336603147457]
 [1.0 0.0 0.0 1.0436682577234502 0.6150090755284465]
 [0.0 0.0 1.0 -0.6081807832776935 0.40714287960961426]
 [1.0 0.0 0.0 -0.11262607097735039 0.09534358573136598]
 [1.0 0.0 0.0 2.0347776823241364 1.7582731530820235]
 [0.0 1.0 0.0 0.13515128517282118 1.238607663284943]
 [0.0 1.0 0.0 -0.44299587917757915 -0.8400542959033789]]


In [18]:
# Show the test features after applying the scaler fitted on the training set.
print(x_test)

[[0.0 0.0 1.0 0.052558833122763986 -0.5282550020251306]
 [0.0 0.0 1.0 1.7044078741239077 1.3425407612443592]
 [1.0 0.0 0.0 0.13515128517282118 -1.4636528836598754]]
