## Import Library

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder


## Import Dataset

In [7]:
dataset = pd.read_csv('pelanggan.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print (x)
print (y)

[['C001' 'Alice' 'Female' 28.0 'New York' 85000.0]
 ['C002' 'Bob' 'Male' nan 'Los Angeles' 62000.0]
 ['C003' 'Charlie' 'Male' 45.0 'Chicago' 93000.0]
 ['C004' 'David' 'Male' 23.0 'San Francisco' nan]
 ['C005' 'Eva' 'Female' 38.0 'Chicago' 76000.0]
 ['C006' 'Frank' 'Male' 29.0 'New York' 92000.0]
 ['C007' 'Grace' 'Female' nan 'Los Angeles' 65000.0]
 ['C008' 'Hannah' 'Female' 33.0 'Chicago' 71000.0]
 ['C009' 'Ivan' 'Male' 55.0 'New York' nan]
 ['C010' 'Jane' 'Female' 27.0 'San Francisco' 78000.0]]
['Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Missing Values
menghapus missing value & mengisi data yang hilang

In [11]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 3:4])
x[:, 3:4] = imputer.transform(x[:, 3:4])
print(x)

[['C001' 'Alice' 'Female' 28.0 'New York' 85000.0]
 ['C002' 'Bob' 'Male' 34.75 'Los Angeles' 62000.0]
 ['C003' 'Charlie' 'Male' 45.0 'Chicago' 93000.0]
 ['C004' 'David' 'Male' 23.0 'San Francisco' nan]
 ['C005' 'Eva' 'Female' 38.0 'Chicago' 76000.0]
 ['C006' 'Frank' 'Male' 29.0 'New York' 92000.0]
 ['C007' 'Grace' 'Female' 34.75 'Los Angeles' 65000.0]
 ['C008' 'Hannah' 'Female' 33.0 'Chicago' 71000.0]
 ['C009' 'Ivan' 'Male' 55.0 'New York' nan]
 ['C010' 'Jane' 'Female' 27.0 'San Francisco' 78000.0]]


## Encoding Data

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)

[[1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 'Alice' 'Female' 28.0
  'New York' 85000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 'Bob' 'Male' 34.75
  'Los Angeles' 62000.0]
 [0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 'Charlie' 'Male' 45.0 'Chicago'
  93000.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 'David' 'Male' 23.0
  'San Francisco' nan]
 [0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 'Eva' 'Female' 38.0 'Chicago'
  76000.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 'Frank' 'Male' 29.0 'New York'
  92000.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 'Grace' 'Female' 34.75
  'Los Angeles' 65000.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 'Hannah' 'Female' 33.0
  'Chicago' 71000.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 'Ivan' 'Male' 55.0 'New York'
  nan]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 'Jane' 'Female' 27.0
  'San Francisco' 78000.0]]


In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print (y)

[1 0 1 0 1 1 0 1 0 1]


In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state = 1)
print (x_train)
print (x_test)
print (y_train)
print (y_test)

[[0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 'Grace' 'Female' 34.75
  'Los Angeles' 65000.0]
 [0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 'Eva' 'Female' 38.0 'Chicago'
  76000.0]
 [1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 'Alice' 'Female' 28.0
  'New York' 85000.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 'David' 'Male' 23.0
  'San Francisco' nan]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 'Bob' 'Male' 34.75
  'Los Angeles' 62000.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 'Hannah' 'Female' 33.0
  'Chicago' 71000.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 'Ivan' 'Male' 55.0 'New York'
  nan]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 'Frank' 'Male' 29.0 'New York'
  92000.0]]
[[0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 'Charlie' 'Male' 45.0 'Chicago'
  93000.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 'Jane' 'Female' 27.0
  'San Francisco' 78000.0]]
[0 1 1 0 0 1 0 1]
[1 1]


## Feature Scalling

In [18]:
from sklearn.preprocessing import StandardScaler
x_train = np.array([['France', 44, 72000],
                    ['Spain', 27, 48000],
                    ['Germany', 30, 54000]])
x_test = np.array([['Spain', 38, 61000],
                   ['Germany', 40, 68000],
                   ['France', 35, 58000]])

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x_train = np.array(ct.fit_transform(x_train))
x_test = np.array(ct.transform(x_test))
sc = StandardScaler()
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

print(x_train)
print(x_test)


[['1.0' '0.0' '0.0' '1.3947559390641278' '1.3728129459672882']
 ['0.0' '0.0' '1.0' '-0.8998425413316948' '-0.9805806756909201']
 ['0.0' '1.0' '0.0' '-0.49491339773243204' '-0.39223227027636803']]
[['0.0' '0.0' '1.0' '0.5848976518656022' '0.294174202707276']
 ['0.0' '1.0' '0.0' '0.8548504142651108' '0.9805806756909201']
 ['1.0' '0.0' '0.0' '0.17996850826633937' '0.0']]
