# Training Data

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

In [24]:
data = pd.read_csv('train.csv')
data.head(10)

Unnamed: 0,customer_id,customer_bod,gender,phone_flag,student,employment,credit_card,balance,income,tenure,default
0,8300,1993-08-17,Female,1.0,No,Self Employed,1.0,87104.12,5015120.75,4yrs 4mon,0
1,672,2007-12-17,Female,1.0,Yes,,0.0,89236.34,2266076.58,4yrs 1mon,0
2,5670,2000-02-05,Female,1.0,Yes,,0.0,171553.12,1779347.34,0yrs 9mon,0
3,2975,1999-11-16,Female,1.0,Yes,,0.0,85979.04,2014246.24,1yrs 8mon,0
4,3883,1977-08-18,Male,1.0,No,Salaried,0.0,48874.77,5445148.31,0yrs 10mon,0
5,7825,2002-04-18,Male,1.0,Yes,,0.0,0.0,2390347.61,0yrs 0mon,0
6,6676,1995-08-17,,1.0,No,Salaried,1.0,38546.17,5190882.42,1yrs 1mon,0
7,7991,1984-04-16,Male,1.0,No,Salaried,0.0,0.0,7112035.3,0yrs 0mon,0
8,9412,1998-12-27,Female,1.0,No,Salaried,1.0,103848.03,6455858.31,2yrs 6mon,0
9,9464,1981-03-12,Female,1.0,No,Salaried,0.0,0.0,3444624.3,0yrs 0mon,0


### Preprocessing Training Data

In [25]:
# melihat jumlah kolom dan jumlah baris
data.shape

(3693, 11)

In [26]:
# melihat modus data employement
data['employment'].value_counts()

Salaried         2061
Self Employed     640
Name: employment, dtype: int64

In [27]:
# melihat persentase missing value pada kolom employement.
data['employment'].isnull().sum() * 100 / len(data['employment'])

26.86163011102085

Karena jumlah Missing Value pada kolom `employment` mencapai 26.8%, kita akan mengganti nilai missing value tersebut dengan nilai Modus (Salaried) dengan asumsi bahwa bank kebanyakan menerima customer yang Salaried.

In [28]:
data['employment'] = data['employment'].fillna('Salaried')

Kita akan menghapus kolom `customer_id`, `customer_bod`, dan `gender` karena dirasa merupakan variabel yang tidak berpengaruh kepada hasil prediksi

In [29]:
unnecessary = ['customer_id', 'customer_bod', 'gender']
data = data.drop(unnecessary, axis = 1)

Selanjutnya kita akan melakukan encoding untuk data yang berupa kategori seperti `phone_flag`, `student`, `employment`, dan `credit_card`.  

In [30]:
col_encode = ['phone_flag', 'student', 'employment', 'credit_card']
data = pd.get_dummies(data, columns = col_encode)

In [31]:
data

Unnamed: 0,balance,income,tenure,default,phone_flag_0.0,phone_flag_1.0,student_No,student_Yes,employment_Salaried,employment_Self Employed,credit_card_0.0,credit_card_1.0
0,87104.12,5015120.75,4yrs 4mon,0,0,1,1,0,0,1,0,1
1,89236.34,2266076.58,4yrs 1mon,0,0,1,0,1,1,0,1,0
2,171553.12,1779347.34,0yrs 9mon,0,0,1,0,1,1,0,1,0
3,85979.04,2014246.24,1yrs 8mon,0,0,1,0,1,1,0,1,0
4,48874.77,5445148.31,0yrs 10mon,0,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3688,144844.88,5060638.68,0yrs 9mon,0,1,0,1,0,1,0,1,0
3689,192978.31,2682965.60,1yrs 6mon,0,0,1,0,1,0,1,1,0
3690,141840.01,2172340.05,1yrs 0mon,0,0,1,0,1,1,0,1,0
3691,69477.43,5556566.10,4yrs 9mon,0,0,1,1,0,1,0,1,0


Selanjutnya kita akan mengubah nilai kolom `tenure`menjadi jumlah hari, dengan menggunakan standar bahwa 1 bulan adalah 30 hari dan 1 tahun adalah 360 hari.

In [32]:
data['tenure'] = data['tenure'].str.strip().str.replace('yrs ', '')

In [33]:
data['tenure'] = data['tenure'].str.strip().str.replace('mon', '')

In [34]:
tenure = data['tenure'].to_numpy()

tenure2 = []
for i in range(3693):
    hasil = int(tenure[i][0])*360 + int(tenure[i][1:])*30
    tenure2.append(hasil)
    
n = data.columns[2]
data.drop(n, axis = 1, inplace = True)
data[n] = tenure2

data.head(10)

Unnamed: 0,balance,income,default,phone_flag_0.0,phone_flag_1.0,student_No,student_Yes,employment_Salaried,employment_Self Employed,credit_card_0.0,credit_card_1.0,tenure
0,87104.12,5015120.75,0,0,1,1,0,0,1,0,1,1560
1,89236.34,2266076.58,0,0,1,0,1,1,0,1,0,1470
2,171553.12,1779347.34,0,0,1,0,1,1,0,1,0,270
3,85979.04,2014246.24,0,0,1,0,1,1,0,1,0,600
4,48874.77,5445148.31,0,0,1,1,0,1,0,1,0,300
5,0.0,2390347.61,0,0,1,0,1,1,0,1,0,0
6,38546.17,5190882.42,0,0,1,1,0,1,0,0,1,390
7,0.0,7112035.3,0,0,1,1,0,1,0,1,0,0
8,103848.03,6455858.31,0,0,1,1,0,1,0,0,1,900
9,0.0,3444624.3,0,0,1,1,0,1,0,1,0,0


Supaya lebih sederhana, kita akan melakukan normalisasi terhadap kolom `balance`, `income`, dan `tenure` supaya range nilai-nya berada antara 0 dan 1.

In [35]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [36]:
cols = data.columns.tolist()
cols

['balance',
 'income',
 'default',
 'phone_flag_0.0',
 'phone_flag_1.0',
 'student_No',
 'student_Yes',
 'employment_Salaried',
 'employment_Self Employed',
 'credit_card_0.0',
 'credit_card_1.0',
 'tenure']

In [37]:
cols[0], cols[2] = cols[2], cols[0]
cols

['default',
 'income',
 'balance',
 'phone_flag_0.0',
 'phone_flag_1.0',
 'student_No',
 'student_Yes',
 'employment_Salaried',
 'employment_Self Employed',
 'credit_card_0.0',
 'credit_card_1.0',
 'tenure']

In [38]:
data = data[cols]
data

Unnamed: 0,default,income,balance,phone_flag_0.0,phone_flag_1.0,student_No,student_Yes,employment_Salaried,employment_Self Employed,credit_card_0.0,credit_card_1.0,tenure
0,0,5015120.75,87104.12,0,1,1,0,0,1,0,1,1560
1,0,2266076.58,89236.34,0,1,0,1,1,0,1,0,1470
2,0,1779347.34,171553.12,0,1,0,1,1,0,1,0,270
3,0,2014246.24,85979.04,0,1,0,1,1,0,1,0,600
4,0,5445148.31,48874.77,0,1,1,0,1,0,1,0,300
...,...,...,...,...,...,...,...,...,...,...,...,...
3688,0,5060638.68,144844.88,1,0,1,0,1,0,1,0,270
3689,0,2682965.60,192978.31,0,1,0,1,0,1,1,0,540
3690,0,2172340.05,141840.01,0,1,0,1,1,0,1,0,360
3691,0,5556566.10,69477.43,0,1,1,0,1,0,1,0,1710


In [39]:
col = data.columns.tolist()
col

['default',
 'income',
 'balance',
 'phone_flag_0.0',
 'phone_flag_1.0',
 'student_No',
 'student_Yes',
 'employment_Salaried',
 'employment_Self Employed',
 'credit_card_0.0',
 'credit_card_1.0',
 'tenure']

In [40]:
col[0], col[11] = col[11], col[0]
col

['tenure',
 'income',
 'balance',
 'phone_flag_0.0',
 'phone_flag_1.0',
 'student_No',
 'student_Yes',
 'employment_Salaried',
 'employment_Self Employed',
 'credit_card_0.0',
 'credit_card_1.0',
 'default']

In [41]:
data = data[col]
data

Unnamed: 0,tenure,income,balance,phone_flag_0.0,phone_flag_1.0,student_No,student_Yes,employment_Salaried,employment_Self Employed,credit_card_0.0,credit_card_1.0,default
0,1560,5015120.75,87104.12,0,1,1,0,0,1,0,1,0
1,1470,2266076.58,89236.34,0,1,0,1,1,0,1,0,0
2,270,1779347.34,171553.12,0,1,0,1,1,0,1,0,0
3,600,2014246.24,85979.04,0,1,0,1,1,0,1,0,0
4,300,5445148.31,48874.77,0,1,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3688,270,5060638.68,144844.88,1,0,1,0,1,0,1,0,0
3689,540,2682965.60,192978.31,0,1,0,1,0,1,1,0,0
3690,360,2172340.05,141840.01,0,1,0,1,1,0,1,0,0
3691,1710,5556566.10,69477.43,0,1,1,0,1,0,1,0,0


In [44]:
X = data.iloc[:, [0,1,2]]
X

Unnamed: 0,tenure,income,balance
0,1560,5015120.75,87104.12
1,1470,2266076.58,89236.34
2,270,1779347.34,171553.12
3,600,2014246.24,85979.04
4,300,5445148.31,48874.77
...,...,...,...
3688,270,5060638.68,144844.88
3689,540,2682965.60,192978.31
3690,360,2172340.05,141840.01
3691,1710,5556566.10,69477.43


In [45]:
X = sc.fit_transform(X)
X

array([[ 1.71283156,  0.63621769, -0.29697579],
       [ 1.54538579, -1.07962174, -0.26206696],
       [-0.68722437, -1.38341796,  1.08562806],
       ...,
       [-0.51977861, -1.13812819,  0.59916333],
       [ 1.99190783,  0.97416544, -0.58556096],
       [-0.68722437, -1.64302327,  0.67259511]])

In [48]:
data['tenure'] = X[:, 0]
data['income'] = X[:, 1]
data['balance'] = X[:, 2]
data

Unnamed: 0,tenure,income,balance,phone_flag_0.0,phone_flag_1.0,student_No,student_Yes,employment_Salaried,employment_Self Employed,credit_card_0.0,credit_card_1.0,default
0,1.712832,0.636218,-0.296976,0,1,1,0,0,1,0,1,0
1,1.545386,-1.079622,-0.262067,0,1,0,1,1,0,1,0,0
2,-0.687224,-1.383418,1.085628,0,1,0,1,1,0,1,0,0
3,-0.073257,-1.236804,-0.315396,0,1,0,1,1,0,1,0,0
4,-0.631409,0.904623,-0.922869,0,1,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3688,-0.687224,0.664628,0.648359,1,0,1,0,1,0,1,0,0
3689,-0.184887,-0.819417,1.436402,0,1,0,1,0,1,1,0,0
3690,-0.519779,-1.138128,0.599163,0,1,0,1,1,0,1,0,0
3691,1.991908,0.974165,-0.585561,0,1,1,0,1,0,1,0,0


In [49]:
col_train = ['phone_flag_0.0', 'student_No', 'employment_Salaried', 'credit_card_0.0', 
             'phone_flag_1.0', 'student_Yes', 'employment_Self Employed', 'credit_card_1.0', 
             'income', 'balance', 'tenure']

train_input = np.array(data[col_train])[:3400]
train_label = np.array(data['default'])[:3400]

test_validation = np.array(data[col_train])[3401:]
real_validation = np.array(data['default'])[3401:]

### Building the Model

In [51]:
# model
model = tf.keras.models.Sequential([
    # input layer
    tf.keras.layers.Dense(4, kernel_initializer='normal', input_dim = train_input.shape[1], activation='relu'),
    
    # hidden layer
    tf.keras.layers.Dense(8, kernel_initializer='normal', activation='relu'),
    tf.keras.layers.Dense(16, kernel_initializer='normal', activation='relu'),
    tf.keras.layers.Dense(8, kernel_initializer='normal', activation='relu'),
    
    # output layer
    tf.keras.layers.Dense(1, kernel_initializer='normal', activation='sigmoid')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 48        
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_2 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 377
Trainable params: 377
Non-trainable params: 0
_________________________________________________________________


In [52]:
# compile
#from tensorflow.keras.optimizers import RMSprop

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [53]:
history = model.fit(train_input, train_label, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [54]:
model.evaluate(test_validation, real_validation)



[0.10299184918403625, 0.9623287916183472]

In [56]:
classification = model.predict(test_validation)
classification

array([[2.67168880e-03],
       [2.51543522e-03],
       [1.95881128e-02],
       [9.13699987e-05],
       [5.29363751e-03],
       [2.96440721e-03],
       [1.12465024e-03],
       [1.52230263e-04],
       [6.10184669e-03],
       [9.59971547e-03],
       [2.18182802e-04],
       [1.24958158e-03],
       [8.97202253e-01],
       [2.76871622e-01],
       [5.64154983e-03],
       [2.14278698e-04],
       [1.10660166e-01],
       [2.93418765e-03],
       [1.25180895e-05],
       [2.04503536e-04],
       [1.20684686e-04],
       [6.77525997e-04],
       [5.46100736e-03],
       [4.47127223e-03],
       [9.94757414e-02],
       [9.63250995e-01],
       [1.43022537e-02],
       [3.75956297e-04],
       [4.94784763e-05],
       [9.39265155e-05],
       [4.05225158e-03],
       [7.00349808e-02],
       [1.17746890e-02],
       [1.76935173e-05],
       [3.37809324e-04],
       [1.32549922e-05],
       [2.62647867e-04],
       [6.60938025e-02],
       [2.65713930e-02],
       [3.47129107e-01],


In [59]:
def step_func(L):
    result = []
    for i in L:
        res = i
        if i > 0.5:
            res = 1
        else :
            res = 0
        result.append(res)
    return result

prediksi = step_func(classification)
#prediksi
for i in range(len(prediksi)):
    print(prediksi[i], real_validation[i])

0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
1 1
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
1 1
0 1
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0


In [61]:
from sklearn.metrics import f1_score

f1_score(prediksi, real_validation)

0.7027027027027029

In [None]:
c1 = data_test['customer_id'].to_numpy()
prediksi = np.array(prediksi)

hasil = {'customer_id':c1, 'default':prediksi}
df = pd.DataFrame(hasil, columns = ['customer_id', 'default'])
df.to_csv("hasil_prediksi.csv", index=False, header=True)