# MODELLING
## Logistic Regression with Tensorflow Keras
### Fraud Detection System Development using Deep Neural Network for Reported Transactional Data

#### Import Libraries and Dataset

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [92]:
df = pd.read_csv('user_data_ftcr_minmax_train.csv', sep = ',')

In [93]:
df.head()

Unnamed: 0,trx_date,report_date,registereddate,birthday,is_verified,aqc_freq_prepaid_mobile,aqc_mean_prepaid_mobile_amount,aqc_freq_topup,aqc_freq_topup_within_7d,aqc_mean_topup_amount,...,job_position_KARYAWAN,job_position_LAINNYA,job_position_PEGAWAI_NS,job_position_PELAJAR,job_position_RUMAH_TANGGA,job_position_SPESIALIS,job_position_TIDAK_KERJA,job_position_WIRASWASTA,uid,is_scammer
0,0.035808,0.0475,0.261199,1.0,1,0.097276,0.34252,0.171118,0.140187,0.003261,...,0,0,0,0,0,0,0,1,208cc2b1-7e8c-43d2-ba9f-32269abdf078,0
1,0.049782,0.06125,0.440041,0.969442,1,0.128405,0.144488,0.379822,0.205607,0.001861,...,0,0,0,0,0,0,0,1,afbeffbd-b905-4323-ac1a-3ba3e07f6951,1
2,0.452402,0.64125,0.230531,0.963121,1,0.105058,0.118701,0.06726,0.011682,0.000641,...,1,0,0,0,0,0,0,0,24b253b8-84b4-42d4-92b0-7b8537d4f066,1
3,0.027948,0.04,0.020331,0.954634,1,0.0,0.0,0.012859,0.014019,0.006848,...,1,0,0,0,0,0,0,0,32414559-95f1-48fd-83ff-1ff7cc711d88,1
4,0.029694,0.0425,0.062371,0.924845,1,0.0,0.0,0.064293,0.060748,0.00337,...,1,0,0,0,0,0,0,0,80126b5b-7219-4266-b94b-85620d0b4498,1


In [94]:
# df = df.rename(columns={"Unnamed: 0": "id"})
df = df.drop(columns = ['uid'])
df.head()

Unnamed: 0,trx_date,report_date,registereddate,birthday,is_verified,aqc_freq_prepaid_mobile,aqc_mean_prepaid_mobile_amount,aqc_freq_topup,aqc_freq_topup_within_7d,aqc_mean_topup_amount,...,gender_None,job_position_KARYAWAN,job_position_LAINNYA,job_position_PEGAWAI_NS,job_position_PELAJAR,job_position_RUMAH_TANGGA,job_position_SPESIALIS,job_position_TIDAK_KERJA,job_position_WIRASWASTA,is_scammer
0,0.035808,0.0475,0.261199,1.0,1,0.097276,0.34252,0.171118,0.140187,0.003261,...,0,0,0,0,0,0,0,0,1,0
1,0.049782,0.06125,0.440041,0.969442,1,0.128405,0.144488,0.379822,0.205607,0.001861,...,0,0,0,0,0,0,0,0,1,1
2,0.452402,0.64125,0.230531,0.963121,1,0.105058,0.118701,0.06726,0.011682,0.000641,...,0,1,0,0,0,0,0,0,0,1
3,0.027948,0.04,0.020331,0.954634,1,0.0,0.0,0.012859,0.014019,0.006848,...,0,1,0,0,0,0,0,0,0,1
4,0.029694,0.0425,0.062371,0.924845,1,0.0,0.0,0.064293,0.060748,0.00337,...,0,1,0,0,0,0,0,0,0,1


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40717 entries, 0 to 40716
Data columns (total 57 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   trx_date                                      40717 non-null  float64
 1   report_date                                   40717 non-null  float64
 2   registereddate                                40717 non-null  float64
 3   birthday                                      40717 non-null  float64
 4   is_verified                                   40717 non-null  int64  
 5   aqc_freq_prepaid_mobile                       40717 non-null  float64
 6   aqc_mean_prepaid_mobile_amount                40717 non-null  float64
 7   aqc_freq_topup                                40717 non-null  float64
 8   aqc_freq_topup_within_7d                      40717 non-null  float64
 9   aqc_mean_topup_amount                         40717 non-null 

#### Dataset Splitting and Shuffling

In [96]:
# Split Train and Validation Dataset
#from sklearn.model_selection import train_test_split
SPLIT_SIZE = 0.7368421

train_len = int(len(df) * SPLIT_SIZE)

shuffled_files = df.sample(frac = 1, random_state = 1)

test_set = shuffled_files[train_len:]
train_set = shuffled_files[:train_len]

y_train = train_set['is_scammer']
x_train = train_set.drop(columns=['is_scammer'])
y_test = test_set['is_scammer']
x_test = test_set.drop(columns=['is_scammer'])

In [97]:
# Convert to tensors
x_train, y_train = tf.convert_to_tensor(x_train, dtype=tf.float32), tf.convert_to_tensor(y_train, dtype=tf.float32)
x_test, y_test = tf.convert_to_tensor(x_test, dtype=tf.float32), tf.convert_to_tensor(y_test, dtype=tf.float32)

In [98]:
x_train.shape

TensorShape([30001, 56])

#### Modelling

In [108]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=32, activation='relu', input_dim=x_train.shape[1]), #Try & Error Hyperparameter
    tf.keras.layers.Dense(units=32, activation='relu'),
#     tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [109]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy', 
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(), 
        tf.keras.metrics.Recall() 
#         tf.keras.metrics.FalseNegatives(),
#         tf.keras.metrics.FalsePositives(),
#         tf.keras.metrics.TruePositives(),
#         tf.keras.metrics.TrueNegatives()
    ]
)

In [110]:
epoch_count = 50

history = model.fit(
    x_train, 
    y_train, 
    epochs=epoch_count, 
    validation_data=(x_test, y_test) 
#     callbacks=[metrics]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50


Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


#### Evaluation

In [102]:
history.history.keys()

dict_keys(['loss', 'binary_accuracy', 'precision_7', 'recall_7', 'val_loss', 'val_binary_accuracy', 'val_precision_7', 'val_recall_7'])

In [111]:
for key in history.history.keys():
    print(str(key) + " : " + str(history.history[key][epoch_count-1]))

loss : 0.30142542719841003
binary_accuracy : 0.869970977306366
precision_8 : 0.8917234539985657
recall_8 : 0.8363809585571289
val_loss : 0.3568621277809143
val_binary_accuracy : 0.8489174842834473
val_precision_8 : 0.8965885043144226
val_recall_8 : 0.7876006960868835


In [55]:
# tr_tp = history.history['true_positives'][epoch_count-1]
# tr_tn = history.history['true_negatives'][epoch_count-1]
# tr_fp = history.history['false_positives'][epoch_count-1]
# tr_fn = history.history['false_negatives'][epoch_count-1]
# val_tp = history.history['val_true_positives'][epoch_count-1]
# val_tn = history.history['val_true_negatives'][epoch_count-1]
# val_fp = history.history['val_false_positives'][epoch_count-1]
# val_fn = history.history['val_false_negatives'][epoch_count-1]

# train_acc = (tr_tp+tr_tn)/(tr_tp+tr_tn+tr_fp+tr_fn)
# val_acc = (val_tp+val_tn)/(val_tp+val_tn+val_fp+val_fn)

# tr_prec = history.history['precision'][epoch_count-1]
# tr_recall = history.history['recall'][epoch_count-1]
# val_prec = history.history['val_precision'][epoch_count-1]
# val_recall = history.history['val_recall'][epoch_count-1]

# train_f1 = (2*tr_prec*tr_recall)/(tr_prec+tr_recall)
# val_f1 = (2*val_prec*val_recall)/(val_prec+val_recall)

In [56]:
# print("Training Accuracy: " + str(train_acc))
# print("Validation Accuracy: " + str(val_acc))
# print("Training F1 score: " + str(train_f1))
# print("Validation F1 score: " + str(val_f1))

In [57]:
# import matplotlib.pyplot as plt

In [58]:
# plt.subplot(2,1,1)
# plt.plot(history.history['precision'])

# plt.subplot(2,1,2)
# plt.plot(history.history['val_precision'])

In [104]:
df_test = pd.read_csv("user_data_ftcr_minmax_test.csv", sep=',')

In [105]:
df_test = df_test.drop(columns=['uid'])
df_test.head()

Unnamed: 0,trx_date,report_date,registereddate,birthday,is_verified,aqc_freq_prepaid_mobile,aqc_mean_prepaid_mobile_amount,aqc_freq_topup,aqc_freq_topup_within_7d,aqc_mean_topup_amount,...,gender_None,job_position_KARYAWAN,job_position_LAINNYA,job_position_PEGAWAI_NS,job_position_PELAJAR,job_position_RUMAH_TANGGA,job_position_SPESIALIS,job_position_TIDAK_KERJA,job_position_WIRASWASTA,is_scammer
0,0.665502,0.9375,0.503446,0.861074,1,1.0,0.094882,0.402572,0.584112,0.001254,...,0,0,0,0,0,1,0,0,0,0
1,0.047162,0.0675,0.052033,0.771775,1,0.0,0.0,0.015826,0.004673,0.00763,...,0,1,0,0,0,0,0,0,0,1
2,0.070742,0.085,0.398001,0.770692,1,0.007782,0.004075,0.02275,0.0,0.000633,...,0,1,0,0,0,0,0,0,0,1
3,0.00786,0.01125,0.198828,0.743068,1,0.0,0.0,0.01088,0.004673,0.00117,...,0,0,0,0,0,1,0,0,0,1
4,0.086463,0.1225,0.062371,0.720752,1,0.0,0.0,0.032641,0.028037,0.000678,...,0,0,0,0,0,1,0,0,0,1


In [106]:
test_ref_labels = df_test['is_scammer']
testing_set = df_test.drop(columns=['is_scammer'])

In [112]:
test_pred_res = model.predict(testing_set)
tp = 0
fp = 0
tn = 0
fn = 0
for i in range (0, len(test_pred_res)):
    if test_pred_res[i] > 0.5:
        if test_ref_labels[i] == 1:
            tp += 1
        else:
            fp += 1
    else:
        if test_ref_labels[i] == 1:
            fn += 1
        else:
            tn += 1
            
print("Test Accuracy: ", (tp+tn)/(tp+tn+fp+fn))
print("Test Precision ", tp/(tp+fn))
print("Test Recall: ", tn/(tn+fp))

Test Accuracy:  0.8323996265172736
Test Precision  0.7603305785123967
Test Recall:  0.9069325735992403


In [75]:
model.input

<KerasTensor: shape=(None, 56) dtype=float32 (created by layer 'dense_18_input')>

#### Save Model

In [67]:
# model.save("log_reg_keras_source_drop.h5")