# Train using RNN architecture

In [1]:
import pandas as pd
import numpy as np

import keras
from keras.layers import TimeDistributed, GRU, Dense, Bidirectional

from sklearn.metrics import roc_auc_score

!python -c 'import tensorflow as tf; print(tf.version)'

Using TensorFlow backend.


<module 'tensorflow._api.v1.version' from '/home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/_api/v1/version/__init__.py'>


# Read/Preprocess Data

In [2]:
def get_string_features(df):
    string_features = []
    
    for col in df.columns:
        if df[col].dtype == np.dtype('object'):
            string_features.append(col)
            
    return string_features


def read_data():
    df = pd.read_csv('./datasets/kfold/transaction_fold_0_0_0.csv')
    
    string_features = get_string_features(df)
    
    df = df.drop(columns=string_features)
    
    return df.drop(columns=['isFraud', 'TransactionID', 'TransactionDT']), df['isFraud']

    
%time X, y = read_data()

CPU times: user 2.65 s, sys: 518 ms, total: 3.17 s
Wall time: 3.44 s


# Count the Min Number of Null Cells in Each Row

In [5]:
print(X.shape)

(73816, 377)


In [26]:
min_nan = 999

for index_row in range(X.shape[0]):
    min_nan = min(min_nan, X.iloc[index_row, :].isnull().sum())
    

print(min_nan)

14


# Convert Matrix Data to Sequence Data

In [3]:
def convert_to_sequences(df):
    sequences = np.zeros(shape=(df.shape[0], df.shape[1], 2))

    for i in range(df.shape[0]):
        cnt_col = 0
        
        for j in range(df.shape[1]):
            if not np.isnan(df.iloc[i, j]):
                sequences[i, cnt_col, 0] = j+1
                sequences[i, cnt_col, 1] = df.iloc[i, j]
                
                cnt_col += 1

    return sequences
            

In [7]:
# tmp = convert_to_sequences(X.iloc[:10,:])

In [4]:
%time X = convert_to_sequences(X)

CPU times: user 9min 44s, sys: 278 ms, total: 9min 44s
Wall time: 9min 44s


In [5]:
type(X)

numpy.ndarray

In [6]:
X.shape

(73816, 377, 2)

# Build RNN model

In [14]:
def get_model(input_dim, hidden_size=128, lr=1e-4):
    """
    Args:
        input_dim (tuple): the format of input_dim should be (batch_size, seq_length, feature_size)
    """
    
    inputs = keras.layers.Input(input_dim[1:])
    
    x = GRU(units=hidden_size, return_sequences=False)(inputs)
    
    x = Dense(64, activation='relu', kernel_initializer='uniform')(x)
    
    x = Dense(32, activation='relu', kernel_initializer='uniform')(x)
    
    x = Dense(16, activation='relu', kernel_initializer='uniform')(x)
    
    outputs = Dense(1, activation='sigmoid', kernel_initializer='uniform')(x)
    
    optimizer = keras.optimizers.Adam(lr=lr)
    
    model = keras.models.Model(inputs=inputs,
                              outputs=outputs)
    
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model

model = get_model(input_dim=(None, X.shape[1], 2))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 377, 2)            0         
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               50304     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 17        
Total params: 61,185
Trainable params: 61,185
Non-trainable params: 0
_________________________________________________________________


In [17]:

epochs = 3
batch_size=256

model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb85f5060f0>

# Calculate ROC-AUC score

In [19]:
pred_prob = model.predict(X)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))

roc-auc score=0.4980665770006725


In [21]:
pred_prob.max()

0.032714963

# Bidirectional RNN

In [7]:
def get_model(input_dim, hidden_size=256, lr=1e-3):
    """
    Args:
        input_dim (tuple): the format of input_dim should be (batch_size, seq_length, feature_size)
    """
    
    inputs = keras.layers.Input(input_dim[1:])
    
    x = Bidirectional(GRU(units=hidden_size, return_sequences=False))(inputs)
    
    x = Dense(64, activation='relu', kernel_initializer='uniform')(x)
    
    x = Dense(32, activation='relu', kernel_initializer='uniform')(x)
    
    x = Dense(16, activation='relu', kernel_initializer='uniform')(x)
    
    outputs = Dense(1, activation='sigmoid', kernel_initializer='uniform')(x)
    
    optimizer = keras.optimizers.Adam(lr=lr)
    
    model = keras.models.Model(inputs=inputs,
                              outputs=outputs)
    
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model

model = get_model(input_dim=(None, X.shape[1], 2))

model.summary()

W0731 11:50:58.692257 140285407577792 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0731 11:50:58.710162 140285407577792 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0731 11:50:58.716286 140285407577792 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0731 11:50:59.144444 140285407577792 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 377, 2)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               397824    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                32832     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total params: 433,281
Trainable params: 433,281
Non-trainable params: 0
_________________________________________________________________


In [8]:

epochs = 3
batch_size=256

model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

W0731 11:51:00.247844 140285407577792 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f9673c5e7b8>

In [11]:
epochs = 1
batch_size=256

model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

Epoch 1/1


<keras.callbacks.History at 0x7f9671cd0da0>

# Calculate ROC-AUC score

In [12]:
pred_prob = model.predict(X)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))

roc-auc score=0.8411770522120674


# RNN model with GlobalAverage1D (many-to-many)

In [15]:
def get_model(input_dim, hidden_size=64, lr=1e-3):
    """
    Args:
        input_dim (tuple): the format of input_dim should be (batch_size, seq_length, feature_size)
    """
    
    inputs = keras.layers.Input(input_dim[1:])
    
    x = Bidirectional(GRU(units=hidden_size, return_sequences=True))(inputs)
    
    x = keras.layers.GlobalAveragePooling1D()(x)
    
    x = Dense(64, activation='relu', kernel_initializer='uniform')(x)
    
    x = Dense(32, activation='relu', kernel_initializer='uniform')(x)
    
    x = Dense(16, activation='relu', kernel_initializer='uniform')(x)
    
    outputs = Dense(1, activation='sigmoid', kernel_initializer='uniform')(x)
    
    optimizer = keras.optimizers.Adam(lr=lr)
    
    model = keras.models.Model(inputs=inputs,
                              outputs=outputs)
    
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model

model = get_model(input_dim=(None, X.shape[1], 2))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 377, 2)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 377, 128)          25728     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_10 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_11 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 17        
Total para

In [16]:
epochs = 3
batch_size=128

model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f9667394d68>

In [17]:
pred_prob = model.predict(X, batch_size=batch_size)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))

roc-auc score=0.8151643507162256


In [18]:
epochs = 7
batch_size=128

model.fit(X, y,
         epochs=epochs,
         batch_size=batch_size)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f966711a668>

In [19]:
pred_prob = model.predict(X, batch_size=batch_size)

score = roc_auc_score(y, pred_prob)

print('roc-auc score={}'.format(score))

roc-auc score=0.8434597712276892
