In [1]:
import pandas as pd
import numpy as np
import os
from keras.preprocessing.sequence import pad_sequences
from keras.layers.recurrent import LSTM
from keras.callbacks import EarlyStopping
from keras.layers import Bidirectional, Flatten, Embedding, MaxPooling1D, Dropout, LeakyReLU, Conv1D, BatchNormalization as BatchNorm
from keras.models import Sequential
from keras.layers.core import Dense, Activation

Using TensorFlow backend.


### Open the embedding

In [2]:
# Locate the pre-computed embedding file relative to the current working directory.
path = os.getcwd()
filename = '/data/embedding_LSTM.json'

# Read the JSON and discard the stored index in favour of a fresh 0..n-1 range.
df = pd.read_json(path + filename).reset_index(drop=True)
df.head()

Unnamed: 0,status,lstm_embed
0,fraud,"[29, 11, 18, 9, 27, 28, 31, 2, 22, 8, 20, 3, 7..."
1,good,"[115, 18, 125, 94, 103, 27, 28, 122, 88, 118, ..."
2,good,"[293, 126, 18, 110, 302, 176, 768, 767, 454, 1..."
3,good,"[115, 125, 2346, 2323, 1752, 716, 122, 171, 13..."
4,good,"[279, 11, 18, 276, 302, 1601, 2693, 1163, 248,..."


### Vocab size

In [3]:
def find_voc_size(array):
    """Return the largest value in ``array``, or 0 when there is none.

    Parameters
    ----------
    array : iterable of comparable values (a list of token ids in this notebook)

    Returns
    -------
    The maximum element, or 0 if ``array`` is empty, is not iterable
    (e.g. ``None`` or ``NaN`` for a missing row) or holds values that cannot
    be compared with each other.

    Any other error is allowed to propagate instead of being hidden.
    """
    try:
        # `default` covers the empty-sequence case without raising ValueError.
        return max(array, default=0)
    except TypeError:
        # Non-iterable input or mutually incomparable elements.
        return 0

# Largest per-row maximum across all rows, i.e. the largest value found in `lstm_embed`.
voc_size = max(df['lstm_embed'].apply(find_voc_size))

### Remove long sentences

Remove sentences that are longer than `maxlen` words. The value of `maxlen` is chosen from the length histograms in explore_dataset.ipynb.

In [4]:
def remove_long_sent(df, max_len=None):
    """Return the rows of ``df`` whose ``lstm_embed`` sequence is short enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``lstm_embed`` column whose values support ``len()``.
    max_len : int, optional
        Maximum allowed sequence length (inclusive). When omitted, the
        notebook-level variable ``maxlen`` is used, so existing calls of the
        form ``remove_long_sent(df)`` keep working.

    Returns
    -------
    pandas.DataFrame
        The rows with ``len(lstm_embed) <= max_len``; the original index
        labels are kept and ``df`` itself is not modified.

    Raises
    ------
    NameError
        If ``max_len`` is omitted and no global ``maxlen`` has been defined.
    """
    if max_len is None:
        # Backward-compatible fallback to the notebook-level setting.
        max_len = maxlen

    lengths = df.lstm_embed.apply(len)
    return df[lengths <= max_len]

In [5]:
# Maximum number of words in a sentence. remove_long_sent() reads this
# module-level name, so it must be assigned before the call below.
maxlen = 300
# Keep only the rows whose sequence has at most `maxlen` entries.
df_trunc = remove_long_sent(df)
df_trunc.shape

(10223, 2)

### Compare the distribution of classes before and after truncating

In [6]:
# One line per frame: (rows, cols) of the 'good' subset, then of the 'fraud' subset.
# First the full data set, then the length-filtered one.
for frame in (df, df_trunc):
    good_rows = frame[frame.status == 'good']
    fraud_rows = frame[frame.status == 'fraud']
    print(good_rows.shape, fraud_rows.shape)

(8267, 2) (2601, 2)
(7810, 2) (2413, 2)


### Pad sequences

In [7]:
# Pad every sequence with zeros at the end ('post') up to `maxlen`, so that all
# rows have the same length.
# NOTE(review): 0 therefore doubles as the padding value - confirm that no real
# token is encoded as 0.
X_train = pad_sequences(df_trunc.lstm_embed, maxlen=maxlen, padding='post')

In [8]:
def transform_status(status):
    """Encode a status label as a binary target.

    Returns 0 when ``status`` equals ``'good'`` and 1 for every other value.
    """
    return 0 if status == 'good' else 1

# Turn the string labels of the retained rows into 0/1 targets.
Y_train = df_trunc.status.apply(transform_status)

# Sanity check: features and targets should have the same number of rows.
X_train.shape, Y_train.shape

((10223, 300), (10223,))

### Build Bidir LSTM + CNN - classifier

In [9]:
# Build, compile and train the classifier:
# Embedding -> Conv1D block -> bidirectional LSTM -> two Dense layers -> sigmoid.
np.random.seed(42)  # fix the random numbers generator state
# NOTE(review): this seeds NumPy only; the backend may keep its own RNG state
# that needs separate seeding for fully repeatable runs - confirm.

# Hyper-parameters
features_per_word = 10  # number of dimensions for a word embedding
hidden_units = 20  # number of hidden units in the LSTM
batch_size = 64
nb_epochs = 20
nb_classes = 1  # one output unit; the targets are 0/1
dropout = 0.1  # shared rate for both Dropout layers and the LSTM's `dropout` argument
# Unused alternative optimizer (SGD is not imported in the visible cells):
# sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
# Monitors validation loss with a 0.01 improvement threshold and a patience of 2 epochs.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=2, verbose=1)

model = Sequential()

# `voc_size` is the largest value found in the sequences, so the embedding needs
# voc_size + 1 rows to cover ids 0..voc_size.
model.add(Embedding(voc_size + 1, features_per_word, input_length=maxlen))
model.add(Dropout(dropout))

# Convolutional block; per the printed summary it maps (300, 10) -> (291, 128),
# and the pooling layer then reduces that to (72, 128).
model.add(Conv1D(filters=128, kernel_size=10))
model.add(BatchNorm())
model.add(LeakyReLU(alpha=0.01))
model.add(MaxPooling1D(pool_size=5, strides=4))

# Bidirectional LSTM over the pooled feature sequence.
model.add(Bidirectional(LSTM(hidden_units, dropout=dropout)))

model.add(Dropout(dropout))
model.add(Dense(8))
# 'linear' applies no non-linearity between this Dense layer and the next one.
model.add(Activation('linear'))

# Single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# 15% of the samples are held out for validation; early stopping watches their loss.
# NOTE(review): check how `validation_split` picks the held-out rows (and whether
# the data should be shuffled first) - confirm against the Keras documentation.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epochs, verbose=1, callbacks=[early_stopping], 
                   validation_split=0.15)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 10)           108620    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 10)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 291, 128)          12928     
_________________________________________________________________
batch_normalization_1 (Batch (None, 291, 128)          512       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 291, 128)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 72, 128)           0         
___________________________________________________________

<keras.callbacks.History at 0x2b2ac1cb79e8>