# Data Reading

In [1]:
import pandas as pd

# Paths of the four input CSV files; they all live in the same directory.
DATA_DIR = 'data'

account_filename = DATA_DIR + '/accountData.csv'
customer_filename = DATA_DIR + '/customerData.csv'
transaction_filename = DATA_DIR + '/transactionData.csv'
labels_filename = DATA_DIR + '/labelledData_15.csv'

In [2]:
# Read the account CSV into a DataFrame.
accounts = pd.read_csv(filepath_or_buffer=account_filename)

In [3]:
# Read the customer CSV into a DataFrame.
customers = pd.read_csv(filepath_or_buffer=customer_filename)

In [5]:
# Read the transaction CSV into a DataFrame; the preprocessing cells below
# reshape this frame into the model's feature matrix.
transactions = pd.read_csv(filepath_or_buffer=transaction_filename)

In [6]:
# Read the labelled-data CSV into a DataFrame.
labels = pd.read_csv(filepath_or_buffer=labels_filename)

# Data Preprocessing

In [4]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8

Using TensorFlow backend.


In [7]:
# Preview the first five rows of the raw transaction table.
transactions.head(n=5)

Unnamed: 0,TRANSACTION_ID,ORIGIN_CUSTOMER_ID,ORIGIN_ACCOUNT_ID,TRANSACTION_DATE_TIME,TRANSACTION_TYPE,TRANSACTION_SOURCE,TRANSACTION_AMOUNT,BENEFICIARY_CUSTOMER_ID,BENEFICIARY_ACCOUNT_ID
0,0,202950.0,431495.0,2017-01-10 19:21:37,charge,online,508.93,,
1,1,167227.0,355712.0,2017-01-05 12:56:57,withdrawal,atm,20.04,,
2,2,50445.0,107161.0,2017-01-05 21:08:46,charge,online,79.5,,
3,3,102321.0,217511.0,2017-01-09 02:30:10,electronic transfer,ach debit,138.39,1083.0,2287.0
4,4,219263.0,466263.0,2017-01-26 04:12:36,pos,merchant location,695.75,,


In [10]:
# Remove the transaction id, the timestamp, and both beneficiary id columns
# from the frame in place, one column at a time.
for unused_column in ('TRANSACTION_ID',
                      'TRANSACTION_DATE_TIME',
                      'BENEFICIARY_CUSTOMER_ID',
                      'BENEFICIARY_ACCOUNT_ID'):
    del transactions[unused_column]

In [24]:
# Drop the origin customer and origin account id columns in place.
for origin_id_column in ('ORIGIN_CUSTOMER_ID', 'ORIGIN_ACCOUNT_ID'):
    transactions.drop(labels=origin_id_column, axis=1, inplace=True)

In [18]:
# Cast the transaction source and type columns to pandas' categorical dtype.
for categorical_column in ('TRANSACTION_SOURCE', 'TRANSACTION_TYPE'):
    transactions[categorical_column] = (
        transactions[categorical_column].astype('category')
    )

In [21]:
# One-hot encode the categorical columns (transaction type and transaction
# source); each category value becomes its own indicator column.
transactions = pd.get_dummies(data=transactions)

In [37]:
# Number of columns in the frame after one-hot encoding.
transactions.shape[1]

15

In [34]:
from sklearn.preprocessing import MinMaxScaler

# Linearly rescale the transaction amount onto [0, 1], which is the scaler's
# default range, and write the result back into the same column.
scaler = MinMaxScaler(feature_range=(0, 1))
amount_column = ['TRANSACTION_AMOUNT']
transactions[amount_column] = scaler.fit_transform(transactions[amount_column])

In [36]:
# Summary statistics of the rescaled amount, to check the result of scaling.
transactions.loc[:, 'TRANSACTION_AMOUNT'].describe()

count    1.577224e+07
mean     5.231189e-04
std      3.667259e-03
min      0.000000e+00
25%      1.901000e-06
50%      5.479001e-06
75%      6.926101e-05
max      1.000000e+00
Name: TRANSACTION_AMOUNT, dtype: float64

# Autoencoder

In [48]:
# Fully connected autoencoder: the encoder narrows the input step by step to
# a 2-unit bottleneck, and the decoder widens it back to the input width.

# Take the input/output width from the data rather than hard-coding it, so
# the model keeps matching the feature matrix if one-hot encoding produces a
# different number of columns.
n_features = transactions.shape[1]

# Hidden-layer widths on the way down; the decoder uses them in reverse.
encoder_widths = (13, 11, 9, 7, 5, 4, 3)
bottleneck_width = 2

input_layer = Input(shape=(n_features, ))

# Encoder: tanh layers, each with its own L1 activity penalty of 10e-5
# (i.e. 1e-4).
encoder = input_layer
for layer_width in encoder_widths:
    encoder = Dense(layer_width, activation="tanh",
                    activity_regularizer=regularizers.l1(10e-5))(encoder)

# Bottleneck: ReLU, without an activity penalty.
encoder = Dense(bottleneck_width, activation="relu")(encoder)

# Decoder: unregularized tanh layers mirroring the encoder widths.
decoder = encoder
for layer_width in reversed(encoder_widths):
    decoder = Dense(layer_width, activation='tanh')(decoder)

# Output layer: ReLU, one unit per input feature.
decoder = Dense(n_features, activation='relu')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

In [49]:
from keras import optimizers

# Adam with a step size of 1e-3, which is Adam's usual default. A step size
# of 1 is about three orders of magnitude larger; with stacked tanh layers it
# tends to push activations into saturation or make the loss diverge instead
# of converging. The remaining arguments are spelled out at their defaults.
adam = optimizers.Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# Reconstruction loss: mean squared logarithmic error between the input and
# the autoencoder's output.
autoencoder.compile(optimizer=adam, loss='mean_squared_logarithmic_error')

In [50]:
# Training matrix: the preprocessed transaction frame as a NumPy array.
X_train = transactions.values

In [None]:
# Training hyperparameters.
nb_epoch = 1
batch_size = 256

# Save the model to model.h5, overwriting the file only when the monitored
# metric improves (save_best_only).
checkpointer = ModelCheckpoint(filepath="model.h5", verbose=0, save_best_only=True)

# Write TensorBoard logs (graph and images, no histograms) under ./logs.
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=True)

training_callbacks = [checkpointer, tensorboard]

# The autoencoder is trained to reproduce its input, so X_train serves as
# both input and target; 20% of the rows are held out for validation.
fit_result = autoencoder.fit(
    X_train,
    X_train,
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
    callbacks=training_callbacks,
)

# Keep only the per-epoch metrics dictionary.
history = fit_result.history