# Data Reading

In [1]:
import pandas as pd

# Relative paths of the four CSV extracts read by the loading cells below.
account_filename, customer_filename = 'data/accountData.csv', 'data/customerData.csv'
transaction_filename, labels_filename = 'data/transactionData.csv', 'data/labelledData_15.csv'

In [2]:
# Load the account extract into a DataFrame.
accounts = pd.read_csv(filepath_or_buffer=account_filename)

In [3]:
# Load the customer extract; it is joined onto the transactions later on CUSTOMER_ID.
customers = pd.read_csv(filepath_or_buffer=customer_filename)

In [4]:
# Load the raw transaction extract (the main table every later cell works on).
transactions = pd.read_csv(filepath_or_buffer=transaction_filename)

In [5]:
# Load the labelled extract; its 'Class' column is used as ground truth further down.
labels = pd.read_csv(filepath_or_buffer=labels_filename)

# Data Preprocessing

In [6]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8

Using TensorFlow backend.


In [None]:
# Quick sanity check: summary statistics of the raw transaction table
# before any preprocessing is applied.
transactions.describe()

In [7]:
# Share of positive labels (assumes 'Class' is coded 0/1 — TODO confirm).
# Series.sum() is vectorized; the builtin sum() would iterate the column
# element by element in Python.
print(labels['Class'].sum() / float(len(labels)))

0.000400006720671


In [8]:
# Print the row count, then the fraction of rows with a missing value in each
# of the four party-identifier columns.
n_rows = len(transactions)
print(n_rows)
n = float(n_rows)
for id_column in ('ORIGIN_CUSTOMER_ID',
                  'ORIGIN_ACCOUNT_ID',
                  'BENEFICIARY_CUSTOMER_ID',
                  'BENEFICIARY_ACCOUNT_ID'):
    n_present = len(transactions[id_column].dropna())
    print((n_rows - n_present) / n)

15772235
0.0368721997865236
0.0368721997865236
0.6179363926545604
0.6179363926545604


In [None]:
# Remove the transaction identifier and both beneficiary identifier columns
# from the frame, in place.
transactions.drop(
    ['TRANSACTION_ID', 'BENEFICIARY_CUSTOMER_ID', 'BENEFICIARY_ACCOUNT_ID'],
    axis=1,
    inplace=True,
)

In [9]:
# Replace missing origin identifiers with fixed placeholder values.
# NOTE(review): 2.5e+05 and 5.317910e+05 look like sentinel IDs for an unknown
# origin — confirm they cannot collide with real customer/account IDs.
#
# The result is assigned back to the column. Calling
# `transactions[col].fillna(..., inplace=True)` works on an intermediate Series,
# which pandas flags as chained assignment and which leaves the frame unchanged
# when Copy-on-Write is enabled.
transactions['ORIGIN_CUSTOMER_ID'] = transactions['ORIGIN_CUSTOMER_ID'].fillna(2.5e+05)
transactions['ORIGIN_ACCOUNT_ID'] = transactions['ORIGIN_ACCOUNT_ID'].fillna(5.317910e+05)

In [None]:
# Replace the two categorical text columns with their integer category codes.
for categorical_column in ('TRANSACTION_SOURCE', 'TRANSACTION_TYPE'):
    as_category = transactions[categorical_column].astype('category')
    transactions[categorical_column] = as_category.cat.codes

In [10]:
# Attach the customer attributes of the originating customer to every
# transaction; a left join keeps transactions that have no matching customer.
transactions = pd.merge(
    transactions,
    customers,
    how='left',
    left_on='ORIGIN_CUSTOMER_ID',
    right_on='CUSTOMER_ID',
)

In [17]:
# Impute missing risk scores (e.g. rows with no matching customer after the
# left join) with the column mean.
# Assigned back rather than using `fillna(..., inplace=True)` on the selected
# column, which acts on an intermediate Series and does not update the frame
# when pandas Copy-on-Write is enabled.
transactions['CUSTOMER_RISK_SCORE'] = transactions['CUSTOMER_RISK_SCORE'].fillna(
    transactions['CUSTOMER_RISK_SCORE'].mean()
)

In [18]:
# Mark missing customer attributes with an explicit 'unknown' category.
# Assigned back rather than using `fillna(..., inplace=True)` on the selected
# column, which acts on an intermediate Series and does not update the frame
# when pandas Copy-on-Write is enabled.
transactions['CUSTOMER_TYPE'] = transactions['CUSTOMER_TYPE'].fillna('unknown')
transactions['COUNTRY'] = transactions['COUNTRY'].fillna('unknown')

0

In [None]:
# Parse the timestamp column into datetimes using an explicit
# 'YYYY-MM-DD HH:MM:SS' format, so the .dt accessor can be used afterwards.
transactions['TRANSACTION_DATE_TIME'] = pd.to_datetime(transactions['TRANSACTION_DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

In [None]:
# Derive day-of-month, hour and minute features as floats.
# The .dt.day/.hour/.minute accessors return the same numbers that
# strftime('%d'/'%H'/'%M') followed by a float cast would, without formatting
# every timestamp to a string and parsing it back.
transactions['Day'] = transactions['TRANSACTION_DATE_TIME'].dt.day.astype('float64')
transactions['Hour'] = transactions['TRANSACTION_DATE_TIME'].dt.hour.astype('float64')
transactions['Minute'] = transactions['TRANSACTION_DATE_TIME'].dt.minute.astype('float64')

In [None]:
# The raw timestamp is no longer needed once Day/Hour/Minute have been derived.
transactions.drop(['TRANSACTION_DATE_TIME'], axis=1, inplace=True)

In [None]:
# Number of columns currently in the frame (the second entry of its shape).
transactions.shape[1]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled to the [0, 1] range before being fed to the autoencoder.
SCALED_COLUMNS = [
    'TRANSACTION_AMOUNT',
    'TRANSACTION_TYPE',
    'TRANSACTION_SOURCE',
    'ORIGIN_CUSTOMER_ID',
    'ORIGIN_ACCOUNT_ID',
    'Day',
    'Hour',
    'Minute',
]

# MinMaxScaler scales every column independently, so a single fit over all
# columns yields the same values as fitting column by column. Fitting once
# also means `scaler` keeps the min/max of every column (re-fitting the same
# object per column would retain only the last column's parameters), so the
# identical scaling can later be applied to new data or inverted.
scaler = MinMaxScaler()
transactions[SCALED_COLUMNS] = scaler.fit_transform(transactions[SCALED_COLUMNS])

In [None]:
# Persist the preprocessed frame so the cleaning steps need not be repeated.
# NOTE(review): to_csv also writes the row index as an extra leading column by
# default; pass index=False if readers of this file do not expect it — confirm
# against whatever consumes transactionData_clean.csv.
transactions.to_csv('data/transactionData_clean.csv')

# Autoencoder

In [None]:
# Symmetric dense autoencoder: 8 inputs are squeezed through progressively
# narrower tanh layers down to a 2-unit ReLU bottleneck, then expanded back
# to 8 outputs through tanh layers.
input_layer = Input(shape=(8, ))

# Encoder: every tanh layer carries an L1 activity penalty of 10e-5 (= 1e-4);
# a fresh regularizer instance is created per layer.
encoder = input_layer
for n_units in (7, 6, 5, 4, 3):
    encoder = Dense(n_units, activation="tanh",
                    activity_regularizer=regularizers.l1(10e-5))(encoder)
# Bottleneck: 2 units, ReLU, no regularizer.
encoder = Dense(2, activation="relu")(encoder)

# Decoder: mirror of the encoder, without activity regularization.
decoder = encoder
for n_units in (3, 4, 5, 6, 7, 8):
    decoder = Dense(n_units, activation='tanh')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

In [None]:
from keras import optimizers

# Adam hyper-parameters, spelled out explicitly (including the very small
# learning rate of 1e-6) rather than relying on library defaults.
adam_settings = dict(lr=1e-6, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
adam = optimizers.Adam(**adam_settings)

# The network is trained to reproduce its input, scored by mean squared error.
autoencoder.compile(loss='mean_squared_error', optimizer=adam)

In [None]:
# The autoencoder is declared with Input(shape=(8,)), so feed it exactly the
# eight min-max-scaled columns. Taking `transactions.values` wholesale would
# also pull in the columns added by the customer merge (CUSTOMER_ID,
# CUSTOMER_RISK_SCORE and the string columns CUSTOMER_TYPE / COUNTRY), giving
# an object-dtype matrix of the wrong width.
FEATURE_COLUMNS = [
    'TRANSACTION_AMOUNT',
    'TRANSACTION_TYPE',
    'TRANSACTION_SOURCE',
    'ORIGIN_CUSTOMER_ID',
    'ORIGIN_ACCOUNT_ID',
    'Day',
    'Hour',
    'Minute',
]
X_train = transactions[FEATURE_COLUMNS].values.astype('float64', copy=False)

In [None]:
# run on gpu0
# Restricts the CUDA devices visible to this process to device 0.
# NOTE(review): CUDA_VISIBLE_DEVICES is normally only honoured if it is set
# before the CUDA runtime is initialised; TensorFlow/Keras are imported (and the
# model is built) in earlier cells — confirm the setting still takes effect here.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
# Training schedule.
nb_epoch = 1
batch_size = 8

# Callbacks: checkpoint the model to model.h5 (best-only) and log to ./logs
# for TensorBoard.
checkpointer = ModelCheckpoint(filepath="model.h5", verbose=0, save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=True)

# Input and target are the same matrix: the network learns to reconstruct
# its input. 20% of the rows are held out for validation.
fit_result = autoencoder.fit(
    X_train, X_train,
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)
# Keep only the per-epoch metrics dictionary.
history = fit_result.history

# Prediction and Confusion Matrix

In [None]:
# Reconstruct every row and score it by its mean squared reconstruction
# error across the feature axis.
predictions = autoencoder.predict(X_train)
mse = np.power(X_train - predictions, 2).mean(axis=1)

# NOTE(review): this pairs one error per row of X_train with labels['Class'];
# nothing visible here checks that `labels` has the same number and order of
# rows as the transactions — confirm the alignment.
error_df = pd.DataFrame(dict(reconstruction_error=mse,
                             true_class=labels['Class']))
error_df.describe()

In [None]:
# Reconstruction-error cut-off: rows whose error exceeds it are predicted as
# class 1 in the confusion-matrix cell, and it is drawn as the red line in the
# error plot.
# NOTE(review): 0.075 appears to be hand-picked — confirm how it was chosen.
threshold = 0.075

In [None]:
# Scatter the reconstruction error of every row, one series per true class,
# with the decision threshold drawn as a horizontal red line.
groups = error_df.groupby('true_class')
fig, ax = plt.subplots()

for class_value, class_rows in groups:
    series_label = "Minor" if class_value == 1 else "Normal"
    ax.plot(class_rows.index, class_rows.reconstruction_error,
            marker='o', ms=3.5, linestyle='', label=series_label)

# Span the threshold line across the x-range the scatter ended up with.
x_left, x_right = ax.get_xlim()
ax.hlines(threshold, x_left, x_right, colors="r", zorder=100, label='Threshold')
ax.legend()

plt.title("Reconstruction error for different classes")
plt.ylabel("Reconstruction error")
plt.xlabel("Data point index")
plt.show();

In [None]:
from sklearn.metrics import confusion_matrix

# Predict class 1 for every row whose reconstruction error exceeds the
# threshold. The comparison is vectorized instead of looping over every row
# in Python.
y_pred = (error_df.reconstruction_error.values > threshold).astype(int)

# labels=[0, 1] pins the matrix to a 2x2 layout in that order, so it always
# matches the hard-coded tick labels below even if a class is absent.
conf_matrix = confusion_matrix(error_df.true_class, y_pred, labels=[0, 1])

plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=[0, 1], yticklabels=[0, 1], annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()