<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Training" data-toc-modified-id="Training-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Load-Trained-Model" data-toc-modified-id="Load-Trained-Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load Trained Model</a></span></li><li><span><a href="#Visualise-Training" data-toc-modified-id="Visualise-Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Visualise Training</a></span></li><li><span><a href="#Predict-Anomaly" data-toc-modified-id="Predict-Anomaly-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Predict Anomaly</a></span></li></ul></div>

In [None]:
# Mount Google Drive into the Colab runtime; the dataset, checkpoints and
# training history used below are all read from '/content/drive/My Drive/Project'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import warnings
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
from numpy.random import seed
from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector, LeakyReLU
from keras.models import Model
from keras import regularizers
from datetime import datetime
from sklearn.model_selection import train_test_split
from IPython.display import display, Markdown, HTML
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

In [None]:
sns.set(color_codes=True)
%matplotlib inline

In [None]:
# Seed NumPy's global RNG. No other library's generator is seeded in this cell,
# so Keras weight initialisation may still vary between runs — TODO confirm.
np.random.seed(1234)

In [None]:
# Prevent warnings from distracting the reader
warnings.filterwarnings('ignore')

# Colour scheme and style selected
theme = ['#1F306E', '#553772', '#8F3B76', '#C7417B', '#F5487F']
# sns.palplot() only draws the swatches and returns None, so keep the palette
# object itself; otherwise sns.set_palette() below receives None and the theme
# is never applied.
colors_palette = sns.color_palette(theme)
sns.palplot(colors_palette)
plt.style.use('seaborn')
sns.set(style="white", color_codes=True)
sns.set_palette(colors_palette)

# Forces Matplotlib to use high-quality images
# Configure the inline backend by name instead of relying on it being the last
# entry of ip.configurables, which depends on extension load order.
ip = get_ipython()
ip.run_line_magic('config', "InlineBackend.figure_formats = {'pdf', 'png'}")

In [None]:
%autosave 60

In [None]:
# Newline helper; not referenced by any other cell visible in this notebook.
nl = "\n"

In [None]:
# Create the working directories. exist_ok=True keeps the cell idempotent and
# avoids the check-then-create race of testing os.path.exists() first.
os.makedirs('../data', exist_ok=True)  # create data directory
os.makedirs('../models', exist_ok=True)  # create trained models directory

In [None]:
##################################
# Define LOG
##################################
def log ():
    """Print a ``[LOG <timestamp>]`` marker with the current local date and time.

    Takes no arguments and returns ``None``; the timestamp is rendered with the
    default string form of ``datetime.now()``.
    """
    timestamp = datetime.now()
    print(f'[LOG {timestamp}]')

In [None]:
log()

# Load the training transactions into the notebook kernel from the mounted
# Drive folder; the commented line is the local-path alternative.
# ori_dataset = pd.read_csv('../data/train.csv')
ori_dataset = pd.read_csv('/content/drive/My Drive/Project/train.csv')
# Report the number of rows and columns that were read.
print(F'Transactional dataset of {ori_dataset.shape[0]} rows and {ori_dataset.shape[1]} columns loaded')

In [None]:
log()
# Split the "ground-truth" label off from the features. pop() removes the
# column from ori_dataset in place, so re-running this cell without reloading
# the data raises a KeyError.
label = ori_dataset.pop('label')

In [None]:
# One-hot encode the categorical attributes.
log()

categorical_attr_names = [
    'WAERS',
    'BUKRS',
    'KTOSL',
    'PRCTR',
    'BSCHL',
    'HKONT',
]

# Each categorical column is expanded into binary indicator columns.
ori_dataset_categ_transformed = pd.get_dummies(
    ori_dataset.loc[:, categorical_attr_names]
)

In [None]:
# Log-scale and min-max normalise the "DMBTR" and "WRBTR" amount attributes.
log()

numeric_attr_names = ['DMBTR', 'WRBTR']

# A small epsilon is added first so that zero amounts survive the logarithm.
numeric_attr = np.log(ori_dataset[numeric_attr_names] + 1e-7)

# Rescale each column to the range [0,1] using its own minimum and maximum.
ori_dataset_numeric_attr = (
    (numeric_attr - numeric_attr.min())
    / (numeric_attr.max() - numeric_attr.min())
)

In [None]:
log ()
# Merge the one-hot encoded categorical block and the scaled numeric block
# column-wise (axis=1) into a single feature frame.
ori_subset_transformed = pd.concat([ori_dataset_categ_transformed, ori_dataset_numeric_attr], axis = 1)

# Show (rows, columns) of the pre-processed transactional data.
ori_subset_transformed.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
df = ori_subset_transformed.copy()

In [None]:
# Fit a min-max scaler on the full feature frame and transform it in one step.
# df_ss is what the model consumes; column names and index are later taken
# from df when predictions are wrapped back into DataFrames.
ss = MinMaxScaler()
df_ss = ss.fit_transform(df)

In [None]:
from keras.layers import Input, Dense
from keras.models import Model

In [None]:
# Size of the bottleneck (latent) layer.
encoding_dim = 2

# Input tensor: one unit per pre-processed feature column.
input_data = Input(shape=(df_ss.shape[1],))

# Encoder: 512 -> 256 -> 128 -> 64 -> 32 -> 16 -> 4 -> encoding_dim, all ReLU.
# Only the first layer carries the L1 activity penalty (10e-5, i.e. 1e-4).
encoded = Dense(512, activation='relu', activity_regularizer=regularizers.l1(10e-5))(input_data)
for units in (256, 128, 64, 32, 16, 4, encoding_dim):
    encoded = Dense(units, activation='relu')(encoded)

# Decoder: mirror of the encoder, ending in a sigmoid layer as wide as the input.
decoded = encoded
for units in (4, 16, 32, 64, 128, 256, 512):
    decoded = Dense(units, activation='relu')(decoded)
decoded = Dense(df_ss.shape[1], activation='sigmoid')(decoded)

# Full autoencoder (input -> reconstruction) and the encoder half
# (input -> latent code). A stand-alone decoder model is not built here.
autoencoder = Model(input_data, decoded)
encoder = Model(input_data, encoded)

autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
print(f'Training data shape {df_ss.shape}')

In [None]:
# Stop training once val_loss has not improved for 5 consecutive epochs.
# In this notebook it is only referenced by the commented-out fit() call.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')

In [None]:
# checkpoint
# filepath="../models/04_autoencoder_knn/{epoch:04}.hdf5"
filepath="/content/drive/My Drive/Project/deep_ae/{epoch:04}.hdf5"
# Make sure the checkpoint folder exists; some Keras versions do not create it
# and fail on the first save.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# save_best_only=False writes one weights file per epoch (zero-padded epoch
# number in the name). val_loss is a quantity to minimise, so the comparison
# mode is 'min' (it was 'max', which would pick the worst epoch if
# save_best_only were ever switched on).
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=False, mode='min')

In [None]:
# Log per-epoch metrics to history.csv next to the checkpoints. append=True
# means repeated training runs add rows to the same file instead of replacing it.
# csv_logger = CSVLogger("../models/04_autoencoder_knn/history.csv", append=True)
csv_logger = CSVLogger("/content/drive/My Drive/Project/deep_ae/history.csv", append=True)

In [None]:
autoencoder.summary()

## Training

In [None]:
# epochs = 10000
# batch_size = 64
# autoencoder_history = autoencoder.fit(
#     df_ss, 
#     df_ss, 
#     epochs = epochs,
#     batch_size = batch_size,
#     shuffle = False,
#     verbose = 2,
#     validation_split = (1 / 3),
#     callbacks = [
#                  early_stopping, 
#                  checkpoint, 
#                  csv_logger
#                  ]
# ).history

## Load Trained Model

In [None]:
# Load the training history CSV (the file the CSVLogger above is pointed at)
# instead of re-training; the commented line is the local-path alternative.
# autoencoder_history = pd.read_csv("../models/04_autoencoder_knn/history.csv")
autoencoder_history = pd.read_csv("/content/drive/My Drive/Project/deep_ae/history.csv")

In [None]:
# Restore the weights from the checkpoint file named 0125 (the checkpoint
# filename pattern is the zero-padded epoch number) into the architecture built
# above, then compile and summarise the model. The commented line is the
# local-path alternative.
# autoencoder.load_weights(f"../models/04_autoencoder_knn/0070.hdf5")
autoencoder.load_weights(f"/content/drive/My Drive/Project/deep_ae/0125.hdf5")
autoencoder.compile (optimizer = 'adam', loss = 'binary_crossentropy')
autoencoder.summary()

## Visualise Training

In [None]:
# Training vs. validation loss per epoch, read from the logged history CSV.
fig, ax = plt.subplots(figsize = (16,9), dpi = 72)
ax.plot(autoencoder_history['loss'], 'b', label='Train', linewidth = 2)
ax.plot(autoencoder_history['val_loss'], 'r', label = 'Validation', linewidth = 2)
ax.set_title('Model Loss', fontsize = 20)
# The model is compiled with binary cross-entropy, so label the axis accordingly
# (it previously read "MSE").
ax.set_ylabel('Loss (binary cross-entropy)')
ax.set_xlabel('Epoch')
# ax.set_xlim(5,20)
# ax.set_ylim(0.02,0.03)
ax.legend(loc='best')
# plt.show without parentheses only references the function; call it.
plt.show()

In [None]:
# Reconstruct every training row with the autoencoder and wrap the result in a
# frame that shares df's column names and index.
X_pred_train = pd.DataFrame(
    autoencoder.predict(df_ss),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Per-row reconstruction error. Despite the column name 'loss_mse', the value
# is the mean ABSOLUTE difference between reconstruction and input.
autoencoder_scored = pd.DataFrame(index=df.index)
autoencoder_scored['loss_mse'] = (X_pred_train - df_ss).abs().mean(axis=1)

In [None]:
# Histogram (20 bins) with KDE overlay of the per-row reconstruction error.
fig, ax = plt.subplots(figsize=(16, 9), dpi=72)
ax.set_title('Loss Distribution', fontsize=20)
sns.distplot(autoencoder_scored['loss_mse'], bins=20, kde=True, color='blue', ax=ax)
# ax.set_xlim([0.0, 0.02])
ax.set_xticks(np.arange(0, 0.05, 0.005));

## Predict Anomaly

In [None]:
# Cut-off on the per-row reconstruction error: rows scoring strictly above this
# value are flagged as anomalies in the cells below.
threshold = 0.02

In [None]:
# Reconstruct every row and keep df's column names and index on the result.
X_pred = pd.DataFrame(
    autoencoder.predict(df_ss),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Score table, one row per transaction:
#   loss_mse     - mean absolute reconstruction error (name kept as-is)
#   threshold    - the constant cut-off
#   pred_anomaly - True where the error exceeds the cut-off
#   label        - ground-truth label, aligned on the index
autoencoder_scored = (
    pd.DataFrame(index=df.index)
    .assign(
        loss_mse=(X_pred - df_ss).abs().mean(axis=1),
        threshold=threshold,
    )
    .assign(
        pred_anomaly=lambda scored: scored['loss_mse'] > scored['threshold'],
        label=label,
    )
)

In [None]:
# True positives: flagged rows whose label is not "regular", counted per label.
autoencoder_scored.loc[
    autoencoder_scored['pred_anomaly'] & (autoencoder_scored['label'] != "regular"),
    'label',
].value_counts()

In [None]:
# False positives: flagged rows whose label is "regular". Count by label, like
# the other three confusion-matrix cells; counting distinct loss values instead
# does not give a usable cell total.
autoencoder_scored[(autoencoder_scored.pred_anomaly == True) & (autoencoder_scored.label == "regular")].label.value_counts()

In [None]:
# False negatives: unflagged rows whose label is not "regular", counted per label.
autoencoder_scored.loc[
    ~autoencoder_scored['pred_anomaly'] & (autoencoder_scored['label'] != "regular"),
    'label',
].value_counts()

In [None]:
# True negatives: unflagged rows whose label is "regular".
autoencoder_scored.loc[
    ~autoencoder_scored['pred_anomaly'] & (autoencoder_scored['label'] == "regular"),
    'label',
].value_counts()

|                      | Actual<br>Anomaly | Actual<br>Regular |
|----------------------|-------------------|-------------------|
| Predicted<br>Anomaly | 100               | 0                 |
| Predicted<br>Regular | 0                 | 532,909           |

In [1]:
# Record when the run started; the elapsed time is reported as "Time Taken"
# in the HTML results file written at the end of the notebook.
from datetime import datetime
start_time = datetime.now()

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import os
import csv
import warnings
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
from numpy.random import seed
from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector, LeakyReLU
from keras.models import Model
from keras import regularizers
from datetime import datetime
from sklearn.model_selection import train_test_split
from IPython.display import display, Markdown, HTML
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger

from keras.layers import Input, Dense
from keras.models import Model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

Using TensorFlow backend.


In [4]:
sns.set(color_codes=True)
%matplotlib inline

In [5]:
# Seed NumPy's global RNG. No other library's generator is seeded in this cell,
# so Keras weight initialisation may still vary between runs — TODO confirm.
np.random.seed(1234)

In [6]:
# Prevent warnings from distracting the reader
warnings.filterwarnings('ignore')   

In [7]:
%autosave 60

Autosaving every 60 seconds


In [8]:
# Newline helper; not referenced by any other cell visible in this notebook.
nl = "\n"

In [9]:
# Create the working directories. exist_ok=True keeps the cell idempotent and
# avoids the check-then-create race of testing os.path.exists() first.
os.makedirs('../data', exist_ok=True)  # create data directory
os.makedirs('../models', exist_ok=True)  # create trained models directory

In [10]:
# load the dataset into the notebook kernel
ori_dataset = pd.read_csv('../data/test_set_d.csv')
# ori_dataset = pd.read_csv('/content/drive/My Drive/Project/test_set_b.csv')

In [11]:
# Split the "ground-truth" label off from the features. pop() removes the
# column from ori_dataset in place, so re-running this cell without reloading
# the data raises a KeyError.
label = ori_dataset.pop('label')

In [12]:
# One-hot encode the categorical attributes.
categorical_attr_names = [
    'WAERS',
    'BUKRS',
    'KTOSL',
    'PRCTR',
    'BSCHL',
    'HKONT',
]

# Each categorical column is expanded into binary indicator columns.
ori_dataset_categ_transformed = pd.get_dummies(
    ori_dataset.loc[:, categorical_attr_names]
)

In [13]:
# Log-scale and min-max normalise the "DMBTR" and "WRBTR" amount attributes.
numeric_attr_names = ['DMBTR', 'WRBTR']

# A small epsilon is added first so that zero amounts survive the logarithm.
numeric_attr = np.log(ori_dataset[numeric_attr_names] + 1e-7)

# Rescale each column to the range [0,1] using the minimum and maximum of the
# file being scanned.
ori_dataset_numeric_attr = (
    (numeric_attr - numeric_attr.min())
    / (numeric_attr.max() - numeric_attr.min())
)

In [14]:
# Merge the one-hot encoded categorical block and the scaled numeric block
# column-wise (axis=1) into a single feature frame.
ori_subset_transformed = pd.concat([ori_dataset_categ_transformed, ori_dataset_numeric_attr], axis = 1)

In [15]:
# Load the expected feature-column names: the first row of check_columns.csv
# (presumably the columns seen at training time — verify against its producer).
with open('../data/check_columns.csv', 'r', newline='', encoding='utf-8') as readFile:
# with open('/content/drive/My Drive/Project/check_columns.csv', 'r', newline='', encoding='utf-8') as readFile:
    reader = csv.reader(readFile, dialect='excel')
    # Only the first row is needed, so do not materialise the whole file.
    check_columns = next(reader, None)

# An empty file would otherwise surface later as an opaque error.
if check_columns is None:
    raise ValueError('../data/check_columns.csv is empty: no column names to check against')

In [16]:
# Add an all-zero column for every expected name the feature frame lacks.
# NOTE(review): columns are appended at the end; nothing here reorders the
# frame to match check_columns or drops names that are not listed in it.
missing_columns = [
    name for name in check_columns
    if name not in ori_subset_transformed.columns
]
for name in missing_columns:
    ori_subset_transformed[name] = 0

In [17]:
df = ori_subset_transformed.copy()

In [18]:
# Fit a min-max scaler on the feature frame and transform it in one step.
# NOTE(review): the scaler is fitted on the data being scanned rather than
# reusing one fitted on the training set — confirm this is intended.
ss = MinMaxScaler()
df_ss = ss.fit_transform(df)

In [19]:
from keras.layers import Input, Dense
from keras.models import Model

In [20]:
# Size of the bottleneck (latent) layer.
encoding_dim = 2

# Input tensor: one unit per pre-processed feature column.
input_data = Input(shape=(df_ss.shape[1],))

# Encoder: 512 -> 256 -> 128 -> 64 -> 32 -> 16 -> 4 -> encoding_dim, all ReLU.
# Only the first layer carries the L1 activity penalty (10e-5, i.e. 1e-4).
encoded = Dense(512, activation='relu', activity_regularizer=regularizers.l1(10e-5))(input_data)
for units in (256, 128, 64, 32, 16, 4, encoding_dim):
    encoded = Dense(units, activation='relu')(encoded)

# Decoder: mirror of the encoder, ending in a sigmoid layer as wide as the input.
decoded = encoded
for units in (4, 16, 32, 64, 128, 256, 512):
    decoded = Dense(units, activation='relu')(decoded)
decoded = Dense(df_ss.shape[1], activation='sigmoid')(decoded)

# Full autoencoder (input -> reconstruction) and the encoder half
# (input -> latent code). A stand-alone decoder model is not built here.
autoencoder = Model(input_data, decoded)
encoder = Model(input_data, encoded)

autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [21]:
# Stop training once val_loss has not improved for 5 consecutive epochs.
# NOTE(review): the callbacks argument of the fit() call in the next cell is
# commented out, so this callback is currently not applied.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')

In [22]:
# Train the autoencoder to reconstruct its own input (df_ss is both x and y).
# The last third of the rows is held out for validation and rows are not
# shuffled between epochs; .history keeps only the per-epoch metrics dict.
# NOTE(review): the model is fitted on the same file that is scored below, and
# early stopping is disabled, so all 100 epochs run — confirm this is intended.
epochs = 100
batch_size = 64
autoencoder_history = autoencoder.fit(
    df_ss, 
    df_ss, 
    epochs = epochs,
    batch_size = batch_size,
    shuffle = False,
    verbose = 2,
    validation_split = (1 / 3),
#     callbacks = [
#                  early_stopping, 
#                  ]
).history


Train on 22204 samples, validate on 11103 samples
Epoch 1/100
 - 9s - loss: 0.0700 - val_loss: 0.0314
Epoch 2/100
 - 6s - loss: 0.0312 - val_loss: 0.0314
Epoch 3/100
 - 7s - loss: 0.0312 - val_loss: 0.0313
Epoch 4/100
 - 6s - loss: 0.0312 - val_loss: 0.0313
Epoch 5/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 6/100
 - 7s - loss: 0.0311 - val_loss: 0.0312
Epoch 7/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 8/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 9/100
 - 7s - loss: 0.0311 - val_loss: 0.0312
Epoch 10/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 11/100
 - 7s - loss: 0.0311 - val_loss: 0.0312
Epoch 12/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 13/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 14/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 15/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 16/100
 - 7s - loss: 0.0311 - val_loss: 0.0312
Epoch 17/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoch 18/100
 - 6s - loss: 0.0311 - val_loss: 0.0312
Epoc

In [23]:
anomaly_ratio = 0.0002

In [24]:
# Number of top-scoring rows to keep: the expected anomaly share of the data
# set, truncated towards zero by int().
head = int(anomaly_ratio * df_ss.shape[0])

In [25]:
# Cut-off on the anomaly score: of the retained top-scoring rows, those at or
# above this value are reported as anomalies.
threshold = 0.019

In [26]:
# Reconstruct every row and keep df's column names and index on the result.
X_pred = pd.DataFrame(
    autoencoder.predict(df_ss),
    columns=df.columns,
    index=df.index,
)

In [27]:
# Anomaly score = mean absolute reconstruction error per row; keep only the
# `head` highest-scoring rows, ordered from highest to lowest score.
autoencoder_scored = (
    (X_pred - df_ss).abs().mean(axis=1)
    .to_frame('anomaly_score')
    .sort_values('anomaly_score', ascending=False)
    .head(head)
)

In [28]:
autoencoder_scored.sort_values('anomaly_score', ascending=False)

Unnamed: 0,anomaly_score
30016,0.019385
16131,0.019314
14972,0.019202
11781,0.019186
16651,0.019179
17043,0.019141


In [29]:
label[(label != 'regular')]

Series([], Name: label, dtype: object)

In [30]:
# Attach the cut-off and flag the rows whose score reaches it (>=).
autoencoder_scored = autoencoder_scored.assign(threshold=threshold)
autoencoder_scored = autoencoder_scored.assign(
    pred_anomaly=autoencoder_scored['anomaly_score'] >= autoencoder_scored['threshold']
)

In [31]:
# Keep only the flagged rows; the resulting Series is used for its index.
df_results = autoencoder_scored.loc[autoencoder_scored['pred_anomaly'], 'pred_anomaly']

In [32]:
# Pull the original transaction rows for the flagged entries. df_results.index
# holds index LABELS, so select with label-based .loc; positional .iloc only
# gives the same rows while ori_dataset keeps its default 0..n-1 index. The
# single-frame pd.concat wrapper was a no-op and is dropped.
df_results = ori_dataset.loc[df_results.index]

In [33]:
# Page template for the scan report. Placeholders: {n_anomalies} (row count),
# {table} (flagged transactions rendered as an HTML table), {time_taken}
# (elapsed time since start_time). The stylesheet link now sits inside <head>,
# a charset declaration matches the file encoding, and the stray '.' after
# </html> is gone.
html_string = '''
<html>
  <head>
    <meta charset="utf-8"/>
    <title>Fraud Scan Results</title>
    <link rel="stylesheet" type="text/css" href="../code/df_style.css"/>
  </head>
  <body>
    <h1 align = 'center'>Fraud Scan Results</h1>
    <p align = 'center'>Review {n_anomalies} detected issues</p>
    <p align = 'center'>{table}</p>
    <p align = 'center'>Time Taken: {time_taken}</p>
  </body>
</html>
'''

# OUTPUT AN HTML FILE
# Write as UTF-8 explicitly so non-ASCII field values do not depend on the
# platform's default encoding.
with open('../data/results.html', 'w', encoding='utf-8') as f:
    f.write(
        html_string.format(
            n_anomalies=df_results.shape[0],
            table=df_results.to_html(classes='mystyle'),
            time_taken=(datetime.now() - start_time)
        )
    )
    