# Bank account fraud detection
Daniel Mizrahi (10675418), Antonio La Chira Marquina (11847018)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from data_analysis import *
from read_data import read_dataset
from classifiers import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from performance import *

## Parsing the data
We start by reading in the data. Because some of the data is categorical, and therefore non-numerical, we use one-hot encoding to process it, so that it does not cause problems when implementing our classifiers. Although the gain is perhaps marginal, when converting the categorical data we drop the first column so that there is less data to perform calculations on later. The original data, without one-hot encoding, can still be used to draw conclusions about the data.

In [None]:
from read_data import read_dataset

# Both frames are loaded from the same CSV file; only the read_dataset flags differ.
DATASET_PATH = 'datasets/Base.csv'

# Unprocessed frame, used by the exploratory plots further down.
data_original = read_dataset(DATASET_PATH, process=False)
# Processed frame; per the notebook text this is the one-hot encoded variant
# with the first encoded column dropped (read_dataset itself is defined in
# read_data, so the exact processing should be confirmed there).
data = read_dataset(DATASET_PATH, process=True, drop_first=True)

In [None]:
# Top-left corner of the processed frame: first 10 rows, first 4 columns.
preview = data.iloc[:10, :4]
print('Preview of the one hot encoded data in table form (first 10 rows and 4 columns):')
print(preview)

# Report the shape of each frame as (rows, columns).
original_shape = data_original.shape
encoded_shape = data.shape
print(f'\nOrignal data shape (rows, columns): {original_shape}')
print(f'One hot encoded data shape (rows, columns): {encoded_shape}')

In [None]:
# Plot the distribution of every column of the unprocessed data:
#   * boolean columns     -> bar chart with False/True ticks
#   * categorical columns -> bar chart of category counts
#   * all other columns   -> histogram with mean and +/- one standard deviation

# Number of histogram bins for the continuous columns
# (matplotlib also accepts a binning strategy name such as 'auto').
k = 50

categorical_data = ['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
boolean_data = ['fraud_bool', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'keep_alive_session', 'device_fraud_count']

fig, axs = plt.subplots(16, 2, figsize=(16,100))
column_names = list(data_original.columns.values)
# zip() stops at the shorter sequence, so a frame with fewer columns than
# axes leaves the surplus axes empty instead of raising an IndexError.
for column, (name, ax) in enumerate(zip(column_names, axs.flat)):
    values = data_original.iloc[:, column]
    ax.set_xlabel(name)
    ax.set_ylabel('Count')
    if name in boolean_data:
        labels, counts = np.unique(values, return_counts=True)
        ax.bar(labels, counts, width=1)
        # A boolean column may hold a single distinct value, in which case
        # np.unique returns one label only. Pin both tick positions so the
        # two tick labels always match the number of ticks.
        ax.set_xticks([0, 1])
        ax.set_xticklabels(['False', 'True'])
    elif name in categorical_data:
        labels, counts = np.unique(values, return_counts=True)
        ax.bar(labels, counts)
    else:
        ax.hist(values, k)
        mean = np.mean(values)
        standard_deviation = np.std(values)
        mean_line = ax.axvline(mean, color='g')
        # Skip the lower marker when it would fall below the smallest observed value.
        if mean - standard_deviation > np.amin(values):
            ax.axvline(mean - standard_deviation, color='r')
        std_line = ax.axvline(mean + standard_deviation, color='r')
        # Pass the handles explicitly so each label is tied to the intended
        # line rather than to whatever artist happens to come first.
        ax.legend([mean_line, std_line],
                  [f'mean (approx {round(mean, 2)})', f'standard deviation (approx {round(standard_deviation, 2)})'])

In [None]:
# Overlay the per-column distributions of the fraud and no-fraud rows.
# Bar heights are fractions of each group and histograms are drawn with
# density=True, so the two groups are comparable despite different sizes.

# Bin count used for every histogram in this cell.
k = 50

categorical_data = ['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
boolean_data = ['fraud_bool', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'keep_alive_session', 'device_fraud_count']

fraud_flag = data_original['fraud_bool']
data_fraud = data_original.loc[fraud_flag == 1]
data_no_fraud = data_original.loc[fraud_flag == 0]

fig, axs = plt.subplots(16, 2, figsize=(16,100))
column_names = list(data_original.columns.values)
for column, ax in enumerate(axs.flat):
    name = column_names[column]
    fraud_values = data_fraud.iloc[:, column]
    no_fraud_values = data_no_fraud.iloc[:, column]
    groups = ((fraud_values, 'Fraud'), (no_fraud_values, 'No fraud'))
    is_boolean = name in boolean_data
    is_continuous = not is_boolean and name not in categorical_data

    ax.set_xlabel(name)
    ax.set_ylabel('Count')

    if is_continuous:
        for values, group in groups:
            ax.hist(values, k, label=group, density=True, alpha=.6)
    else:
        # Boolean bars are drawn at full unit width; categorical bars keep
        # matplotlib's default width.
        bar_options = {'width': 1} if is_boolean else {}
        for values, group in groups:
            categories, counts = np.unique(values, return_counts=True)
            ax.bar(categories, counts / len(values), label=group, alpha=.6, **bar_options)
        if is_boolean:
            ax.set_xticks([0, 1])
            ax.set_xticklabels(['False', 'True'])

    ax.legend()
    if is_continuous:
        # Rebuild the legend so it also lists the two group means.
        handles, labels = ax.get_legend_handles_labels()
        mean_f = np.mean(fraud_values)
        mean_nf = np.mean(no_fraud_values)
        mn_f = ax.axvline(mean_f, color='g')
        mn_nf = ax.axvline(mean_nf, color='k')
        mean_labels = [f'mean fraud (approx {round(mean_f, 2)})', f'mean no fraud(approx {round(mean_nf, 2)})']
        ax.legend(handles + [mn_f, mn_nf], labels + mean_labels)

## Similarities and differences of variables between fraudulent and non-fraudulent samples

In [None]:
# Bootstrap comparison: for every continuous column, plot the bootstrap
# distribution of the mean (left) and of the standard deviation (right)
# and mark where the statistic of the fraud rows falls.
data_fraud = data_original.loc[data_original['fraud_bool'] == 1]
size = data_fraud.shape[0]
iterations = 10000
categorical_data = ['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
boolean_data = ['fraud_bool', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards',
                'foreign_request', 'keep_alive_session', 'device_fraud_count']
columns_exclude = categorical_data + boolean_data
# bootstrap_temp comes from one of the star imports at the top of the notebook.
# As indexed below, each entry of `test` is laid out as
# (column name, bootstrap means, bootstrap standard deviations,
#  p-value for the mean, p-value for the standard deviation).
test = bootstrap_temp(data_original, data_fraud, size, iterations, columns_exclude=columns_exclude)

k = 50
fig, axs = plt.subplots(len(test), 2, figsize=(16,100))
for column, ax in enumerate(axs.flat):
    # Two axes per variable: even index -> mean, odd index -> standard deviation.
    row, is_std = divmod(column, 2)
    entry = test[row]
    name = entry[0]
    samples = entry[1 + is_std]
    p_value = entry[3 + is_std]

    ax.set_xlabel(name + (' standard deviation' if is_std else ' mean'))
    ax.set_ylabel('Count')
    _, _, histogram = ax.hist(samples, k)

    fraud_statistic = data_fraud[name].std() if is_std else data_fraud[name].mean()
    bootstrap_mean = np.mean(samples)
    fraud_line = ax.axvline(fraud_statistic, color='g')
    mean_line = ax.axvline(bootstrap_mean, color='r')
    # Black lines mark the 2.5th and 97.5th percentiles of the bootstrap samples.
    percentile_line = ax.axvline(np.percentile(samples, 2.5), color='black')
    ax.axvline(np.percentile(samples, 97.5), color='black')

    # Explicit handles: a labels-only legend assigns labels in artist order,
    # which would attach 'Bootstrap samples' to a percentile line instead of
    # the histogram.
    ax.legend([fraud_line, mean_line, histogram, percentile_line],
              [f'Fraud (approx {round(fraud_statistic, 2)}, $p$-value {p_value})',
               f'Bootstrap mean (approx {round(bootstrap_mean, 2)})',
               'Bootstrap samples',
               '2.5th / 97.5th percentile'])

Note that the two-sample test checks whether the two data samples come from the same distribution.
This does not specify what that common distribution is (e.g. whether it's normal or not normal).

In [None]:
# Two-sample comparison of the fraud and no-fraud rows, one statistic per
# continuous column, printed as tables ordered from largest to smallest.
cont_data = [
    'income', 'name_email_similarity', 'prev_address_months_count',
    'current_address_months_count', 'days_since_request', 'intended_balcon_amount',
    'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w',
    'bank_branch_count_8w', 'credit_risk_score', 'bank_months_count',
    'session_length_in_minutes',
]

# Split by label and keep only the continuous columns in one selection.
data_fraud = data.loc[data['fraud_bool'] == 1, cont_data]
data_no_fraud = data.loc[data['fraud_bool'] == 0, cont_data]

# Both helpers come from the notebook's star imports; they are used here as
# returning one statistic per column, in the order of cont_data.
ks_statistics = kolmogorov_smirnov_similarity(data_fraud, data_no_fraud)
ep_statistics = epps_singleton_similarity(data_fraud, data_no_fraud)
# Negating before argsort yields the column positions from the largest
# statistic down to the smallest.
ks_most_similar = np.argsort(-ks_statistics)
ep_most_similar = np.argsort(-ep_statistics)

print('Kolmogorov-Smirnov similarity sorted descending (1.0 means complete dissimilarity and 0.0 complete similarity):')
ks_table = [[cont_data[position], ks_statistics[position]] for position in ks_most_similar]
print(pd.DataFrame(ks_table, columns=['Variable', 'KS statistic']))

print('\nEpps-Singleton similarity sorted descending (lower is better):')
ep_table = [[cont_data[position], ep_statistics[position]] for position in ep_most_similar]
print(pd.DataFrame(ep_table, columns=['Variable', 'EP statistic']))


In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Correlation matrices over all columns of the processed data
# (both helpers come from the notebook's star imports).
pearson_correlations = pearson_correlation_coefficient(data)
spearman_correlations = spearman_correlation_coefficient(data)

fig, ax = plt.subplots(2, 1, figsize=(24,40))

# One tick per column on both axes of each heatmap.
tick_positions = np.arange(len(data.columns))
panels = [
    (ax[0], pearson_correlations, 'Heatmap of pearson correlations between all parameters'),
    (ax[1], spearman_correlations, 'Heatmap of spearman correlations between all parameters'),
]

# Draw and label both heatmaps, remembering each image for its colorbar.
images = []
for panel, correlations, title in panels:
    images.append(panel.imshow(np.array(correlations), cmap=plt.cm.rainbow))
    panel.set_title(title, fontsize=20)
    panel.set_xticks(tick_positions)
    panel.set_yticks(tick_positions)
    panel.set_xticklabels(data.columns, rotation=90)
    panel.set_yticklabels(data.columns)

# Add a horizontal colorbar axes underneath each heatmap.
for (panel, _, _), image in zip(panels, images):
    colorbar_axes = make_axes_locatable(panel).new_vertical(size='5%', pad=2.5, pack_start=True)
    fig.add_axes(colorbar_axes)
    fig.colorbar(image, cax=colorbar_axes, orientation='horizontal')

plt.show()

In [None]:
# NOTE(review): `scaler` is created here but not applied in the cells visible
# in this notebook section — confirm whether the features were meant to be scaled.
scaler = MinMaxScaler()

# Target column first, followed by the features the author picked as the
# strongest positively correlated parameters.
params = ['fraud_bool', 'device_os_windows', 'credit_risk_score', 'proposed_credit_limit', 'customer_age']

# Restrict the processed data to those columns and convert to one array.
reg_df = data[params]
reg_values = reg_df.values

# Column 0 ('fraud_bool') is the target; the remaining columns are the features.
y = reg_values[:, 0]
X = reg_values[:, 1:]

# 70% of the rows go to training, the rest to testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=5)

In [None]:
# Instantiate the three classifiers (classes come from the notebook's star imports).
clf_log = LogisticRegressionClassifier()
clf_nb = NaiveBayesClassifier()
clf_knn = KNearestNeighborsClassifier()

# Fit each one on the same training split, in the order log, nb, knn.
for classifier in (clf_log, clf_nb, clf_knn):
    classifier.fit(X_train, y_train)

In [None]:
#### Test ####
# The notebook only uses `from sklearn... import ...` forms, so the bare name
# `sklearn` is not bound; import precision_score directly instead of calling
# it as sklearn.metrics.precision_score.
from sklearn.metrics import precision_score

# Predict on the held-out split with each fitted classifier.
y_pred_log = clf_log.predict(X_test)
y_pred_nb = clf_nb.predict(X_test)
y_pred_knn = clf_knn.predict(X_test)

predictions = {'log': y_pred_log, 'nb': y_pred_nb, 'knn': y_pred_knn}

# Accuracy of every classifier first, then precision, in the same order.
for model_name, y_pred in predictions.items():
    print(f"Accuracy {model_name}: ", accuracy_score(y_test, y_pred))

for model_name, y_pred in predictions.items():
    print(f"Precision {model_name}: ", precision_score(y_test, y_pred))