In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import time

# Apply seaborn's "whitegrid" theme to every figure drawn after this point.
sns.set_style("whitegrid")
# Request a one-colour "muted" palette and unpack its single entry into `blue`.
[blue] = sns.color_palette("muted", 1)

In [None]:
# Read the complete payment dataset from disk (path is relative to the notebook's
# working directory) and show it as the cell output.
JPfraud_full = pd.read_csv(filepath_or_buffer="../data/fraud_payment_data")
JPfraud_full

In [None]:
# Keep only rows with a strictly positive USD amount and renumber the index from 0.
# Per the original author's note, $0 transactions carry no fraud labels.
# The `> 0` test also discards negative and missing amounts, if any exist.
has_positive_amount = JPfraud_full["USD_amount"] > 0
JPfraud = JPfraud_full.loc[has_positive_amount].reset_index(level=0, drop=True)

In [None]:
# Parse the textual `Time_step` column into a datetime `timestamp` column
# (strict 'YYYY-MM-DD HH:MM:SS' format), then drop the original string column.
# NOTE: this cell is not re-runnable on its own, because `Time_step` no longer
# exists in JPfraud once it has executed.
JPfraud = (
    JPfraud
    .assign(timestamp=pd.to_datetime(JPfraud['Time_step'], format='%Y-%m-%d %H:%M:%S'))
    .drop(columns=['Time_step'])
)

In [None]:
# Count fraud and non-fraud transactions per transaction type.
# `trans_types` keeps the order in which the types first appear in the data;
# `fraud_number[k]` and `non_fraud_number[k]` both refer to `trans_types[k]`.
trans_types = JPfraud['Transaction_Type'].unique()

type_column = JPfraud['Transaction_Type']
is_non_fraud = JPfraud['Label'] == 0

# Fraud count = sum of Label within the type (Label is treated as a 0/1 flag here).
fraud_number = np.array(
    [JPfraud.loc[type_column == kind, 'Label'].sum() for kind in trans_types],
    dtype=float,
)
# Non-fraud count = number of rows of that type whose Label equals 0.
non_fraud_number = np.array(
    [(is_non_fraud & (type_column == kind)).sum() for kind in trans_types],
    dtype=float,
)

In [None]:
# Bar chart: number of fraudulent transactions for each transaction type.
fig, ax = plt.subplots()

ax.bar(trans_types, fraud_number, color='firebrick', edgecolor='black')
ax.set_xlabel('Transaction Types', fontsize=10)
ax.set_ylabel('Number of fraud transactions', fontsize=11)
# One tick per type, labelled with the type name and rotated so long names fit.
ax.set_xticks(np.arange(len(trans_types)))
ax.set_xticklabels(trans_types, rotation=75)
ax.set_title('Number of Fraud Transactions by Type', fontsize=14)
fig.tight_layout()
fig.savefig('Fraud_number_vs_transaction_types.png')
plt.show()

In [None]:
# Grouped bar chart: non-fraud vs. fraud transaction counts for each transaction type,
# with a red annotation marking the transaction type that has the most fraud.
plt.figure()

w = 0.4  # width of one bar; each type gets two bars centred on its tick
x_pos = np.arange(len(trans_types))

plt.bar(x_pos - w/2, non_fraud_number, label='non-fraud', edgecolor='black', width=w)
plt.bar(x_pos + w/2, fraud_number, alpha=0.5, color='red', width=w,
        label='fraud', edgecolor='darkred', linewidth=1.2)

# The annotation is derived from the data rather than hard-coded, so the marker,
# the reference line and the printed count stay correct if the dataset changes.
if len(fraud_number) > 0:
    top_idx = int(np.argmax(fraud_number))   # position of the type with most fraud
    top_count = fraud_number[top_idx]
    marker_x = top_idx + w/2                 # horizontal centre of that fraud bar

    # Dashed horizontal reference line at the highest fraud count.
    plt.hlines(y=top_count, xmin=-0.5, xmax=len(trans_types)-0.5,
               linewidth=0.75, color='red', linestyle='--')
    # Leader line from the top of the fraud bar up to the text label.
    # NOTE(review): 245000 / 250000 are y positions tuned by hand for this
    # dataset's scale -- revisit if the non-fraud counts change substantially.
    plt.vlines(x=marker_x, ymin=top_count, ymax=245000,
               linewidth=0.75, color='red', linestyle='-')
    plt.text(max(marker_x - 0.95, -0.5), 250000,
             'Highest fraud count: %d' % top_count, fontsize=8, color='red')

plt.xlabel('Transaction Types',fontsize=10)
plt.ylabel('Number of transactions',fontsize=11)
plt.xticks(x_pos, trans_types, rotation=75)
plt.title('Number of Transaction Types: Fraud vs. Non-Fraud',fontsize=14)
plt.legend()
plt.tight_layout()
plt.savefig('fraud_and_non-fraud_vs_transaction_types.png')
plt.show()

In [None]:
# For every transaction, compute the time elapsed since the previous transaction of
# the same sender (`sender_time_btwn`) and of the same beneficiary (`bene_time_btwn`).
# The first row seen for an account has no predecessor, so its gap is set to a zero
# timedelta instead of NaT.
# Per the original note, rows whose account value is NaN keep NaT; this relies on
# groupby leaving NaN keys out of every group -- TODO confirm on the pandas version in use.
# NOTE(review): `diff()` follows the current row order, so this presumably expects
# rows to be in chronological order within each account -- confirm.
for account_col, gap_col in (('Sender_Account', 'sender_time_btwn'),
                             ('Bene_Account', 'bene_time_btwn')):
    stamps_by_account = JPfraud.groupby(account_col)['timestamp']
    is_first_seen = stamps_by_account.cumcount().eq(0)
    JPfraud[gap_col] = stamps_by_account.diff().mask(is_first_seen, dt.timedelta(0))

In [None]:
# Histogram of fraudulent sender transactions by time since the sender's previous
# transaction. Bucket k covers gaps in [k, k+1) hours and is drawn at x = k + 1, so
# the first bar holds gaps under one hour, the second bar gaps of one to two hours,
# and so on. Only the first 26 hours are shown; longer gaps and NaT gaps fall in no bucket.
# NOTE(review): rows that were given a zero gap when `sender_time_btwn` was built
# (an account's first appearance) satisfy `>= 0 hours` and so are counted in the
# first bar -- confirm that this is intended.

n_bins = 26
one_hour = dt.timedelta(hours=1)
sender_gap = JPfraud['sender_time_btwn']

# Sum of Label (used as the fraud flag) over the rows of each one-hour bucket.
trans_btwn_counts = np.array(
    [JPfraud.loc[(sender_gap >= k * one_hour) & (sender_gap < (k + 1) * one_hour), 'Label'].sum()
     for k in range(n_bins)],
    dtype=float,
)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(np.arange(1, n_bins + 1), trans_btwn_counts, edgecolor='black')
ax.set_xlabel('Hours between transactions', fontsize=11)
ax.set_ylabel('Number of fraud sender transactions', fontsize=11)
ax.set_title('Number of Fraudulent Sender Transactions vs. Time Between Transactions', fontsize=14)
ax.set_xticks(np.arange(n_bins) + 1)
fig.savefig('sender_hrs_after_fraud.png')
plt.show()

In [None]:
# Histogram of ALL sender transactions by time since the sender's previous
# transaction. Bucket k covers gaps in [k, k+1) hours and is drawn at x = k + 1.
# Only the first 26 hours are shown; longer gaps and NaT gaps fall in no bucket.
# NOTE(review): rows that were given a zero gap when `sender_time_btwn` was built
# (an account's first appearance) are counted in the first bar -- confirm intended.

n_bins = 26
one_hour = dt.timedelta(hours=1)
sender_gap = JPfraud['sender_time_btwn']

# Number of rows whose gap falls inside each one-hour bucket.
all_trans_btwn_counts = np.array(
    [((sender_gap >= k * one_hour) & (sender_gap < (k + 1) * one_hour)).sum()
     for k in range(n_bins)],
    dtype=float,
)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(np.arange(1, n_bins + 1), all_trans_btwn_counts, edgecolor='black')
ax.set_xlabel('Hours between transactions', fontsize=11)
ax.set_ylabel('Number of all sender transactions', fontsize=11)
ax.set_xticks(np.arange(n_bins) + 1)
ax.set_title('All Sender Transactions vs. Time Between Transactions', fontsize=14)
fig.savefig('sender_hrs_btwn_all_transactions.png')
plt.show()

In [None]:
# Histogram of fraudulent beneficiary transactions by time since the beneficiary's
# previous transaction. Bucket i covers gaps in [i, i+1) hours and is drawn at
# x = i + 1. Only the first 48 hours are shown; longer gaps and NaT gaps fall in no bucket.
# NOTE(review): rows that were given a zero gap when `bene_time_btwn` was built
# (an account's first appearance) are counted in the first bar -- confirm intended.

trans_btwn_bene_counts = np.zeros(48)

for i in np.arange(48):
    # Sum of Label (used as the fraud flag) over rows whose gap lies in [i, i+1) hours.
    trans_btwn_bene_counts[i] = JPfraud[((JPfraud.bene_time_btwn >= dt.timedelta(hours=float(i))) &
                                    (JPfraud.bene_time_btwn < dt.timedelta(hours=float(i+1))))].Label.sum()

plt.figure(figsize=(12,4))
plt.bar(np.linspace(1,48,48), trans_btwn_bene_counts, edgecolor='black')
# Font sizes match the other time-between figures in this notebook.
plt.xlabel('Hours between transactions',fontsize=11)
plt.ylabel('Number of bene fraud transactions',fontsize=11)
# One tick per bar (1..48); the previous 50 ticks left two empty positions on the axis.
plt.xticks(np.arange(48)+1)
plt.title('Number of Fraudulent Beneficiary Transactions vs. Time Between Transactions',fontsize=14)
# Distinct file name (mirrors 'sender_hrs_after_fraud.png'): the previous name was the
# same one the all-beneficiary figure is saved under, so this image was overwritten.
plt.savefig('bene_hrs_after_fraud.png')
plt.show()

In [None]:
# Histogram of ALL beneficiary transactions by time since the beneficiary's previous
# transaction. Bucket i covers gaps in [i, i+1) hours and is drawn at x = i + 1.
# Only the first 48 hours are shown; longer gaps and NaT gaps fall in no bucket.
# NOTE(review): rows that were given a zero gap when `bene_time_btwn` was built
# (an account's first appearance) are counted in the first bar -- confirm intended.

trans_btwn_all_bene_counts = np.zeros(48)

for i in np.arange(48):
    # Number of rows whose gap lies in [i, i+1) hours.
    trans_btwn_all_bene_counts[i] = len(JPfraud[((JPfraud.bene_time_btwn >= dt.timedelta(hours=float(i))) &
                                    (JPfraud.bene_time_btwn < dt.timedelta(hours=float(i+1))))])

plt.figure(figsize=(12,4))
plt.bar(np.linspace(1,48,48), trans_btwn_all_bene_counts, edgecolor='black')
plt.xlabel('Hours between transactions',fontsize=11)
plt.ylabel('Number of bene transactions',fontsize=11)
# One tick per bar (1..48); the previous 50 ticks left two empty positions on the axis.
plt.xticks(np.arange(48)+1)
plt.title('All Beneficiary Transactions vs. Time Between Transactions',fontsize=14)
plt.savefig('bene_hrs_btwn_all_transactions.png')
plt.show()