In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime

# Pre-Processing

In [2]:
# Importing datasets

customers_df = pd.read_csv('https://raw.githubusercontent.com/moonpay/data-science-challenge/master/data/customers.csv')
transactions_df = pd.read_csv('https://raw.githubusercontent.com/moonpay/data-science-challenge/master/data/transactions.csv', 
                              dtype={"id": str, "transaction_amount_usd": float, "fiat_currency": str,
                                    "crypto_currency": str, "transaction_type": str, "failure_reason": str, "transaction_status": str,
                                    "transaction_device": str, "transaction_browser": str, "card_id": str, "card_type": str,
                                    "card_country": str, "is_transaction_fraud": bool})
tx_customers_df = pd.read_csv('https://raw.githubusercontent.com/moonpay/data-science-challenge/master/data/transactions_customers.csv')

In [3]:
# Renaming columns to simplify joins

customers_df = customers_df.rename(columns={"id": "customer_id"})
transactions_df = transactions_df.rename(columns={"id": "transaction_id"})

In [4]:
# Joining datasets together into a single dataframe

new_df = pd.merge(tx_customers_df, customers_df, how="left", on="customer_id")
new_df = pd.merge(new_df, transactions_df, how="left", on="transaction_id")
new_df = new_df.sort_values(by=["transaction_time"])

# Forcing timestamp fields into correct data format

new_df[['customer_created_at', 'transaction_time']] = new_df[['customer_created_at', 'transaction_time']].apply(pd.to_datetime)

# 3791 transactions in dataset with negative $ amount transactions 
# forcing to absolute values as this should not be possible
new_df['transaction_amount_usd'] = abs(new_df['transaction_amount_usd'])

# Dataset Analysis and Trends

In [5]:
new_df

Unnamed: 0,transaction_id,customer_id,customer_created_at,customer_country,customer_state,year_of_birth,transaction_time,transaction_amount_usd,fiat_currency,crypto_currency,transaction_type,failure_reason,transaction_status,transaction_device,transaction_browser,card_id,card_type,card_country,is_transaction_fraud
37247,40fe2b35-f143-4033-b705-2e59afba8d30,8a459a27-ba00-4636-b4c5-3b2e60c2b2d9,2021-04-27 08:08:05+00:00,FRA,,1973.0,2021-05-01 01:35:12+00:00,107.221914,EUR,ETH,CARD,,COMPLETED,ANDROID MOBILE,CHROME,bffd36bf-f585-497b-a31b-4da593b23982,DEBIT,IRL,False
36565,049f20ee-f64c-4b24-92eb-da3cc345a219,f97a5071-e30f-4915-bc2a-02f8fef155d9,2021-04-25 10:56:40+00:00,USA,ND,1979.0,2021-05-01 02:04:23+00:00,277.084124,USD,XLM,CARD,,COMPLETED,IPHONE,SAFARI,3fd817fc-8fcc-40aa-9808-fef450fa65bc,DEBIT,USA,False
31539,ac8c6b48-49c9-4b22-a744-a0f141a4bc3a,a3de4909-b759-455b-9209-5475ba813df5,2021-04-24 20:13:23+00:00,CAN,ON,1981.0,2021-05-01 02:06:31+00:00,994.890354,USD,BTC,CARD,CARD DECLINED,FAILED,ANDROID MOBILE,CHROME,eeb8d3b0-2bb7-4a49-8798-27e43ee483cd,DEBIT,CAN,False
36418,5e170ca7-f51c-4718-ac28-f78a0f32d402,2b67c504-dc8c-4d19-8ec4-81cb856c7f9a,2021-04-22 21:27:38+00:00,USA,SD,1989.0,2021-05-01 02:12:45+00:00,100.949269,EUR,USDT,CARD,,COMPLETED,IPHONE,SAFARI,7ff0cf0d-1aaa-4c30-b962-27adf4760b89,DEBIT,USA,False
36211,ce08fee6-9b6f-4082-b9dd-e3f356f033b2,96854e26-969a-4336-af06-c3cc415af341,2021-04-27 12:07:01+00:00,USA,CA,1995.0,2021-05-01 02:19:24+00:00,14.556417,USD,BTC,CARD,,COMPLETED,DESKTOP,CHROME,b3a1dc32-9d17-4624-80c1-7dc05d91836d,DEBIT,USA,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15555,7e95db53-6f23-4d26-8c43-bdea47e8e58c,b0b5d4ae-76cb-40f5-92ff-c1eb999fcb73,2021-04-23 23:45:27+00:00,CAN,ON,1997.0,2021-06-02 22:04:24+00:00,20.598635,USD,BTC,CARD,,FAILED,DESKTOP,CHROME,679032d1-4965-4f29-8465-db9c73a706c6,DEBIT,USA,False
55718,4d7a5ba7-8c87-46e4-9cb3-095f83102304,313e1c30-7711-4ea8-a606-ccc61e608baf,2021-05-21 22:25:15+00:00,USA,FL,1997.0,2021-06-02 22:23:21+00:00,982.039618,USD,ETH,CARD,,COMPLETED,ANDROID MOBILE,CHROME,98770fc4-d835-48b3-bb19-dfe3423a50cd,DEBIT,USA,False
55676,a4ff6085-64f0-4831-93ca-5eb71cb279f7,6a25eb46-1299-44af-8728-79911ad1a772,2021-05-02 08:14:44+00:00,GBR,,1987.0,2021-06-02 22:31:30+00:00,93.676025,GBP,DOGE,CARD,,COMPLETED,ANDROID MOBILE,OTHER,58038f3c-ff5a-4272-b072-510aa6e2b898,DEBIT,GBR,False
15464,3fa6ea9f-1675-4e3c-9990-3c08f482e38c,21688281-18d9-4f6f-9409-38a40efc4a52,2021-05-14 18:46:39+00:00,CAN,MB,1983.0,2021-06-02 22:34:49+00:00,216.221572,USD,BTC,CARD,,FAILED,ANDROID MOBILE,CHROME,ec0356e8-f4b5-48b5-b218-c076db228621,DEBIT,CAN,False


In [6]:
# Determining unique values in each column of dataframe

unique_values = new_df.nunique(dropna=False)
print(unique_values)

transaction_id            104674
customer_id                18865
customer_created_at        18773
customer_country               6
customer_state                62
year_of_birth                 70
transaction_time          102452
transaction_amount_usd    104674
fiat_currency                 11
crypto_currency               11
transaction_type               2
failure_reason                 4
transaction_status             2
transaction_device             5
transaction_browser            5
card_id                    26231
card_type                      3
card_country                  61
is_transaction_fraud           2
dtype: int64


In [7]:
new_df.describe()

Unnamed: 0,year_of_birth,transaction_amount_usd
count,104664.0,104674.0
mean,1985.708907,414.137628
std,11.902749,733.429817
min,1933.0,0.000305
25%,1979.0,61.171117
50%,1988.0,156.245638
75%,1995.0,457.885704
max,2003.0,11983.005305


In [8]:
# Cross Tabulation to understand correlation between column values and likelihood of fraud
def tabulator(df):

    # Drop columns with high number of unique values
    df = df.drop(['transaction_id', 'customer_id', 'customer_created_at', 'transaction_time',
                 'transaction_amount_usd', 'card_id'], axis=1)

    for feature in df:
    
        tab = pd.crosstab(
        index=df[feature],
        columns=df['is_transaction_fraud'])

        print(tab)

In [9]:
completed_df = new_df.loc[new_df['transaction_status'] == 'COMPLETED']

In [10]:
tabulator(completed_df)

is_transaction_fraud  False  True
customer_country                 
CAN                    7267     5
ESP                     950    16
FRA                    4125    20
GBR                   14536    67
USA                   43729  1507
is_transaction_fraud  False  True
customer_state                   
AB                     1044     1
AK                      152     0
AL                      776     5
AR                      299     5
AZ                     1079    62
...                     ...   ...
WA                     1356   151
WI                      450     3
WV                       40     3
WY                      111     3
YT                        1     0

[61 rows x 2 columns]
is_transaction_fraud  False  True
year_of_birth                    
1933.0                   26     0
1934.0                    6     0
1936.0                   23    19
1938.0                   11     0
1939.0                    4     2
...                     ...   ...
1999.0                 19

In [11]:
fraud_df = completed_df.loc[new_df['is_transaction_fraud'] == True]
non_fraud_df = completed_df.loc[new_df['is_transaction_fraud'] == False]

In [12]:
# Summary stats on transaction size for fraud, non fraud and all transactions

mean_fraud_tx_vol = fraud_df['transaction_amount_usd'].mean()
median_fraud_tx_vol = fraud_df['transaction_amount_usd'].median()
mean_non_fraud_tx_vol = non_fraud_df['transaction_amount_usd'].mean()
median_non_fraud_tx_vol = non_fraud_df['transaction_amount_usd'].median()
mean_all_tx = new_df['transaction_amount_usd'].mean()
median_all_tx = new_df['transaction_amount_usd'].median()

print('mean fraud tx vol: ', mean_fraud_tx_vol)
print('median fraud tx vol: ', median_fraud_tx_vol)
print('mean non fraud tx vol: ', mean_non_fraud_tx_vol)
print('median non fraud tx vol: ', median_non_fraud_tx_vol)
print('mean all tx: ', mean_all_tx)
print('median all tx: ', median_all_tx)

mean fraud tx vol:  569.6185234918778
median fraud tx vol:  434.8741018012128
mean non fraud tx vol:  327.6322635899929
median non fraud tx vol:  123.28583797394154
mean all tx:  414.1376275482916
median all tx:  156.2456377174519


In [13]:
# Summary stats on customer DOB for fraud, non fraud and all transactions

mean_dob_fraud = fraud_df['year_of_birth'].mean()
mean_dob_non_fraud = non_fraud_df['year_of_birth'].mean()
mean_dob_all_tx = new_df['year_of_birth'].mean()

print('mean dob fraud: ', mean_dob_fraud)
print('mean dob non fraud: ', mean_dob_non_fraud)
print('mean dob all tx: ', mean_dob_all_tx)

mean dob fraud:  1984.7919504643962
mean dob non fraud:  1985.6693531253097
mean dob all tx:  1985.7089065963464


# Dataset Observations:


- Over 32k transactions (~30% of dataset) are incomplete and thus cannot be used for test/train data (though can be used to compute other features)
- Dataset is heavily skewed towards fraud coming from USA - both in terms of customer country, card country, and fiat source
- Dataset is heavily skewed towards fraud coming from BTC purchases
- Cards are much riskier than Apple Pay
- Minimal difference in risk between using mobile vs non mobile device options
- Firefox safest browser
- No real difference in risk between using credit or debit card
- Only card countries that have fraudulent transactions are those that have a corresponding user country within the dataset - using a card from outside US/CA/GB/FR/ES has a fraud rate of 0%
- No real difference between fraud/non fraud transactions in terms of mean/median user age - both centre around 1985, ~35 to 36 years old at time of transactions

# Highest Risk Customer Profile

- USA user, based in Washington State
- USA card country
- Transacting in USD
- Buying BTC
- Spending above both the mean and the median for all transaction volumes
- Using a card not apple pay
- Card is a debit card
- Using chrome or safari

# Feature Engineering Functions

In [14]:
def card_country_mismatch(row):
    
    """
    
    Returns a boolean value based on whether the user's country matches the country of their payment method
    
    """
    
    if row['customer_country'] != row['card_country']:
        return 1
    else:
        return 0

In [15]:
def account_age_at_transfer(row):
    
    """
    
    Returns the time difference, in days, between the date a user signs up, and the date a transaction is made.
    
    """
    
    account_age = (row['transaction_time'] - row['customer_created_at']) / np.timedelta64(1, 'D')
    
    return account_age

In [16]:
def customer_age_at_transfer(row):
    
    """
    
    Returns a user's age, in years, at the time of transaction.
    
    """
    
    customer_age = row['transaction_time'].year - row['year_of_birth']
    
    return customer_age

In [17]:
def completed_tx(value):
    
    """
    
    Returns a value of 1 if a transaction is completed, and a value of 0 if failed; utilised later to calculate
    user transaction history.
    
    """
    
    if value == 'COMPLETED':
        return 1
    else:
        return 0

In [18]:
def failed_tx(value):
    
    """
    
    Returns a value of 1 if a transaction fails, and a value of 0 if it is successful; utilised later to calculate
    user transaction history.
    
    """
    
    if value == 'FAILED':
        return 1
    else:
        return 0

In [19]:
def card_declines(value):
    
    """
    
    Returns a value of 1 if a transaction fails as a result of a card decline, and a value of 0 if it is successful; utilised later to calculate
    user transaction history.
    
    """
    
    if value == 'CARD DECLINED':
        return 1
    else:
        return 0

In [20]:
def data_leakage_cleanup(value):
    
    """
    
    Function designed to operate on the transaction history related functions outlined above.
    Subtracts 1 from the current transaction's values for each field, as information on whether a transaction
    is successful/fails is not available at the time of transaction, only prior history is available.
    
    Example: For a transaction that is the 6th successful completed transaction, we are only aware of the 5
    prior successes at the time of processing/decision making on whether to accept transaction - the outcome is
    a future data point.
    
    Sets a floor of 0 for each field, as it is not possible to have negative values for transaction history fields.
    
    """
    
    if value == 0:
        return value
    else:
        return value - 1

In [21]:
def boolean_conversion(value):
    
    if value == True:
        return 1
    else:
        return 0

# Dataframe Cleaning Functions

In [22]:
def dataset_formatter(df):
    
    """
    
    Function designed to make necessary changes to the base dataframes and generate the new features by
    manipulating existing datapoints into new information.
    
    """

    df['card_country_mismatch'] = df.apply(lambda row: card_country_mismatch(row), axis=1)
    df['account_age_at_transfer'] = df.apply(lambda row: account_age_at_transfer(row), axis=1)
    df['customer_age'] = df.apply(lambda row: customer_age_at_transfer(row), axis=1)
    
    # Combining device and browser into a single concatenated feature due to inherent links between them,
    # and to reduce the number of columns requiring one hot encoding.
    # Also combining transaction type + card type into single feature
    
    df['device_browser_combo'] = df['transaction_device'] + " - " + df['transaction_browser']
    df['payment_method_type'] = df['transaction_type'] + " - " + df['card_type']
    
    # Features to compare user volume against wider cohort based on recent prior transactions.
    # Smaller window is used for country level feature due to reduced sample size.
    # rolling averages are computed using the parameter closed=left to ensure most recent data point (current tx)
    # is not included in the window to avoid data leakage
    
    df['rolling_avg_100_tx'] = df['transaction_amount_usd'].rolling(100, min_periods=100, closed='left').mean()
    df['avg_volume_by_country_10'] = df.sort_values(by=['transaction_time'], ascending=True).groupby('customer_country')['transaction_amount_usd'].rolling(10, min_periods = 10, closed='left').mean().reset_index(drop=True, level=0)
    df['transaction_vol_multiple'] = df['transaction_amount_usd'] / df['rolling_avg_100_tx']
    df['country_transaction_vol_multiple'] = df['transaction_amount_usd'] / df['avg_volume_by_country_10']
    
    df['completed_tx'] = df['transaction_status'].apply(completed_tx)
    df['failed_tx'] = df['transaction_status'].apply(failed_tx)
    df['card_declined'] = df['failure_reason'].apply(card_declines)
    df['n_complete_tx'] = df.sort_values(by=['transaction_time'], ascending=True).groupby('customer_id')['completed_tx'].cumsum()
    df['n_failed_tx'] = df.sort_values(by=['transaction_time'], ascending=True).groupby('customer_id')['failed_tx'].cumsum()
    df['n_card_declines'] = df.sort_values(by=['transaction_time'], ascending=True).groupby('customer_id')['card_declined'].cumsum()
    
    # Features on card ID level - possible for both users to change to new payment methods,
    # and for payment methods to be shared across users
    
    df['card_id_complete_tx'] = df.sort_values(by=['transaction_time'], ascending=True).groupby('card_id')['completed_tx'].cumsum()
    df['card_id_failed_tx'] = df.sort_values(by=['transaction_time'], ascending=True).groupby('card_id')['failed_tx'].cumsum()
    df['card_id_declines'] = df.sort_values(by=['transaction_time'], ascending=True).groupby('card_id')['card_declined'].cumsum()
    
    # Ensures that only past information is used to predict the current transaction
    df['n_complete_tx'] = df['n_complete_tx'].apply(data_leakage_cleanup)
    df['n_failed_tx'] = df['n_failed_tx'].apply(data_leakage_cleanup)
    df['n_card_declines'] = df['n_card_declines'].apply(data_leakage_cleanup)
    df['card_id_complete_tx'] = df['card_id_complete_tx'].apply(data_leakage_cleanup)
    df['card_id_failed_tx'] = df['card_id_failed_tx'].apply(data_leakage_cleanup)
    df['card_id_declines'] = df['card_id_declines'].apply(data_leakage_cleanup)
    
    
    # Convert true false values into binary
    df['is_transaction_fraud'] = df['is_transaction_fraud'].apply(boolean_conversion)
    
    return df

In [23]:
def extract_labels(df):
    
    """
    
    Extracts the outcome from the column is_transaction_fraud to a new dataframe for use in modelling.
    
    """
    
    y = df['is_transaction_fraud']
    
    return y

In [24]:
def dataset_dropper(df):

    """
    Drops unnecessary columns from the dataframe that will not be used as features (including those
    where value has been duplicated in another column (e.g. age vs year of birth).
    
    Removes records where the transaction did not succeed, as these should not be used for modelling, as there
    is no way to determine if they would have been actually fraudulent or not - though they can be used
    for feature computation elsewhere prior to removal.
    
    Removes values too early in the dataset to return values for features that require computing rolling averages.
    
    """
    
    df = df.drop(['transaction_id', 'customer_id', 'year_of_birth', 'customer_created_at',
                 'transaction_time', 'completed_tx', 'failed_tx', 'card_declined',
                  'transaction_type', 'card_type',
                  'transaction_device', 'transaction_browser',
                  'card_id', 'rolling_avg_100_tx', 'avg_volume_by_country_10',                  
                  ], axis=1)
    
    df = df.drop(df[df.transaction_status == 'FAILED'].index)
    df = df.drop(['failure_reason', 'transaction_status'], axis=1)
    df = df[df['transaction_vol_multiple'].notna()]
    df = df[df['country_transaction_vol_multiple'].notna()]
    df = df[df['customer_age'].notna()] # removes small number of rows where field returns NaN
    
    return df

In [25]:
def dummy_formatter(df):
    
    df = pd.get_dummies(df, columns=['fiat_currency', 'crypto_currency', 'card_country', 'device_browser_combo', 'payment_method_type'], 
                        prefix=['fiat', 'crypto', 'card_country', 'db', 'pm'], 
                        dummy_na=False)
    df = pd.get_dummies(df, columns=['customer_country', 'customer_state'],
                        prefix=['customer_country','state'],
                        dummy_na=True)
    
    return df

In [26]:
new_df = dataset_formatter(new_df)
new_df = dataset_dropper(new_df)
new_df

Unnamed: 0,customer_country,customer_state,transaction_amount_usd,fiat_currency,crypto_currency,card_country,is_transaction_fraud,card_country_mismatch,account_age_at_transfer,customer_age,device_browser_combo,payment_method_type,transaction_vol_multiple,country_transaction_vol_multiple,n_complete_tx,n_failed_tx,n_card_declines,card_id_complete_tx,card_id_failed_tx,card_id_declines
36534,USA,TX,65.697968,USD,XLM,USA,0,0,6.794815,32.0,ANDROID MOBILE - CHROME,CARD - DEBIT,0.154592,0.167987,0,0,0,0,0,0
37000,CAN,QC,202.495435,USD,BNB,CAN,0,0,0.770671,31.0,IPHONE - SAFARI,CARD - CREDIT,0.476951,0.533630,0,0,0,0,0,0
36701,GBR,,594.939468,USD,BNB,GBR,0,0,6.557257,42.0,IPHONE - SAFARI,CARD - DEBIT,1.437107,1.695361,0,0,0,0,0,0
37021,USA,IL,216.266991,USD,ETH,USA,0,0,9.841551,34.0,IPHONE - SAFARI,CARD - DEBIT,0.515799,0.552401,0,0,0,0,0,0
36370,USA,MI,41.478257,USD,USDT,USA,0,0,9.912998,29.0,DESKTOP - SAFARI,CARD - DEBIT,0.098669,0.100613,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56203,FRA,,168.790865,USD,BNB,FRA,0,0,31.105046,48.0,ANDROID MOBILE - CHROME,CARD - DEBIT,0.466553,0.363123,2,0,0,2,0,0
55302,USA,TX,932.935655,USD,BTC,USA,0,0,9.091447,47.0,ANDROID MOBILE - CHROME,CARD - DEBIT,2.733744,1.526336,12,12,9,12,12,9
55718,USA,FL,982.039618,USD,ETH,USA,0,0,11.998681,24.0,ANDROID MOBILE - CHROME,CARD - DEBIT,2.814806,1.449653,10,5,3,10,5,3
55676,GBR,,93.676025,GBP,DOGE,GBR,0,0,31.594977,34.0,ANDROID MOBILE - OTHER,CARD - DEBIT,0.262097,0.672390,17,15,9,17,15,9


In [27]:
# Extracting labels into new dataframe and dropping from dataset to be used for modelling. Adding dummy variables
# for categorical data

labels = extract_labels(new_df)
pre_dummy_df = new_df.drop(['is_transaction_fraud'], axis=1)
final_df = dummy_formatter(pre_dummy_df)
final_df

Unnamed: 0,transaction_amount_usd,card_country_mismatch,account_age_at_transfer,customer_age,transaction_vol_multiple,country_transaction_vol_multiple,n_complete_tx,n_failed_tx,n_card_declines,card_id_complete_tx,...,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,state_YT,state_nan
36534,65.697968,0,6.794815,32.0,0.154592,0.167987,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
37000,202.495435,0,0.770671,31.0,0.476951,0.533630,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36701,594.939468,0,6.557257,42.0,1.437107,1.695361,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
37021,216.266991,0,9.841551,34.0,0.515799,0.552401,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36370,41.478257,0,9.912998,29.0,0.098669,0.100613,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56203,168.790865,0,31.105046,48.0,0.466553,0.363123,2,0,0,2,...,0,0,0,0,0,0,0,0,0,1
55302,932.935655,0,9.091447,47.0,2.733744,1.526336,12,12,9,12,...,1,0,0,0,0,0,0,0,0,0
55718,982.039618,0,11.998681,24.0,2.814806,1.449653,10,5,3,10,...,0,0,0,0,0,0,0,0,0,0
55676,93.676025,0,31.594977,34.0,0.262097,0.672390,17,15,9,17,...,0,0,0,0,0,0,0,0,0,1


In [28]:
final_df = final_df.reset_index(drop=True)
labels = labels.reset_index(drop=True)

In [29]:
# Showing all variables in final dataframe ready for modelling after one hot encoding with dummy variables
# for categorical features

column_list = final_df.columns.tolist()
column_list

['transaction_amount_usd',
 'card_country_mismatch',
 'account_age_at_transfer',
 'customer_age',
 'transaction_vol_multiple',
 'country_transaction_vol_multiple',
 'n_complete_tx',
 'n_failed_tx',
 'n_card_declines',
 'card_id_complete_tx',
 'card_id_failed_tx',
 'card_id_declines',
 'fiat_AUD',
 'fiat_BGN',
 'fiat_CAD',
 'fiat_CHF',
 'fiat_EUR',
 'fiat_GBP',
 'fiat_HKD',
 'fiat_RON',
 'fiat_TRY',
 'fiat_TWD',
 'fiat_USD',
 'crypto_BCH',
 'crypto_BNB',
 'crypto_BTC',
 'crypto_DOGE',
 'crypto_ETH',
 'crypto_LTC',
 'crypto_TRX',
 'crypto_USDC',
 'crypto_USDT',
 'crypto_XLM',
 'crypto_XRP',
 'card_country_AFG',
 'card_country_ALB',
 'card_country_ARE',
 'card_country_AUS',
 'card_country_AUT',
 'card_country_BEL',
 'card_country_BGR',
 'card_country_BHR',
 'card_country_BLR',
 'card_country_BMU',
 'card_country_CAN',
 'card_country_CHE',
 'card_country_CHL',
 'card_country_COL',
 'card_country_CYM',
 'card_country_CZE',
 'card_country_DEU',
 'card_country_ESP',
 'card_country_EST',
 'car

# Modelling

## Classifiers

In [30]:
# Generating classifiers

logreg = LogisticRegression(max_iter=10000)
knn = KNeighborsClassifier(n_neighbors=5)
forest = RandomForestClassifier()

In [31]:
def run_model(model, x, y):
    
    """
    
    Receives a model configuration, and generates a test train split before running the model. Returns
    summary statistics about model performance, as well as the model predictions, and how confident the
    model was when making that prediction.
    
    """
    
    
    # Min Max Scaling numeric columns
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    
    # Shuffle is set to false because model is time bounded - cannot use future transactions to predict those from the past
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle=False)
    
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    y_pred_prob = model.predict_proba(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    predictions = []
    for i in y_pred_prob:
        predictions.append(i[1])
    
    
    # Pull feature importance values for algorithms that support this
    if model == logreg:
        importances = model.coef_
        return accuracy, precision, recall, f1, auc, y_pred, y_test, predictions, importances
    elif model == forest:
        importances = model.feature_importances_
        return accuracy, precision, recall, f1, auc, y_pred, y_test, predictions, importances
    else:
        return accuracy, precision, recall, f1, auc, y_pred, y_test, predictions

In [32]:
def show_stats(results):
    
    """
    
    Prints out summary statistics obtained from run_model() function.
    
    """
    
    print("Accuracy:", results[0])
    print("Precision:", results[1])
    print("Recall", results[2])
    print("F1 Score:", results[3])
    print("AUC:", results[4])

In [33]:
def prediction_df(results):
    
    df = pd.DataFrame(data={'predicted': results[5], 'actual': results[6], 'prediction_confidence': results[7]})
    
    return df

# Logistic Regression

In [34]:
regression_model = run_model(logreg, final_df, labels)

In [35]:
importances = regression_model[8]

In [36]:
importances = importances[0]

In [37]:
logreg_importance = pd.DataFrame({'feature': column_list, 'coefficient': importances})
logreg_importance['absolute_coefficient'] = abs(logreg_importance['coefficient'])
logreg_importance = logreg_importance.sort_values(by='absolute_coefficient', ascending=False)
logreg_importance

Unnamed: 0,feature,coefficient,absolute_coefficient
2,account_age_at_transfer,-6.611448,6.611448
6,n_complete_tx,2.628407,2.628407
25,crypto_BTC,2.515608,2.515608
157,state_OR,2.476048,2.476048
17,fiat_GBP,2.145253,2.145253
...,...,...,...
79,card_country_RKS,-0.000008,0.000008
69,card_country_MYS,0.000005,0.000005
113,customer_country_nan,0.000000,0.000000
20,fiat_TRY,0.000000,0.000000


In [38]:
show_stats(regression_model)

Accuracy: 0.9677665326493831
Precision: 0.30120481927710846
Recall 0.05787037037037037
F1 Score: 0.09708737864077671
AUC: 0.526862868478025


In [39]:
predictions = prediction_df(regression_model)

In [40]:
# Displaying predictions vs actual for transactions in test set that were actually fraud,
# with prediction confidence that transaction is fraudulent;
# For most transactions, model displays high confidence that transactions are not fraud == high false negative rate
# and poor performance.

with pd.option_context('display.max_rows', None):
    df = predictions.loc[predictions['actual'] == 1]
    print(df)

       predicted  actual  prediction_confidence
57720          0       1               0.165397
57822          0       1               0.159203
57853          0       1               0.054143
57859          0       1               0.426836
57867          0       1               0.171792
57870          0       1               0.180482
57909          0       1               0.208864
57928          0       1               0.026813
57981          0       1               0.186591
57982          0       1               0.129817
58066          0       1               0.080166
58079          0       1               0.090876
58087          0       1               0.119376
58096          0       1               0.454126
58101          0       1               0.008828
58163          0       1               0.124174
58219          0       1               0.206162
58389          0       1               0.142819
58391          0       1               0.113810
58419          0       1               0

# K Nearest Neighbours

In [41]:
knn_model = run_model(knn, final_df, labels)

In [42]:
show_stats(knn_model)

Accuracy: 0.9688756412033828
Precision: 0.4405594405594406
Recall 0.14583333333333334
F1 Score: 0.21913043478260874
AUC: 0.5700582987947216


In [43]:
knn_pred = prediction_df(knn_model)

In [44]:
"""

Model performs slightly better versus Linear Regression. Still performs poorly and allows too many false negatives.
Potentially could be improved by increasing the number of neighbours used to form classification.

"""

with pd.option_context('display.max_rows', None):
    df = knn_pred.loc[knn_pred['actual'] == 1]
    print(df)

       predicted  actual  prediction_confidence
57720          1       1                    1.0
57822          1       1                    1.0
57853          0       1                    0.2
57859          1       1                    1.0
57867          1       1                    1.0
57870          1       1                    1.0
57909          0       1                    0.2
57928          0       1                    0.0
57981          1       1                    1.0
57982          0       1                    0.2
58066          0       1                    0.4
58079          0       1                    0.0
58087          0       1                    0.2
58096          1       1                    0.8
58101          0       1                    0.2
58163          0       1                    0.2
58219          0       1                    0.2
58389          0       1                    0.0
58391          0       1                    0.2
58419          0       1                

# Random Forest

In [45]:
forest_model = run_model(forest, final_df, labels)

In [46]:
show_stats(forest_model)

Accuracy: 0.9737279911271316
Precision: 0.9491525423728814
Recall 0.12962962962962962
F1 Score: 0.2281059063136456
AUC: 0.5647076260196169


In [47]:
forest_pred = prediction_df(forest_model)

In [48]:
"""

Model performs better again than prior algorithms - precision is significantly better. 
Still performs poorly and allows too many false negatives as denoted by low level of recall.

"""

with pd.option_context('display.max_rows', None):
    df = forest_pred.loc[forest_pred['actual'] == 1]
    print(df)

       predicted  actual  prediction_confidence
57720          1       1                   0.73
57822          1       1                   0.57
57853          0       1                   0.38
57859          1       1                   0.79
57867          0       1                   0.26
57870          0       1                   0.29
57909          1       1                   0.63
57928          0       1                   0.11
57981          0       1                   0.23
57982          0       1                   0.42
58066          0       1                   0.10
58079          0       1                   0.06
58087          0       1                   0.50
58096          0       1                   0.42
58101          0       1                   0.11
58163          0       1                   0.20
58219          0       1                   0.05
58389          0       1                   0.07
58391          1       1                   0.57
58419          0       1                

In [49]:
importances = forest_model[8]
importances = importances

In [50]:
forest_importance = pd.DataFrame({'feature': column_list, 'coefficient': importances})
forest_importance['absolute_coefficient'] = abs(forest_importance['coefficient'])
forest_importance = forest_importance.sort_values(by='absolute_coefficient', ascending=False)
forest_importance

Unnamed: 0,feature,coefficient,absolute_coefficient
2,account_age_at_transfer,0.122068,0.122068
0,transaction_amount_usd,0.090076,0.090076
3,customer_age,0.081105,0.081105
4,transaction_vol_multiple,0.077600,0.077600
5,country_transaction_vol_multiple,0.072233,0.072233
...,...,...,...
48,card_country_CYM,0.000000,0.000000
113,customer_country_nan,0.000000,0.000000
43,card_country_BMU,0.000000,0.000000
42,card_country_BLR,0.000000,0.000000
