In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import time

<font size=5> Load data </font>

In [None]:
# Read the raw payment records into a DataFrame. The path is relative to the
# notebook's working directory and the file name has no extension.
fraud_data_path = "../data/fraud_payment_data"
JPfraud_full = pd.read_csv(fraud_data_path)
JPfraud_full

In [None]:
# Drop the Sender_lob column, parse Time_step into a datetime column named
# 'timestamp', and discard the original Time_step column.
JPfraud_full = (
    JPfraud_full
    .drop(columns=['Sender_lob'])
    .assign(timestamp=lambda frame: pd.to_datetime(frame['Time_step'],
                                                   format='%Y-%m-%d %H:%M:%S'))
    .drop(columns=['Time_step'])
)

# Move 'timestamp' to the first column position.
timestamp_column = JPfraud_full.pop('timestamp')
JPfraud_full.insert(0, 'timestamp', timestamp_column)
JPfraud_full

In [None]:
# Truncating data for ease and for code to run quicker: keep only the first
# 500,000 rows.
# .copy() makes JPfraud an independent frame rather than a slice of
# JPfraud_full, so the column assignments in the cells below write to a real
# frame of their own (no SettingWithCopy ambiguity, no side effects on
# JPfraud_full).
JPfraud = JPfraud_full[:500000].copy()
JPfraud

In [None]:
# Helper: time-of-day -> whole seconds elapsed since midnight.
# (fyi: pandas will promote the resulting ints to float in columns that contain NaNs)
def convert_time_to_seconds(timeObj):
    """Return the number of seconds after midnight represented by ``timeObj``.

    ``timeObj`` must expose ``hour``, ``minute`` and ``second`` attributes
    (e.g. a ``datetime.time``). The result is returned as an ``int``.
    """
    minutes_since_midnight = timeObj.hour * 60 + timeObj.minute
    return int(minutes_since_midnight * 60 + timeObj.second)

<font size=6> Time between transactions feature </font>

In [None]:
# For each sender account and each beneficiary account, compute the time elapsed
# since that account's previous transaction.
# The first row in which an account appears has no predecessor, so its gap is
# set to a zero timedelta via .mask(). Rows whose Sender_Account / Bene_Account
# entry is NaN keep NaT.
for account_col, gap_col in (('Sender_Account', 'sender_time_btwn'),
                             ('Bene_Account', 'bene_time_btwn')):
    timestamps_by_account = JPfraud.groupby(account_col)['timestamp']
    is_first_occurrence = timestamps_by_account.cumcount().eq(0)
    JPfraud[gap_col] = timestamps_by_account.diff().mask(is_first_occurrence, dt.timedelta(0))

JPfraud

<font size=6> Expanding time features </font> <br>
Stats are updated at the time of each transaction and represent the account's entire history up to that transaction.

In [None]:
# Minimum number of observations a window must contain before a statistic is
# produced; passed as min_periods to the expanding quantile features below.
# Windows with fewer observations yield NaN.
min_obs = 3

In [None]:
# Suppresses pandas' warnings about chained assignment
pd.options.mode.chained_assignment = None

# Quantile levels that bound the "usual" time between consecutive transactions
lower_time_diff_quantile = 0.1
upper_time_diff_quantile = 0.9

# Expanding quantiles (in total seconds) of the time between consecutive
# transactions, computed per account over all rows seen so far.
# Each spec: (output column, account column, gap column, quantile level, groupby sort flag)
expanding_gap_specs = (
    ('sender_time_diff_min', 'Sender_Account', 'sender_time_btwn', lower_time_diff_quantile, True),
    ('sender_time_diff_max', 'Sender_Account', 'sender_time_btwn', upper_time_diff_quantile, True),
    ('bene_time_diff_min', 'Bene_Account', 'bene_time_btwn', lower_time_diff_quantile, False),
    ('bene_time_diff_max', 'Bene_Account', 'bene_time_btwn', upper_time_diff_quantile, False),
)
for out_col, account_col, gap_col, level, sort_groups in expanding_gap_specs:
    per_account_quantile = (
        JPfraud.groupby(account_col, sort=sort_groups)[gap_col]
               .apply(lambda gaps: gaps.dt.total_seconds()
                                       .expanding(method='single', min_periods=min_obs)
                                       .quantile(q=level, interpolation='midpoint'))
    )
    # Drop the account level that apply() adds so the values align on the row index.
    JPfraud[out_col] = per_account_quantile.reset_index(level=0, drop=True)

# outside_sender_time_diff_range, outside_bene_time_diff_range: 1 when the time
# since the account's previous transaction falls below the lower quantile or
# above the upper quantile computed above, otherwise 0.
gap_flag_specs = (
    ('outside_sender_time_diff_range', 'sender_time_btwn', 'sender_time_diff_min', 'sender_time_diff_max'),
    ('outside_bene_time_diff_range', 'bene_time_btwn', 'bene_time_diff_min', 'bene_time_diff_max'),
)
for flag_col, gap_col, low_col, high_col in gap_flag_specs:
    gap_seconds = JPfraud[gap_col].dt.total_seconds()
    is_outside = gap_seconds.lt(JPfraud[low_col]) | gap_seconds.gt(JPfraud[high_col])
    JPfraud = JPfraud.assign(**{flag_col: is_outside * 1})

JPfraud

In [None]:
# Suppresses pandas' warnings about chained assignment
pd.options.mode.chained_assignment = None

# Quantile levels that bound the "usual" time of day for an account
lower_time_quantile = 0.15
upper_time_quantile = 0.85

# seconds_in_day: time transaction occurred in seconds after midnight
JPfraud['seconds_in_day'] = JPfraud.timestamp.transform(lambda x: convert_time_to_seconds(x.time()))

# Expanding quantiles of the time of day (seconds after midnight), per account:
#   time_min_out / time_max_out: lower / upper quantile over SENDER transactions
#   time_min_in  / time_max_in : lower / upper quantile over BENEFICIARY transactions
# All four use interpolation='midpoint'. Previously time_max_out omitted the
# argument and silently fell back to the default 'linear' interpolation, so it
# was computed differently from the other three bounds.
expanding_time_specs = (
    ('time_min_out', 'Sender_Account', lower_time_quantile),
    ('time_max_out', 'Sender_Account', upper_time_quantile),
    ('time_min_in', 'Bene_Account', lower_time_quantile),
    ('time_max_in', 'Bene_Account', upper_time_quantile),
)
for out_col, account_col, level in expanding_time_specs:
    JPfraud[out_col] = (JPfraud.groupby(account_col, sort=False)['seconds_in_day']
                               .apply(lambda secs: secs.expanding(method='single', min_periods=min_obs)
                                                       .quantile(q=level, interpolation='midpoint'))
                               # drop the account level added by apply() so values align on the row index
                               .reset_index(level=0, drop=True)
                       )

# outside_time_bene_range:   indicator of whether transaction is outside range for beneficiary at
#                            time of transaction, 0 no, 1 yes
JPfraud = JPfraud.assign(outside_time_bene_range = lambda x: ((x.seconds_in_day < x.time_min_in) | (x.seconds_in_day > x.time_max_in))*1)

# outside_time_sender_range: indicator of whether transaction is outside usual time range for sender
#                            at time of transaction, 0 no, 1 yes
JPfraud = JPfraud.assign(outside_time_sender_range = lambda x: ((x.seconds_in_day < x.time_min_out) | (x.seconds_in_day > x.time_max_out))*1)

JPfraud

<font size=6> Rolling time features </font> <br>
Stats are updated at the time of each transaction, but only over a rolling window of time (for instance, the last 180 days). (These features are not quite generating correctly.)

In [None]:
# Length of the look-back window for the rolling statistics, given as a pandas
# offset string and passed as `window` to .rolling() in the cells below
rolling_period = '180D' # 180 days

# Minimum number of observations a window must contain before a statistic is
# produced (passed as min_periods). This re-assigns min_obs, which was already
# set to the same value for the expanding features.
min_obs = 3

In [None]:
# Suppresses pandas' warnings about chained assignment
pd.options.mode.chained_assignment = None

# Quantile levels that bound the "usual" time between consecutive transactions
lower_time_diff_quantile = 0.1
upper_time_diff_quantile = 0.9

# Rolling quantiles (in total seconds) of the time between consecutive
# transactions, per account, over the trailing `rolling_period` window.
# Each spec: (output column, account column, gap column, quantile level)
rolling_gap_specs = (
    ('rolling_sender_time_diff_min', 'Sender_Account', 'sender_time_btwn', lower_time_diff_quantile),
    ('rolling_sender_time_diff_max', 'Sender_Account', 'sender_time_btwn', upper_time_diff_quantile),
    ('rolling_bene_time_diff_min', 'Bene_Account', 'bene_time_btwn', lower_time_diff_quantile),
    ('rolling_bene_time_diff_max', 'Bene_Account', 'bene_time_btwn', upper_time_diff_quantile),
)
for out_col, account_col, gap_col, level in rolling_gap_specs:
    # The time-based window needs the timestamps as the index.
    by_timestamp = JPfraud.set_index('timestamp')
    windowed_quantile = (
        by_timestamp.groupby(account_col)[gap_col]
                    .transform(lambda gaps: gaps.dt.total_seconds()
                                                .rolling(window=rolling_period, min_periods=min_obs)
                                                .quantile(q=level, interpolation='midpoint'))
    )
    # Replace the timestamp index with 0..n-1 so the values line up with JPfraud's rows.
    JPfraud[out_col] = windowed_quantile.reset_index(level=0, drop=True)

# rolling_outside_sender_time_diff_range, rolling_outside_bene_time_diff_range:
# 1 when the time since the account's previous transaction falls outside the
# rolling quantile range computed above, otherwise 0.
rolling_gap_flag_specs = (
    ('rolling_outside_sender_time_diff_range', 'sender_time_btwn',
     'rolling_sender_time_diff_min', 'rolling_sender_time_diff_max'),
    ('rolling_outside_bene_time_diff_range', 'bene_time_btwn',
     'rolling_bene_time_diff_min', 'rolling_bene_time_diff_max'),
)
for flag_col, gap_col, low_col, high_col in rolling_gap_flag_specs:
    gap_seconds = JPfraud[gap_col].dt.total_seconds()
    is_outside = gap_seconds.lt(JPfraud[low_col]) | gap_seconds.gt(JPfraud[high_col])
    JPfraud = JPfraud.assign(**{flag_col: is_outside * 1})

JPfraud

In [None]:
# Suppresses pandas' warnings about chained assignment
pd.options.mode.chained_assignment = None

# Quantile levels that bound the "usual" time of day for an account
lower_time_quantile = 0.15
upper_time_quantile = 0.85

# seconds_in_day: time transaction occurred in seconds after midnight
JPfraud['seconds_in_day'] = JPfraud.timestamp.transform(lambda x: convert_time_to_seconds(x.time()))

# Rolling quantiles of the time of day (seconds after midnight), per account,
# over the trailing `rolling_period` window:
#   rolling_time_min_out / rolling_time_max_out: lower / upper quantile over SENDER transactions
#   rolling_time_min_in  / rolling_time_max_in : lower / upper quantile over BENEFICIARY transactions
rolling_time_specs = (
    ('rolling_time_min_out', 'Sender_Account', lower_time_quantile),
    ('rolling_time_max_out', 'Sender_Account', upper_time_quantile),
    ('rolling_time_min_in', 'Bene_Account', lower_time_quantile),
    ('rolling_time_max_in', 'Bene_Account', upper_time_quantile),
)
for out_col, account_col, level in rolling_time_specs:
    JPfraud[out_col] = (JPfraud.set_index('timestamp')  # time-based window needs the timestamps as index
                               .groupby(account_col)['seconds_in_day']
                               .transform(lambda secs: secs.rolling(window=rolling_period, min_periods=min_obs)
                                                           .quantile(q=level, interpolation='midpoint'))
                               # replace the timestamp index with 0..n-1 so values line up with JPfraud's rows
                               .reset_index(level=0, drop=True)
                       )

# rolling_outside_time_bene_range:   indicator of whether transaction is outside the ROLLING range for
#                                    beneficiary at time of transaction, 0 no, 1 yes
# (Fix: compare against rolling_time_min_in / rolling_time_max_in. The expanding columns
#  time_min_in / time_max_in were used before, which made this flag a copy of outside_time_bene_range.)
JPfraud = JPfraud.assign(rolling_outside_time_bene_range = lambda x: (
            (x.seconds_in_day < x.rolling_time_min_in) | (x.seconds_in_day > x.rolling_time_max_in))*1)

# rolling_outside_time_sender_range: indicator of whether transaction is outside the usual ROLLING time
#                                    range for sender at time of transaction, 0 no, 1 yes
# (Fix: compare against rolling_time_min_out / rolling_time_max_out rather than the expanding columns.)
JPfraud = JPfraud.assign(rolling_outside_time_sender_range = lambda x: (
            (x.seconds_in_day < x.rolling_time_min_out) | (x.seconds_in_day > x.rolling_time_max_out))*1)

JPfraud

<font size=6> Bursts of Transactions feature </font>

In [None]:
# Window within to count the number of transactions
burst_period = '1D' # previous 24 hours/1 day

# Number of transactions within a previous window of time given by burst_period
JPfraud['sender_burst_num'] = (JPfraud.set_index('timestamp')
                                      .groupby('Sender_Account',group_keys=False)['Sender_Account']
                                      .apply(lambda x: x.rolling(window=burst_period).count())
                                      .reset_index(level=0,drop=True)
                              )

JPfraud['bene_burst_num'] = (JPfraud.set_index('timestamp')
                                    .groupby('Bene_Account',group_keys=False)['Bene_Account']
                                    .apply(lambda x: x.rolling(window=burst_period).count())
                                    .reset_index(level=0,drop=True)
                            )
JPfraud