In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import time

<font size=5> Load and clean data </font>

In [None]:
# Read the raw payment data; the path is resolved relative to the current working directory.
raw_data_path = "../data/fraud_payment_data"
JPfraud_full = pd.read_csv(raw_data_path)
JPfraud_full

In [None]:
# Drop the Sender_Sector and Sender_lob columns, then convert Time_step to a datetime type.
# errors='ignore' keeps this cell re-runnable: on a second run the two columns are already
# gone from JPfraud_full and a plain drop would raise KeyError.
JPfraud_full = JPfraud_full.drop(columns=['Sender_Sector', 'Sender_lob'], errors='ignore')
JPfraud_full['Time_step'] = pd.to_datetime(JPfraud_full['Time_step'], format='%Y-%m-%d %H:%M:%S')
JPfraud_full

In [None]:
# Truncating data for ease and for code to run quicker.
# .copy() makes JPfraud an independent frame rather than a slice of JPfraud_full, so the
# feature columns assigned to it later are written to JPfraud itself and do not trigger
# pandas' SettingWithCopy warnings.
JPfraud = JPfraud_full.iloc[:500000].copy()
JPfraud

In [None]:
# Helper: turn a time-of-day object into the number of seconds elapsed since midnight.
# (FYI: pandas will promote the resulting int column to float if it contains NaNs.)
def convert_time_to_seconds(timeObj):
    """Return the whole number of seconds after midnight for ``timeObj``.

    Parameters
    ----------
    timeObj : object with ``hour``, ``minute`` and ``second`` attributes
        For example a ``datetime.time``.

    Returns
    -------
    int
        ``hour*60*60 + minute*60 + second``.
    """
    from_hours = timeObj.hour * 60 * 60
    from_minutes = timeObj.minute * 60
    return int(from_hours + from_minutes + timeObj.second)

<font size=6> **Time features** </font>

In [None]:
# Time elapsed since the previous transaction, computed separately per sender account
# (sender_time_btwn) and per beneficiary account (bene_time_btwn).
# diff() has no predecessor for the first row of an account and would leave NaT there;
# the mask swaps that first occurrence for a zero timedelta instead. Rows whose
# Sender_Account / Bene_Account entry is NaN still end up as NaT.

sender_times = JPfraud.groupby('Sender_Account')['Time_step']
sender_first_seen = sender_times.cumcount().eq(0)
JPfraud['sender_time_btwn'] = sender_times.diff().mask(sender_first_seen, dt.timedelta(0))

bene_times = JPfraud.groupby('Bene_Account')['Time_step']
bene_first_seen = bene_times.cumcount().eq(0)
JPfraud['bene_time_btwn'] = bene_times.diff().mask(bene_first_seen, dt.timedelta(0))

<font size=6> Rolling time features </font> <br>
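Rolling-window statistics of each account's transaction timing, used to flag transactions that fall outside the account's usual range.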

In [None]:
# Look-back window for the rolling statistics, written as a pandas time offset
rolling_period = '180D'  # 180 days

# Fewest observations a window must hold before a rolling statistic is produced
min_obs = 3

In [None]:
# Suppresses pandas' chained-assignment warnings for the column assignments below
pd.options.mode.chained_assignment = None

# Quantiles bounding an account's usual time between consecutive transactions
lower_time_diff_quantile = 0.1
upper_time_diff_quantile = 0.9


def _rolling_time_diff_quantile(frame, account_col, time_btwn_col, q):
    """Rolling per-account quantile of the time between transactions, in seconds.

    Groups ``frame`` by ``account_col`` and, within each group, takes the
    ``q``-quantile of ``time_btwn_col`` (a timedelta column, converted to seconds)
    over a trailing ``rolling_period`` window keyed on ``Time_step``. Windows with
    fewer than ``min_obs`` observations yield NaN.

    The returned Series has a fresh 0..n-1 index, so assigning it back to ``frame``
    lines up row by row only while ``frame`` itself has a default integer index.
    """
    per_account = frame.set_index('Time_step').groupby(account_col, sort=False)[time_btwn_col]
    quantiles = per_account.transform(
        lambda x: x.dt.total_seconds().rolling(rolling_period, min_periods=min_obs).quantile(q=q))
    return quantiles.reset_index(level=0, drop=True)


# Rolling lower/upper quantiles (window = rolling_period) of the time between
# consecutive transactions, per sender account and per beneficiary account
JPfraud['rolling_sender_time_diff_min'] = _rolling_time_diff_quantile(
    JPfraud, 'Sender_Account', 'sender_time_btwn', lower_time_diff_quantile)

JPfraud['rolling_sender_time_diff_max'] = _rolling_time_diff_quantile(
    JPfraud, 'Sender_Account', 'sender_time_btwn', upper_time_diff_quantile)

JPfraud['rolling_bene_time_diff_min'] = _rolling_time_diff_quantile(
    JPfraud, 'Bene_Account', 'bene_time_btwn', lower_time_diff_quantile)

JPfraud['rolling_bene_time_diff_max'] = _rolling_time_diff_quantile(
    JPfraud, 'Bene_Account', 'bene_time_btwn', upper_time_diff_quantile)

# Classifying whether time between consecutive transaction is outside of range (1 yes, 0 no).
# Comparisons against a NaN bound are False, so rows without enough history get 0.
JPfraud = JPfraud.assign(out_sender_time_diff_range = lambda x: (
            (x.sender_time_btwn.dt.total_seconds() < x.rolling_sender_time_diff_min)
            | (x.sender_time_btwn.dt.total_seconds() > x.rolling_sender_time_diff_max))*1)
JPfraud = JPfraud.assign(out_bene_time_diff_range = lambda x: (
            (x.bene_time_btwn.dt.total_seconds() < x.rolling_bene_time_diff_min)
            | (x.bene_time_btwn.dt.total_seconds() > x.rolling_bene_time_diff_max))*1)

In [None]:
# Suppresses pandas' chained-assignment warnings for the column assignments below
pd.options.mode.chained_assignment = None

# Quantiles bounding an account's usual time of day for transactions
lower_time_quantile = 0.15
upper_time_quantile = 0.85


def _rolling_time_of_day_quantile(frame, account_col, q):
    """Rolling per-account quantile of ``seconds_in_day``.

    Groups ``frame`` by ``account_col`` and, within each group, takes the
    ``q``-quantile of ``seconds_in_day`` over a trailing ``rolling_period`` window
    keyed on ``Time_step``. Windows with fewer than ``min_obs`` observations yield NaN.

    The returned Series has a fresh 0..n-1 index, so assigning it back to ``frame``
    lines up row by row only while ``frame`` itself has a default integer index.
    """
    per_account = frame.set_index('Time_step').groupby(account_col, sort=False)['seconds_in_day']
    quantiles = per_account.transform(
        lambda x: x.rolling(rolling_period, min_periods=min_obs).quantile(q=q))
    return quantiles.reset_index(level=0, drop=True)


# seconds_in_day: time transaction occurred in seconds after midnight.
# Computed with the vectorised .dt accessor instead of a Python call per row; the cast
# keeps the column int64 (and, like a per-row int() conversion, fails on a missing Time_step).
_time_step = JPfraud.Time_step
JPfraud['seconds_in_day'] = (
    _time_step.dt.hour * 60 * 60 + _time_step.dt.minute * 60 + _time_step.dt.second
).astype('int64')

# NOTE(review): seconds after midnight is treated as a linear scale below, so an account
# active on both sides of midnight gets a wide "usual" range — confirm this is intended.

# time_min_out: rolling lower quantile for range of time of SENDER transactions (in seconds after midnight)
JPfraud['time_min_out'] = _rolling_time_of_day_quantile(JPfraud, 'Sender_Account', lower_time_quantile)

# time_max_out: rolling upper quantile for range of time of SENDER transactions (in seconds after midnight)
JPfraud['time_max_out'] = _rolling_time_of_day_quantile(JPfraud, 'Sender_Account', upper_time_quantile)

# time_min_in: rolling lower quantile for range of time of BENEFICIARY transactions (in seconds after midnight)
JPfraud['time_min_in'] = _rolling_time_of_day_quantile(JPfraud, 'Bene_Account', lower_time_quantile)

# time_max_in: rolling upper quantile for range of time of BENEFICIARY transactions (in seconds after midnight)
JPfraud['time_max_in'] = _rolling_time_of_day_quantile(JPfraud, 'Bene_Account', upper_time_quantile)

# outside_time_bene_range:   indicator of whether transaction is outside range for beneficiary at
#                            time of transaction, 0 no, 1 yes
JPfraud = JPfraud.assign(outside_time_bene_range = lambda x: (
    (x.seconds_in_day < x.time_min_in) | (x.seconds_in_day > x.time_max_in))*1)

# outside_time_sender_range: indicator of whether transaction is outside usual time range for sender
#                            at time of transaction, 0 no, 1 yes
JPfraud = JPfraud.assign(outside_time_sender_range = lambda x: (
    (x.seconds_in_day < x.time_min_out) | (x.seconds_in_day > x.time_max_out))*1)