In [1]:
import altair as alt
import numpy as np
import pandas as pd

We'll start by reading in our fraud dataset and looking at the column names:

In [2]:
# Load the transaction records and list the columns that are available.
FRAUD_CSV = "fraud.csv"
df = pd.read_csv(FRAUD_CSV)
df.columns

Index(['timestamp', 'label', 'user_id', 'amount', 'merchant_id', 'trans_type',
       'foreign'],
      dtype='object')

# Transaction type distribution

In [3]:
# Count rows for every (label, trans_type) pair. 'timestamp' is carried along
# only so that pivot_table has a value column to apply len() to.
pt = pd.pivot_table(df[["label", "trans_type", "timestamp"]],
                    index=["label", "trans_type"], aggfunc=len)
pt.columns = ['count']

# Turn the (label, trans_type) index levels back into ordinary columns.
gdf = pt.reset_index()

# Per-label totals, so each count can be expressed as a share of its own label.
# The string alias 'sum' is used instead of np.sum: passing NumPy callables to
# transform() is deprecated in recent pandas releases.
gdf['total'] = gdf.groupby('label')['count'].transform('sum')
gdf['pctage'] = gdf['count'] / gdf['total']

gdf

Unnamed: 0,label,trans_type,count,total,pctage
0,fraud,chip_and_pin,21007,412839,0.050884
1,fraud,contactless,20678,412839,0.050087
2,fraud,manual,144390,412839,0.349749
3,fraud,online,206163,412839,0.499379
4,fraud,swipe,20601,412839,0.049901
5,legitimate,chip_and_pin,4507746,22533292,0.200048
6,legitimate,contactless,5631831,22533292,0.249934
7,legitimate,manual,1128292,22533292,0.050072
8,legitimate,online,7886722,22533292,0.350003
9,legitimate,swipe,3378701,22533292,0.149943


In [4]:
# One bar per label, faceted by transaction type; bar height is the share of
# that label's transactions that use the given type. altair (`alt`) is
# imported with the other dependencies in the notebook's first cell.
alt.Chart(gdf).mark_bar().encode(
    x="label",
    y=alt.Y('pctage:Q', axis=alt.Axis(format='.0%')),
    column='trans_type',
    color='label',
)

# Foreign transaction distribution

In [5]:
# Count rows for every (label, foreign) pair. 'timestamp' is carried along
# only so that pivot_table has a value column to apply len() to.
pt = pd.pivot_table(df[["label", "foreign", "timestamp"]],
                    index=["label", "foreign"], aggfunc=len)
pt.columns = ['count']

# Turn the (label, foreign) index levels back into ordinary columns.
gdf = pt.reset_index()

# Per-label totals, so each count can be expressed as a share of its own label.
# The string alias 'sum' is used instead of np.sum: passing NumPy callables to
# transform() is deprecated in recent pandas releases.
gdf['total'] = gdf.groupby('label')['count'].transform('sum')
gdf['pctage'] = gdf['count'] / gdf['total']

# A bare `gdf` here would be silently discarded because it is not the last
# expression of the cell; display() (IPython built-in) renders it explicitly.
display(gdf)

# One bar per label, faceted by the foreign flag.
alt.Chart(gdf).mark_bar().encode(
    x="label",
    y=alt.Y('pctage:Q', axis=alt.Axis(format='.0%')),
    column='foreign',
    color='label',
)

# Transaction amount distribution

In [26]:
%%time
qs = df[['label','amount']].groupby('label').quantile(q=[0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99])
qs

CPU times: user 1min 21s, sys: 2.47 s, total: 1min 24s
Wall time: 1min 59s


Unnamed: 0_level_0,Unnamed: 1_level_0,amount
label,Unnamed: 1_level_1,Unnamed: 2_level_1
fraud,0.01,5.35
fraud,0.05,6.74
fraud,0.1,8.98
fraud,0.25,14.71
fraud,0.5,22.61
fraud,0.75,28.17
fraud,0.9,35.33
fraud,0.95,40.69
fraud,0.99,53.04
legitimate,0.01,3.31


In [99]:
# `qs` is indexed by (label, <unnamed quantile level>). Name the quantile level
# explicitly before flattening: the auto-generated name for an unnamed level
# ("level_0" vs "level_1") differs between pandas versions, so relying on it
# makes the chart encoding fragile.
qdf = qs.rename_axis(index=['label', 'quantile']).reset_index()

# Empirical CDF of transaction amount per label (amount on a log scale).
alt.Chart(qdf).mark_line(interpolate="monotone").encode(
    alt.Y("amount", axis=alt.Axis(title='transaction amounts (log scale)'), scale=alt.Scale(type='log')),
    alt.X("quantile:Q", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    color="label"
)

# Interarrival times

In [59]:
# Per-transaction timing columns, ordered by user and then by time, with the
# epoch-second timestamp also exposed as a datetime column.
trans_times = (
    df.loc[:, ['timestamp', 'label', 'user_id']]
    .sort_values(by=['user_id', 'timestamp'])
    .assign(date=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))
)

In [62]:
# Row-to-row differences computed within each (user_id, label) group; the
# first row of every group has no predecessor and therefore comes out as NaN/NaT.
per_user_label = trans_times.groupby(by=['user_id', 'label'])
per_user_label.diff()

Unnamed: 0,timestamp,date
29775,,NaT
36379,9426.0,02:37:06
42580,8769.0,02:26:09
48212,8490.0,02:21:30
54290,8942.0,02:29:02
...,...,...
22902733,44815.0,12:26:55
22910628,11438.0,03:10:38
22917465,9991.0,02:46:31
22924314,9837.0,02:43:57


In [88]:
# Seconds elapsed since the same user's previous transaction.
#
# `trans_times` is sorted by (user_id, timestamp), so a plain frame-wide diff()
# would subtract the last timestamp of one user from the first timestamp of the
# next user, giving every user's first transaction a meaningless gap. Taking
# the diff per user_id leaves those first rows as NaN instead.
interarrivals = (
    trans_times
    .assign(interarrival=trans_times.groupby('user_id')['timestamp'].diff())
    .loc[:, ['user_id', 'label', 'date', 'interarrival']]
    .reset_index(drop=True)  # positional index, as later cells expect
)

In [168]:
# Draw equally sized random samples of fraud and legitimate transactions.
# A fixed seed makes the samples (and everything derived from them) identical
# on every Restart & Run All.
SAMPLE_SIZE = 100000
SAMPLE_SEED = 0

fraudsamp = interarrivals[interarrivals['label'] == 'fraud'].sample(
    SAMPLE_SIZE, random_state=SAMPLE_SEED)
legitsamp = interarrivals[interarrivals['label'] == 'legitimate'].sample(
    SAMPLE_SIZE, random_state=SAMPLE_SEED)




In [175]:
# Percentile rank of each interarrival time within its own sample.
for samp in (fraudsamp, legitsamp):
    samp['irank'] = samp['interarrival'].rank(pct=True)

# Row counts per distinct (label, interarrival, irank) combination for each
# sample, stacked into a single Series (fraud first, then legitimate).
rank_keys = ['label', 'interarrival', 'irank']
qdf = pd.concat([samp.groupby(rank_keys).size() for samp in (fraudsamp, legitsamp)])

In [182]:
# Wrap the stacked sizes in a one-column frame, then round-trip through a
# record array so the index levels become ordinary columns.
size_frame = pd.DataFrame(qdf)
qdf = pd.DataFrame(size_frame.to_records())

In [166]:
# Spot-check 25 fraud rows from each decile of the interarrival percentile rank.
# - The loop variable is no longer called `df`, which shadowed the name of the
#   main transactions frame inside the comprehension.
# - The top decile also admits irank == 1.0, which a half-open [0.9, 1.0) bin
#   would drop.
# - A fixed seed keeps the displayed rows stable between runs.
[
    fraudsamp[
        (fraudsamp['irank'] >= 0.1 * decile)
        & ((fraudsamp['irank'] < 0.1 * (decile + 1)) | (decile == 9))
    ].sample(25, random_state=0)
    for decile in range(10)
]


[          user_id  label                date  interarrival     irank
 13878946     6049  fraud 2020-10-20 13:22:09         412.0  0.026200
 13026495     5668  fraud 2020-12-20 10:31:45         487.0  0.030825
 22856547     9963  fraud 2020-05-20 02:14:32         721.0  0.089450
 7336546      3203  fraud 2021-01-02 23:26:04         118.0  0.007065
 20507765     8928  fraud 2020-10-13 05:16:17         721.0  0.089450
 1294665       560  fraud 2020-08-29 04:15:25         721.0  0.089450
 20872754     9086  fraud 2020-05-23 01:58:04         721.0  0.089450
 14741285     6421  fraud 2020-07-01 00:16:18         466.0  0.029435
 10936261     4759  fraud 2021-01-09 09:03:34         721.0  0.089450
 79463          35  fraud 2020-10-13 00:27:21         555.0  0.035705
 22134782     9652  fraud 2020-04-24 11:30:01         650.0  0.042015
 19563943     8517  fraud 2020-03-21 02:12:54         721.0  0.089450
 9748717      4251  fraud 2020-03-28 05:03:17         721.0  0.089450
 8640101      3762  

In [194]:
# Empirical CDF of interarrival time per label (interarrival on a log scale).
# Only 5000 rows are plotted; the seed keeps the plotted subset, and therefore
# the rendered chart, the same on every run.
chart_rows = qdf.dropna().sample(5000, random_state=0)

alt.Chart(chart_rows).mark_line(interpolate="monotone").encode(
    alt.Y("interarrival", axis=alt.Axis(title='interarrival time'), scale=alt.Scale(type='log')),
    alt.X("irank", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    color="label"
).interactive()