# In This Notebook

In this notebook, I'll create some books regarding the customer behavior and denylists. In a production environment, these books could be consulted by creating and Kafka topic that get the necessary data and consulting it, or even a batch process that is orquestrated via airflow and feeds a MongoDB, which can be read by an API. Since doing these would take a lot of time, I'll just assume the data will be sent by the request when developing my model.

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
keep_cols = [
    'treated_user_id', 
    'amount', 
    'processed_at', 
    'ac_owner_type',
    'ac_inserted_at', 
    'uo_flag_many_orgs',  
    'omot_value', 
    'omot_inserted_at',
    'omot_updated_at', 
    'og_inserted_at', 
    'um_inserted_at', 
    'um_updated_at',
    'age_range', 
    'number_of_selfies_sent', 
    'us_inserted_at',
]

In [4]:
lt = pd.read_csv('files/labeled_transactions_enriched.csv', sep=';')[keep_cols+['is_fraud']]

  lt = pd.read_csv('files/labeled_transactions_enriched.csv', sep=';')[keep_cols+['is_fraud']]


In [5]:
lt.head()

Unnamed: 0,treated_user_id,amount,processed_at,ac_owner_type,ac_inserted_at,uo_flag_many_orgs,omot_value,omot_inserted_at,omot_updated_at,og_inserted_at,um_inserted_at,um_updated_at,age_range,number_of_selfies_sent,us_inserted_at,is_fraud
0,a3c9aab8-8569-4c50-85a1-24d94c2fb0d1,9.932046e-05,2018-11-21 15:41:23,organization,2018-11-11 11:42:21.569236,1.0,EPP,2018-11-11 11:42:20.049020,2018-11-11 11:42:20.049020,2018-11-11 11:42:17.070072,,,,5.0,2018-11-11 11:42:16.661117,
1,a8339829-8163-49dd-a881-c3f5ca407a94,1.285222e-05,2018-10-22 23:43:38,organization,2018-07-02 20:08:21.256596,1.0,EPP,2018-07-02 20:08:17.474345,2018-08-24 16:28:02.057540,2018-07-02 20:08:17.093421,2018-07-02 20:15:32.540356,2018-07-02 20:15:32.540356,>= 60,5.0,2018-07-02 20:08:16.699874,
2,a8339829-8163-49dd-a881-c3f5ca407a94,1.582566e-05,2018-11-19 16:12:00,organization,2018-07-02 20:08:21.256596,1.0,EPP,2018-07-02 20:08:17.474345,2018-08-24 16:28:02.057540,2018-07-02 20:08:17.093421,2018-07-02 20:15:32.540356,2018-07-02 20:15:32.540356,>= 60,5.0,2018-07-02 20:08:16.699874,
3,a8339829-8163-49dd-a881-c3f5ca407a94,5.672962e-07,2018-10-29 15:20:49,organization,2018-07-02 20:08:21.256596,1.0,EPP,2018-07-02 20:08:17.474345,2018-08-24 16:28:02.057540,2018-07-02 20:08:17.093421,2018-07-02 20:15:32.540356,2018-07-02 20:15:32.540356,>= 60,5.0,2018-07-02 20:08:16.699874,
4,a8339829-8163-49dd-a881-c3f5ca407a94,4.470917e-05,2018-10-31 12:06:08,organization,2018-07-02 20:08:21.256596,1.0,EPP,2018-07-02 20:08:17.474345,2018-08-24 16:28:02.057540,2018-07-02 20:08:17.093421,2018-07-02 20:15:32.540356,2018-07-02 20:15:32.540356,>= 60,5.0,2018-07-02 20:08:16.699874,


In [6]:
lt.isna().mean()

treated_user_id           0.015312
amount                    0.000000
processed_at              0.000000
ac_owner_type             0.000000
ac_inserted_at            0.000000
uo_flag_many_orgs         0.125484
omot_value                0.285591
omot_inserted_at          0.226773
omot_updated_at           0.226773
og_inserted_at            0.111149
um_inserted_at            0.132238
um_updated_at             0.132238
age_range                 0.132407
number_of_selfies_sent    0.015317
us_inserted_at            0.015326
is_fraud                  0.998369
dtype: float64

In [7]:
lt['is_fraud'] = lt['is_fraud'].fillna(0)

# Denylist

I intend to create a Denylist, that would not work as a model: I'll create a flag that indicates if, at the time of the fraud, the user has been associated with some fraudulent transaction in the past. I believe it will be a very good feature, if it is, I'll use it as an hard cut instead of feeding the transactions to the model.

Even though sometimes we can have medians of up to 20 days between a fraud and the moment it was reported, since I don't have this report-moment on the dataframe, I'll use the processed_at (moment the transaction was processed).

In [8]:
denylist = lt[lt.is_fraud == 1]
denylist = denylist.groupby('treated_user_id').agg(
    denylisted_at = pd.NamedAgg('processed_at', 'min')
).reset_index()
denylist['denylisted'] = 1

denylist

Unnamed: 0,treated_user_id,denylisted_at,denylisted
0,00a97c12-1222-4903-82ea-ab9c1961ff27,2018-10-19 19:14:12,1
1,02848250-b706-43e5-b81d-ca5e80b15d7d,2018-11-05 11:49:08,1
2,02a00bcb-a39a-4741-bcaf-b6b3ef6ab8c2,2018-10-18 15:14:32,1
3,03498637-2712-4410-a683-aaa6b0776dbb,2018-11-15 12:18:35,1
4,0362e038-0223-4b9e-a864-33b1a74cac62,2018-11-19 11:28:15,1
...,...,...,...
604,feb64fd1-8cfb-4bd2-8c74-fe71ed166cf1,2018-11-15 20:06:52,1
605,fec7c414-a9be-4e17-b97a-3c75701f0098,2018-10-29 12:55:11,1
606,feff421e-4393-42f6-8eb0-d976bed6bf14,2018-10-30 13:31:35,1
607,ff57b119-9dc4-4fa2-a825-8519f86d34bd,2018-10-18 19:29:16,1


In [9]:
def get_denylist(df, denylist=denylist, left_uid='treated_user_id', left_timestamp='processed_at'):
    """
    We could do better by checking if the left col has columns named 'denylisted_at'
    or 'denylisted', but this should be sufficiently efficient.
    """
    
    df2 = df.copy(deep=True)
    df2 = df2.merge(denylist, left_on=left_uid, right_on='treated_user_id', how='left')
    df2['diff_to_denylisted'] = (pd.to_datetime(df2[left_timestamp]) - pd.to_datetime(df2.denylisted_at)).dt.total_seconds()
    df2.loc[(df2.denylisted == 1)&(df2.diff_to_denylisted <= 0), 'denylisted'] = 0
    df2['denylisted'] = df2['denylisted'].fillna(0)
    df2.drop(columns=['diff_to_denylisted'])
    
    return df2

In [10]:
lt2 = get_denylist(lt)

In [11]:
lt2.groupby('denylisted').is_fraud.mean()

denylisted
0.0    0.000790
1.0    0.113146
Name: is_fraud, dtype: float64

In [12]:
lt2.groupby('denylisted').is_fraud.sum()

denylisted
0.0    738
1.0    797
Name: is_fraud, dtype: object

In [13]:
0.113146/0.000790

143.22278481012657

I'll keep these guys to try to infer some behaviour from the datapoints, but we can clearly see that the chance of a denylisted customer to have a fraudulent transaction is more than 140x higher than that of a not flagged customer. If using this does not exceed by much our 300 transactions a day threshold on the unlabeled data, I'll use them to hardcut some transactions.

In [14]:
ut = pd.read_csv('files/unlabeled_transactions_enriched.csv', sep=';')[keep_cols]

In [15]:
ut.head()

Unnamed: 0,treated_user_id,amount,processed_at,ac_owner_type,ac_inserted_at,uo_flag_many_orgs,omot_value,omot_inserted_at,omot_updated_at,og_inserted_at,um_inserted_at,um_updated_at,age_range,number_of_selfies_sent,us_inserted_at
0,f6938791-4f7a-4cbb-81c3-d2511f8d08d0,5.528906e-07,2018-07-15 00:00:25.000000,organization,2018-04-27 18:46:06.110555,1.0,ME,2018-04-27 00:05:38.248259,2018-04-27 00:05:38.248259,2018-04-27 00:05:37.897738,2018-04-27 00:09:29.893733,2018-04-27 00:09:29.893733,25-39,5.0,2018-04-27 00:05:37.477665
1,f6938791-4f7a-4cbb-81c3-d2511f8d08d0,6.086869e-08,2018-07-15 00:20:32.000000,organization,2018-04-27 18:46:06.110555,1.0,ME,2018-04-27 00:05:38.248259,2018-04-27 00:05:38.248259,2018-04-27 00:05:37.897738,2018-04-27 00:09:29.893733,2018-04-27 00:09:29.893733,25-39,5.0,2018-04-27 00:05:37.477665
2,f6938791-4f7a-4cbb-81c3-d2511f8d08d0,7.050623e-07,2018-09-01 23:51:38.000000,organization,2018-04-27 18:46:06.110555,1.0,ME,2018-04-27 00:05:38.248259,2018-04-27 00:05:38.248259,2018-04-27 00:05:37.897738,2018-04-27 00:09:29.893733,2018-04-27 00:09:29.893733,25-39,5.0,2018-04-27 00:05:37.477665
3,f6938791-4f7a-4cbb-81c3-d2511f8d08d0,7.152071e-07,2018-09-13 00:50:27.000000,organization,2018-04-27 18:46:06.110555,1.0,ME,2018-04-27 00:05:38.248259,2018-04-27 00:05:38.248259,2018-04-27 00:05:37.897738,2018-04-27 00:09:29.893733,2018-04-27 00:09:29.893733,25-39,5.0,2018-04-27 00:05:37.477665
4,f6938791-4f7a-4cbb-81c3-d2511f8d08d0,3.855017e-07,2018-09-22 23:50:38.000000,organization,2018-04-27 18:46:06.110555,1.0,ME,2018-04-27 00:05:38.248259,2018-04-27 00:05:38.248259,2018-04-27 00:05:37.897738,2018-04-27 00:09:29.893733,2018-04-27 00:09:29.893733,25-39,5.0,2018-04-27 00:05:37.477665


In [16]:
ut2 = get_denylist(ut)

In [17]:
ut2['transaction_date'] = pd.to_datetime(ut2['processed_at']).dt.date

In [18]:
ut2.reset_index(inplace=True)

In [19]:
ut2_daily_counter = ut2.groupby('transaction_date').agg(
    counter=pd.NamedAgg('index', 'count'),
    denylist=pd.NamedAgg('denylisted', 'sum')
).reset_index()

In [20]:
ut2_daily_counter.denylist.mean()

0.0

Strange, the join doesn't seem to work even though it uses the same function as the other join.

In [21]:
ut2.processed_at.min(), ut2.processed_at.max()

('2018-07-15 00:00:25.000000', '2018-10-17 23:59:58.000000')

In [22]:
lt2.processed_at.min(), lt2.processed_at.max()

('2018-10-18 00:00:25', '2018-11-24 23:58:54')

I hadn't realized the dataframes used two different time periods, so the Denylist strategy won't work.

# Behaviour Book

I will measure how many transactions the user has done in the past and their value.

In [23]:
lt2.head(2)

Unnamed: 0,treated_user_id,amount,processed_at,ac_owner_type,ac_inserted_at,uo_flag_many_orgs,omot_value,omot_inserted_at,omot_updated_at,og_inserted_at,um_inserted_at,um_updated_at,age_range,number_of_selfies_sent,us_inserted_at,is_fraud,denylisted_at,denylisted,diff_to_denylisted
0,a3c9aab8-8569-4c50-85a1-24d94c2fb0d1,9.9e-05,2018-11-21 15:41:23,organization,2018-11-11 11:42:21.569236,1.0,EPP,2018-11-11 11:42:20.049020,2018-11-11 11:42:20.049020,2018-11-11 11:42:17.070072,,,,5.0,2018-11-11 11:42:16.661117,0,,0.0,
1,a8339829-8163-49dd-a881-c3f5ca407a94,1.3e-05,2018-10-22 23:43:38,organization,2018-07-02 20:08:21.256596,1.0,EPP,2018-07-02 20:08:17.474345,2018-08-24 16:28:02.057540,2018-07-02 20:08:17.093421,2018-07-02 20:15:32.540356,2018-07-02 20:15:32.540356,>= 60,5.0,2018-07-02 20:08:16.699874,0,,0.0,


In [24]:
full_t_table_cols = ['treated_user_id', 'amount', 'processed_at']

full_t_table = lt[full_t_table_cols].append(ut[full_t_table_cols])
full_t_table.dropna(inplace=True)

  full_t_table = lt[full_t_table_cols].append(ut[full_t_table_cols])


In [25]:
full_t_table.drop_duplicates(subset=['treated_user_id', 'processed_at'], keep='first', inplace=True)

In [26]:
full_t_table.sort_values(by=['treated_user_id', 'processed_at'], inplace=True)

full_t_table['past_amount'] = full_t_table.groupby('treated_user_id').amount.cumsum() - full_t_table.amount
full_t_table['past_transactions'] = full_t_table.groupby('treated_user_id').amount.cumcount()

full_t_table.drop(columns=['amount'], inplace=True)

In [27]:
full_t_table.tail(15)

Unnamed: 0,treated_user_id,processed_at,past_amount,past_transactions
595611,fffaa4f5-5559-40d7-9679-9d41d2268e63,2018-11-22 18:45:39,0.000264,199
5059508,fffbf832-052e-43f2-8465-b325c5553f61,2018-08-02 19:55:23.854363,0.0,0
5059510,fffbf832-052e-43f2-8465-b325c5553f61,2018-08-03 10:28:51.386623,8e-06,1
5059509,fffbf832-052e-43f2-8465-b325c5553f61,2018-09-27 20:15:25.934235,1.5e-05,2
5059511,fffbf832-052e-43f2-8465-b325c5553f61,2018-09-28 09:33:32.212275,1.5e-05,3
5059507,fffbf832-052e-43f2-8465-b325c5553f61,2018-10-01 16:01:08.967369,1.5e-05,4
5059512,fffbf832-052e-43f2-8465-b325c5553f61,2018-10-01 16:22:23.175637,4.4e-05,5
759016,fffbf832-052e-43f2-8465-b325c5553f61,2018-10-19 13:27:58,7.3e-05,6
759017,fffbf832-052e-43f2-8465-b325c5553f61,2018-11-13 17:36:29,9.6e-05,7
759023,fffbf832-052e-43f2-8465-b325c5553f61,2018-11-22 20:01:38,9.6e-05,8


The Behaviour table seems to work just fine.

In [28]:
lt2 = lt.merge(full_t_table, on=['treated_user_id', 'processed_at'], how='left')

In [29]:
lt2.shape, lt.shape

((940935, 18), (940935, 16))

In [30]:
ut2 = ut.merge(full_t_table, on=['treated_user_id', 'processed_at'], how='left')

In [31]:
ut2.shape, ut.shape

((5087054, 17), (5087054, 15))

In [32]:
lt2.to_csv('files/labeled_transactions_enriched2.csv', sep=';', index=False)
ut2.to_csv('files/unlabeled_transactions_enriched2.csv', sep=';', index=False)