In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Load the fine-tuning labels and the raw transaction log. Both paths are
# relative to the notebook's working directory.
target = pd.read_csv('../../data/default/target_finetune.csv')
tr = pd.read_csv('../../data/default/transactions_finetune.csv')

In [4]:
# Preview the first rows of the raw transactions table.
tr.head()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,69,5541,48,-342.89792,2021-03-05 02:52:36
1,69,5533,48,-1251.8812,2021-03-05 09:43:28
2,69,5331,48,-87.30924,2021-03-05 11:17:23
3,69,5921,48,-1822.177,2021-03-05 13:41:03
4,69,5311,48,-427.12363,2021-03-05 19:14:23


In [5]:
# Preview the first rows of the label table (user_id -> binary target).
target.head()

Unnamed: 0,user_id,target
0,452772,0
1,64288,0
2,504497,0
3,566270,0
4,328558,0


In [6]:
# Parse the timestamp strings into datetimes. The format is given explicitly,
# so parsing does not rely on format inference.
DTTM_FORMAT = '%Y-%m-%d %H:%M:%S'
tr['transaction_dttm'] = pd.to_datetime(tr['transaction_dttm'], format=DTTM_FORMAT)

In [7]:
# Order all transactions chronologically; the per-user MCC sequences built
# later inherit this order.
# A stable sort (mergesort) is used so that transactions sharing the same
# timestamp keep their original file order. The default quicksort is not
# stable and would place such ties arbitrarily.
tr = tr.sort_values(by='transaction_dttm', kind='mergesort')

In [8]:
# Replace the shuffled row labels left by the sort with a fresh 0..n-1 index;
# the old labels are discarded rather than kept as a column.
tr = tr.reset_index(drop=True)

In [9]:
# Check the table after the chronological sort and index reset.
tr.head()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,423716,5699,48,-1292.9285,2018-01-01 01:14:48
1,423716,5722,48,-20475.617,2018-01-01 01:26:05
2,423716,5977,48,-4346.5645,2018-01-01 21:58:19
3,423716,5411,48,-1610.7255,2018-01-01 22:35:37
4,34864,4816,48,-696.83,2018-01-01 23:43:33


In [10]:
# Map every raw MCC code to a dense integer id. The fitted encoder stays in
# `enc`, so the ids can be mapped back with `enc.inverse_transform`.
# NOTE(review): the mapping is fitted on this file only and is not saved.
enc = LabelEncoder().fit(tr['mcc_code'])
tr['mcc_code'] = enc.transform(tr['mcc_code'])

In [11]:
# Check that mcc_code now holds the encoded integer ids.
tr.head()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,423716,136,48,-1292.9285,2018-01-01 01:14:48
1,423716,142,48,-20475.617,2018-01-01 01:26:05
2,423716,185,48,-4346.5645,2018-01-01 21:58:19
3,423716,108,48,-1610.7255,2018-01-01 22:35:37
4,34864,68,48,-696.83,2018-01-01 23:43:33


In [12]:
# Number of distinct MCC ids present after encoding.
tr['mcc_code'].nunique()

309

In [13]:
# Collapse the transaction log into one list of MCC ids per user. Rows keep
# their current (chronological) order inside each group.
df = tr.groupby('user_id')['mcc_code'].agg(list)

In [15]:
# Turn the user_id-indexed Series into a two-column frame (user_id, mcc_code)
# with a plain integer index.
df = pd.DataFrame(df).reset_index()

In [16]:
# Preview the per-user MCC sequences.
df.head()

Unnamed: 0,user_id,mcc_code
0,69,"[119, 118, 106, 156, 105, 113, 108, 156, 108, ..."
1,140,"[108, 93, 51, 67, 108, 150, 108, 113, 108, 93,..."
2,196,"[55, 150, 150, 150, 108, 55, 55, 150, 150, 108..."
3,400,"[108, 196, 108, 196, 108, 108, 108, 119, 108, ..."
4,544,"[152, 108, 196, 197, 68, 51, 67, 150, 131, 108..."


In [18]:
# Row count here is the number of users that have transactions.
df.shape

(7080, 2)

In [17]:
# Look at the label table again before joining it to the sequences.
target.head()

Unnamed: 0,user_id,target
0,452772,0
1,64288,0
2,504497,0
3,566270,0
4,328558,0


In [19]:
# Compare with df.shape: equal row counts are expected if every labelled user
# has transactions. This does not by itself prove the id sets match.
target.shape

(7080, 2)

In [20]:
# Attach the label to each user's sequence. An inner join keeps only the
# user_ids present in both tables.
# NOTE(review): re-running this cell merges `target` a second time.
df = df.merge(right=target, on='user_id', how='inner')

In [21]:
# Preview the merged table (sequence + target per user).
df.head()

Unnamed: 0,user_id,mcc_code,target
0,69,"[119, 118, 106, 156, 105, 113, 108, 156, 108, ...",0
1,140,"[108, 93, 51, 67, 108, 150, 108, 113, 108, 93,...",0
2,196,"[55, 150, 150, 150, 108, 55, 55, 150, 150, 108...",0
3,400,"[108, 196, 108, 196, 108, 108, 108, 119, 108, ...",0
4,544,"[152, 108, 196, 197, 68, 51, 67, 150, 131, 108...",0


In [23]:
# Shorten the column names used from here on: user_id -> id, mcc_code -> mcc.
df = df.rename(columns={'user_id': 'id', 'mcc_code': 'mcc'})

In [24]:
# Confirm the renamed columns.
df.head()

Unnamed: 0,id,mcc,target
0,69,"[119, 118, 106, 156, 105, 113, 108, 156, 108, ...",0
1,140,"[108, 93, 51, 67, 108, 150, 108, 113, 108, 93,...",0
2,196,"[55, 150, 150, 150, 108, 55, 55, 150, 150, 108...",0
3,400,"[108, 196, 108, 196, 108, 108, 108, 119, 108, ...",0
4,544,"[152, 108, 196, 197, 68, 51, 67, 150, 131, 108...",0


In [25]:
# Length of every user's MCC sequence, in row order.
all_l = [len(codes) for codes in df['mcc']]

In [26]:
# Shortest, longest and median sequence length; identical values mean every
# user has the same number of transactions.
np.min(all_l), np.max(all_l), np.median(all_l)

(300, 300, 300.0)

### Train-valid-test split

In [83]:
# Share of positive users in the merged frame.
# The denominator is len(df), not len(target): `r` is later multiplied by
# len(df) to get positive counts, and the two lengths agree only when the
# inner merge dropped no rows.
r = df['target'].sum() / len(df)

In [84]:
# Ids of the users whose target is 1.
one_id = np.array(df['id'][df['target'] == 1])

In [85]:
# Ids of the users whose target is 0.
zero_id = np.array(df['id'][df['target'] == 0])

In [86]:
# Stratified 70/10/20 split over user ids. Positives and negatives are each
# shuffled and cut separately, so every split keeps roughly the overall
# positive rate.
# The generator is seeded so the split, and the CSVs written from it, can be
# reproduced on a fresh kernel. Both classes are shuffled, so the negatives
# are no longer taken in user_id order.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)


def _split_70_10_20(ids, rng):
    """Shuffle `ids` and cut them into (train, valid, test) arrays.

    The cut points are int(0.7 * n) and int(0.8 * n), so the three parts are
    disjoint and together hold every element of `ids` exactly once.
    """
    shuffled = rng.permutation(ids)
    train_end = int(0.7 * len(shuffled))
    valid_end = int(0.8 * len(shuffled))
    return shuffled[:train_end], shuffled[train_end:valid_end], shuffled[valid_end:]


one_train, one_valid, one_test = _split_70_10_20(one_id, split_rng)
zero_train, zero_valid, zero_test = _split_70_10_20(zero_id, split_rng)

train_id = np.hstack([one_train, zero_train])
# Positive ids that did not go to train (name kept from the original cell).
rest_id = np.setdiff1d(one_id, train_id)
valid_id = np.hstack([one_valid, zero_valid])
test_id = np.hstack([one_test, zero_test])

In [87]:
# Materialise the three splits by selecting the rows whose id belongs to the
# corresponding id array.
train_df, valid_df, test_df = (
    df.loc[df['id'].isin(split_ids)]
    for split_ids in (train_id, valid_id, test_id)
)

In [88]:
# Positive-class share in train / valid / test; the three values should be
# close to each other if the split is stratified as intended.
train_df['target'].sum() / len(train_df), valid_df['target'].sum() / len(valid_df), \
test_df['target'].sum() / len(test_df)

(0.03693239152371342, 0.03672316384180791, 0.0374029640084686)

In [89]:
# Fraction of all users assigned to train / valid / test (target: 0.7 / 0.1 / 0.2).
len(train_id) / len(df), len(valid_id) / len(df), len(test_id) / len(df)

(0.6998587570621468, 0.1, 0.2001412429378531)

In [90]:
# Re-number each split 0..n-1, dropping the row labels inherited from `df`.
# The frames are rebound instead of using inplace=True: they are selections
# of `df`, and in-place changes on such selections can raise
# SettingWithCopyWarning in pandas versions before 3.0.
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [93]:
# Persist the three splits.
# The output directory is created first, because to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('../../data/processed_default', exist_ok=True)

# Notes for whoever reads these files back:
# - The default index=True is kept, so the row index is written as an unnamed
#   first column; read it back with index_col=0.
# - `mcc` holds Python lists, which CSV stores as their string form
#   ("[119, 118, ...]"); parse it back with e.g. ast.literal_eval.
train_df.to_csv('../../data/processed_default/train.csv')
valid_df.to_csv('../../data/processed_default/valid.csv')
test_df.to_csv('../../data/processed_default/test.csv')