# Intro

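This notebook generates tensors of shape (n_samples, max_seq_len, feats_per_step) to be used as transformer input and target. The saved input tensor has one extra leading timestep that carries the conditioning attribute.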

The input dataframe requires the following columns:
- tcode - String, encodes transaction type 
- amount - float, transaction amount (not log)
- account_id - int, associates transactions with account
- age - int, client's age
- datetime - datetime object, date of transaction
- day, month, dow - all ints, encode day, month and day of week 

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import time

import pickle
from datetime import date
import tensorflow as tf

In [2]:
# from my_lib.encoding import *

## Setup

In [3]:
# Suffix identifying the dataset variant; it is interpolated into the
# stored_data/ file names this notebook reads and writes.
ds_suffix = "vf10k"
# Number of timesteps per sequence: each account's transactions are cut into
# consecutive chunks of this many rows.
max_seq_len = 80
# A trailing chunk shorter than max_seq_len is kept only if it contains at
# least this many transactions.
min_seq_len = 20

### Load dataframe

In [4]:
# Load the prepared transaction dataframe for the selected dataset variant.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the CSV was written together with its index -- consider
# index_col=0 once it is confirmed nothing downstream relies on that column.
# NOTE(review): no parse_dates is given, so the date columns presumably arrive
# as strings rather than datetime objects -- confirm preprocess_df copes.
df = pd.read_csv(f"stored_data/final_df-{ds_suffix}.csv")
df

Unnamed: 0,account_id,dateofbirth,totalvalue,transactiondate,transactiontype,transactionmethodtype,sourcefundtype,destinationfundtype,transactionactivitytype,age,direction,tcode,amount,datetime,month,day,dow,year,td
0,1000,1954-09-13 00:00:00,-2.05,2020-06-03 04:12:37,transfer_out,internet,account,eft,,67,db,transfer_out-internet-account-eft--db,2.05,2020-06-03,6,3,2,2020,0.0
1,1000,1954-09-13 00:00:00,70.06,2020-06-03 10:34:43,deposit,in_branch,cash,account,,67,cr,deposit-in_branch-cash-account--cr,70.06,2020-06-03,6,3,2,2020,0.0
2,1000,1954-09-13 00:00:00,-8.01,2020-06-04 02:50:52,transfer_out,pos,account,eft,,67,db,transfer_out-pos-account-eft--db,8.01,2020-06-04,6,4,3,2020,1.0
3,1000,1954-09-13 00:00:00,-6.17,2020-06-09 13:25:11,transfer_out,pos,account,eft,,67,db,transfer_out-pos-account-eft--db,6.17,2020-06-09,6,9,1,2020,5.0
4,1000,1954-09-13 00:00:00,-375.92,2020-06-10 06:25:00,transfer_out,ach,account,eft,,67,db,transfer_out-ach-account-eft--db,375.92,2020-06-10,6,10,2,2020,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2972464,10999,1937-09-08 00:00:00,-4.71,2021-05-18 21:22:33,transfer_out,pos,account,eft,,84,db,transfer_out-pos-account-eft--db,4.71,2021-05-18,5,18,1,2021,0.0
2972465,10999,1937-09-08 00:00:00,-192.46,2021-05-19 19:16:52,transfer_out,ach,account,eft,,84,db,transfer_out-ach-account-eft--db,192.46,2021-05-19,5,19,2,2021,1.0
2972466,10999,1937-09-08 00:00:00,198.13,2021-05-24 08:01:24,deposit,in_branch,cheque,account,,84,cr,deposit-in_branch-cheque-account--cr,198.13,2021-05-24,5,24,0,2021,5.0
2972467,10999,1937-09-08 00:00:00,0.17,2021-05-24 09:23:28,transfer_in,system,account,account,,84,cr,transfer_in-system-account-account--cr,0.17,2021-05-24,5,24,0,2021,0.0


## Ensure correct folders exist

In [5]:
# Directories the project expects to find relative to the working directory.
folders = ['generated_data',
 'stored_data',
 'checkpoints',
 'generation_results',
 'data',
 'my_lib']


# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for a directory and creating it.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

## After DF is created

In [6]:
from my_lib.encoding import preprocess_df, bulk_encode_time_value

# Run the project's preprocessing for this dataset variant. As the cell output
# shows, the call writes stored_data/DataEncoder-{ds_suffix}.pickle and returns
# a DataEncoder instance, which is displayed but not kept.
# NOTE(review): presumably this also adds the encoded columns to `df` in place,
# since the return value is discarded and later cells keep reading `df` --
# confirm in my_lib.encoding.
preprocess_df(df, ds_suffix)

Wrote encoding info to stored_data/DataEncoder-vf10k.pickle


<my_lib.encoding.DataEncoder at 0x7f80ba8a6a60>

In [11]:
from my_lib.field_config import *



def count_seqs_in_df(df, max_len=None, min_len=None):
    """Count how many sequences the dataframe will be split into.

    Each account's rows are cut into consecutive chunks of ``max_len`` rows.
    Every full chunk counts as one sequence; the leftover rows of an account
    count as one more sequence only if there are at least ``min_len`` of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``account_id`` column, one row per transaction.
    max_len : int, optional
        Chunk length. Defaults to the notebook-level ``max_seq_len``.
    min_len : int, optional
        Minimum number of leftover rows that still forms a sequence.
        Defaults to the notebook-level ``min_seq_len``.

    Returns
    -------
    int
        Number of full sequences plus number of qualifying partial sequences.

    Raises
    ------
    ValueError
        If ``max_len`` is not a positive number.
    """
    # Fall back to the notebook configuration so existing one-argument calls
    # keep working unchanged.
    if max_len is None:
        max_len = max_seq_len
    if min_len is None:
        min_len = min_seq_len
    if max_len <= 0:
        raise ValueError(f"max_len must be positive, got {max_len}")

    # One groupby pass; size() is the number of rows per account.
    rows_per_acct = df.groupby("account_id").size()

    n_full_seqs = (rows_per_acct // max_len).sum()
    # The remainder after removing the full chunks is rows % max_len.
    n_part_seqs = (rows_per_acct % max_len >= min_len).sum()

    return int(n_full_seqs + n_part_seqs)



        
        




def seq_to_inp_tensor(seq, inp_tensor, seq_i, seq_len):
    """Encode one sequence into row ``seq_i`` of ``inp_tensor``, in place.

    Fields are processed in ``DATA_KEY_ORDER``. A field listed in
    ``ONE_HOT_DIMS`` is one-hot encoded with ``tf.one_hot``; a field listed in
    ``CLOCK_FIELDS`` is passed to ``bulk_encode_time_value`` together with the
    value stored for it in ``CLOCK_FIELDS``; any other field is written as a
    single column. Each field fills the feature slice that starts at
    ``FIELD_STARTS[field]`` and is ``FIELD_DIMS[field]`` wide.

    Parameters
    ----------
    seq : indexable by field name (the notebook passes a dataframe slice)
        The rows of the sequence.
    inp_tensor : numpy.ndarray
        Destination array indexed as [sequence, timestep, feature].
    seq_i : int
        Index of the sequence row to fill.
    seq_len : int
        Number of leading timesteps to write.

    Returns
    -------
    None
    """
    for field in DATA_KEY_ORDER:
        width = FIELD_DIMS[field]
        column = seq[field]

        if field in ONE_HOT_DIMS:
            encoded = tf.one_hot(column, width).numpy()
        elif field in CLOCK_FIELDS:
            encoded = bulk_encode_time_value(column, CLOCK_FIELDS[field])
        else:
            # Plain value: add a trailing axis so it fills a width-1 slice.
            encoded = np.expand_dims(column, 1)

        offset = FIELD_STARTS[field]
        inp_tensor[seq_i, :seq_len, offset:offset + width] = encoded




def seq_to_targ_tensor(seq, tar_tensor, seq_i, seq_len):
    """Write one sequence's target values into row ``seq_i`` of ``tar_tensor``.

    Fields are processed in ``DATA_KEY_ORDER``. Values of a field listed in
    ``CLOCK_FIELDS`` are first reduced modulo the value stored for that field
    in ``CLOCK_FIELDS``; all other fields are written unchanged. Each field
    fills the feature slice that starts at ``FIELD_STARTS_TAR[field]`` and is
    ``FIELD_DIMS_TAR[field]`` wide. The array is modified in place.

    Parameters
    ----------
    seq : indexable by field name (the notebook passes a dataframe slice)
        The rows of the sequence.
    tar_tensor : numpy.ndarray
        Destination array indexed as [sequence, timestep, feature].
    seq_i : int
        Index of the sequence row to fill.
    seq_len : int
        Number of leading timesteps to write.

    Returns
    -------
    None
    """
    for field in DATA_KEY_ORDER:
        width = FIELD_DIMS_TAR[field]

        values = seq[field]
        if field in CLOCK_FIELDS:
            values = values % CLOCK_FIELDS[field]
        # Add a trailing axis so the values line up with the feature slice.
        column = np.expand_dims(values, 1)

        offset = FIELD_STARTS_TAR[field]
        tar_tensor[seq_i, :seq_len, offset:offset + width] = column

        
# Load the per-field layout for this dataset variant. seq_to_inp_tensor and
# seq_to_targ_tensor read these module-level names when they are called, so
# this line has to run before either function is used.
ONE_HOT_DIMS, FIELD_DIMS, FIELD_STARTS, FIELD_DIMS_TAR, FIELD_STARTS_TAR = get_field_info(ds_suffix)

In [8]:
# Width of each input field and their total, i.e. the input feature count.
FIELD_DIMS.values(), sum(FIELD_DIMS.values())

(dict_values([1, 2, 2, 2, 118, 1]), 126)

In [9]:
# Size the output arrays: one row per sequence, max_seq_len timesteps, and as
# many features as the input / target field layouts require.
n_seqs = count_seqs_in_df(df)
n_steps = max_seq_len
n_feat_inp = sum(FIELD_DIMS.values())
n_feat_tar = sum(FIELD_DIMS_TAR.values())

# Zero-filled, so timesteps beyond a short sequence's length stay at zero.
# NOTE(review): np.zeros defaults to float64; at the shape shown below the
# input tensor alone is roughly 3.2 GB -- consider float32 if the consumer of
# the saved files allows it.
inp_tensor = np.zeros((n_seqs, n_steps, n_feat_inp))
tar_tensor = np.zeros((n_seqs, n_steps, n_feat_tar))

inp_tensor.shape, tar_tensor.shape

((39487, 80, 126), (39487, 80, 6))

In [12]:
# Encode every account's transactions into fixed-length sequences.
#
# Fills inp_tensor / tar_tensor row by row, records one conditioning value per
# sequence in `attributes`, and maps each account_id to the sequence rows that
# came from it in `rows_per_acct`.
seq_i = 0
rows_per_acct = {}
alert_every = 2000  # print a progress line every this many sequences
# NOTE(review): `attribute` is never read below -- the conditioning value is
# taken from the raw "age" column. Confirm which column is intended before
# switching the lookup to group[attribute].
attribute = "age_sc"

# The concatenate at the end of this cell grows inp_tensor by one timestep.
# Re-running the cell without re-allocating would therefore write sequences
# over the conditioning step and prepend a second one, so fail loudly instead.
if inp_tensor.shape[1] != n_steps:
    raise RuntimeError(
        f"inp_tensor has {inp_tensor.shape[1]} timesteps, expected {n_steps}; "
        "re-run the tensor allocation cell before encoding again"
    )

attributes = np.zeros(n_seqs)
start_time = time.time()
for acct_id, group in df.groupby("account_id"):
    rows_per_acct[acct_id] = []

    # Per-account values that do not change between chunks.
    n_trs = len(group)
    acct_attribute = group["age"].iloc[0]

    # The "+ 1" covers the trailing partial chunk (possibly empty).
    for i in range(n_trs // max_seq_len + 1):
        start = i * max_seq_len
        seq_len = min(max_seq_len, n_trs - start)

        if seq_len < min_seq_len:
            continue

        seq = group.iloc[start:start + seq_len]
        seq_to_inp_tensor(seq, inp_tensor, seq_i, seq_len)
        seq_to_targ_tensor(seq, tar_tensor, seq_i, seq_len)
        attributes[seq_i] = acct_attribute

        rows_per_acct[acct_id].append(seq_i)
        seq_i += 1

        if seq_i % alert_every == 0:
            print(f"Finished encoding {seq_i} of {n_seqs} seqs")

# The tensors were sized from count_seqs_in_df; a mismatch would leave
# all-zero rows at the end without any other symptom.
if seq_i != n_seqs:
    raise RuntimeError(f"Encoded {seq_i} sequences but allocated {n_seqs}")

# Add conditioning info (attribute) to first timestep of inp: a new leading
# timestep is prepended in which every feature slot holds the attribute value.
inp_tensor = np.concatenate([np.repeat(attributes[:, None, None], n_feat_inp, axis=2),
                             inp_tensor],
                             axis=1)
print(f"Took {time.time() - start_time:.2f} secs")

Finished encoding 2000 of 39487 seqs
Finished encoding 4000 of 39487 seqs
Finished encoding 6000 of 39487 seqs
Finished encoding 8000 of 39487 seqs
Finished encoding 10000 of 39487 seqs
Finished encoding 12000 of 39487 seqs
Finished encoding 14000 of 39487 seqs
Finished encoding 16000 of 39487 seqs
Finished encoding 18000 of 39487 seqs
Finished encoding 20000 of 39487 seqs
Finished encoding 22000 of 39487 seqs
Finished encoding 24000 of 39487 seqs
Finished encoding 26000 of 39487 seqs
Finished encoding 28000 of 39487 seqs
Finished encoding 30000 of 39487 seqs
Finished encoding 32000 of 39487 seqs
Finished encoding 34000 of 39487 seqs
Finished encoding 36000 of 39487 seqs
Finished encoding 38000 of 39487 seqs
Took 117.34 secs


In [13]:
# The input tensor has one more timestep than the target because of the
# conditioning step prepended in the previous cell.
inp_tensor.shape, tar_tensor.shape, attributes.shape

((39487, 81, 126), (39487, 80, 6), (39487,))

In [14]:
# Persist the encoded arrays; np.save appends the ".npy" extension because the
# given names do not already end with it.
np.save(f"stored_data/inp_tensor-{ds_suffix}", inp_tensor)
np.save(f"stored_data/tar_tensor-{ds_suffix}", tar_tensor)
np.save(f"stored_data/attributes-{ds_suffix}", attributes)

In [15]:
# Persist the account_id -> list of sequence row indices mapping built while
# encoding, so sequences can be traced back to their account.
with open(f"stored_data/rows_per_acct-{ds_suffix}.pickle", "wb") as f:
    pickle.dump(rows_per_acct, f) 

In [16]:
# Echo the dataset suffix the files above were written under.
ds_suffix

'vf10k'