# Intro

This notebook takes a pre-processed dataframe and encodes the data so it can be used to train Banksformer.


The input dataframe requires the following columns:
- tcode - String, encodes transaction type 
- amount - float, transaction amount (not log)
- account_id - int, associates transactions with account
- age - int, client's age
- datetime - datetime object, date of transaction
- day, month, dow - all ints, encode day, month and day of week
- td - int/float, time delta, encodes number of days since the last transaction 

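The encoded data will be tensors of shape (n_samples, max_seq_len, feats_per_step); the input tensor additionally gets one leading conditioning step, so it has max_seq_len + 1 steps.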

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import time

import pickle
from datetime import date
import tensorflow as tf

## Setup

In [3]:
# Dataset identifier interpolated into the stored_data/ file names used below.
# The templates already contain a "-", so "-czech" yields names such as
# final_df--czech.csv (double hyphen).
ds_suffix = "-czech"
# Upper bound on transactions per encoded sequence; longer account histories
# are cut into consecutive chunks of this length.
max_seq_len = 80
# A chunk with fewer transactions than this is not encoded (only the last
# chunk of an account can be shorter than max_seq_len).
min_seq_len = 20

### Load dataframe

In [4]:
# Read the pre-processed transactions; with ds_suffix = "-czech" the path
# resolves to stored_data/final_df--czech.csv.
# NOTE(review): no parse_dates is passed, so the "datetime" column is loaded as
# plain strings — confirm nothing downstream needs real datetime objects.
df = pd.read_csv(f"stored_data/final_df-{ds_suffix}.csv")
df

Unnamed: 0.1,Unnamed: 0,column_a,account_id,date,type,operation,amount,balance,k_symbol,age,tcode,datetime,month,day,dow,year,dtme,td
0,0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,,29,CREDIT__CREDIT IN CASH__nan,1995-03-24,3,24,4,1995,7,0.0
1,1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,,29,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1995-04-13,4,13,3,1995,17,20.0
2,2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,,29,CREDIT__CREDIT IN CASH__nan,1995-04-23,4,23,6,1995,7,10.0
3,3,162681,1,950430,CREDIT,,19.2,17298.2,INTEREST CREDITED,29,CREDIT__nan__INTEREST CREDITED,1995-04-30,4,30,6,1995,0,7.0
4,4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,,29,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1995-05-13,5,13,5,1995,18,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,1056315,1033141,11382,981202,DEBIT,CASH WITHDRAWAL,25600.0,41114.4,,46,DEBIT__CASH WITHDRAWAL__nan,1998-12-02,12,2,2,1998,29,2.0
1056316,1056316,1040574,11382,981210,CREDIT,COLLECTION FROM ANOTHER BANK,46248.0,87362.4,,46,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1998-12-10,12,10,3,1998,21,8.0
1056317,1056317,1050362,11382,981225,DEBIT,CASH WITHDRAWAL,6300.0,81062.4,,46,DEBIT__CASH WITHDRAWAL__nan,1998-12-25,12,25,4,1998,6,15.0
1056318,1056318,1053037,11382,981231,CREDIT,,311.3,81373.6,INTEREST CREDITED,46,CREDIT__nan__INTEREST CREDITED,1998-12-31,12,31,3,1998,0,6.0


## Ensure correct folders exist

In [5]:
# Working directories this project expects to find next to the notebook.
folders = ['generated_data',
 'stored_data',
 'checkpoints',
 'generation_results',
 'data',
 'my_lib']


# makedirs(exist_ok=True) creates a missing directory and is a no-op for an
# existing one, so no separate existence check (with its check-then-create
# race) is needed. It raises FileExistsError if the path exists but is a
# regular file, instead of silently skipping it.
for f in folders:
    os.makedirs(f, exist_ok=True)

## Encode

In [6]:
from my_lib.encoding import preprocess_df, bulk_encode_time_value
from field_config import CAT_FIELDS

# Run the project's preprocessing on df. The returned DataEncoder is not bound
# to a name; it only appears as the cell output. Per its printed log, the call
# also writes the encoding info to stored_data/DataEncoder-{ds_suffix}.pickle.
# NOTE(review): later cells read columns such as tcode_num, td_sc and
# log_amount_sc that the loaded CSV does not contain, so this call presumably
# adds them to df in place — confirm in my_lib/encoding.py.
preprocess_df(df, CAT_FIELDS, ds_suffix)

DATA_KEY_ORDER is ['tcode_num', 'dow', 'month', 'day', 'dtme', 'td_sc', 'log_amount_sc']
LOSS_TYPES are: day - scce, dtme - scce, dow - scce, month - scce, td_sc - pdf, log_amount_sc - pdf, tcode_num - scce
If this is not correct, edit field_config.py and re-run notebook
iniy!
Wrote encoding info to stored_data/DataEncoder--czech.pickle


<my_lib.encoding.DataEncoder at 0x7fb619f63610>

In [12]:
# from my_lib.field_config import *

from field_config import get_field_info, DATA_KEY_ORDER, CLOCK_DIMS, INP_ENCODINGS, TAR_ENCODINGS


def count_seqs_in_df(df, max_len=None, min_len=None):
    """Count how many encoded sequences the transactions in `df` will produce.

    Each account's rows are cut into consecutive chunks of `max_len` rows.
    Every full chunk counts as one sequence; the rows left over at the end
    count as one more sequence only if there are at least `min_len` of them.

    Args:
        df: DataFrame with an "account_id" column, one row per transaction.
        max_len: Chunk length. Defaults to the notebook-level `max_seq_len`.
        min_len: Minimum number of leftover rows for the trailing partial
            chunk to be kept. Defaults to the notebook-level `min_seq_len`.

    Returns:
        int: Total number of sequences over all accounts.

    Raises:
        ValueError: If `max_len` is not a positive number.
    """
    # Fall back to the notebook configuration only when no explicit limit is
    # given, so the function can also be used (and tested) without globals.
    if max_len is None:
        max_len = max_seq_len
    if min_len is None:
        min_len = min_seq_len
    if max_len <= 0:
        raise ValueError(f"max_len must be positive, got {max_len}")

    # One groupby pass; the original evaluated this count twice.
    trs_per_acct = df.groupby("account_id")["account_id"].count()

    n_full_seqs = (trs_per_acct // max_len).sum()
    # count % max_len is exactly count - (count // max_len) * max_len.
    n_part_seqs = ((trs_per_acct % max_len) >= min_len).sum()

    return int(n_full_seqs + n_part_seqs)






def seq_to_inp_tensor(seq, inp_tensor, seq_i, seq_len):
    """Encode one sequence and write it into row `seq_i` of `inp_tensor`.

    For every field in DATA_KEY_ORDER the column `seq[field]` is encoded
    according to INP_ENCODINGS[field] and stored in the feature slice that
    starts at FIELD_STARTS_IN[field] and spans FIELD_DIMS_IN[field] entries,
    for the first `seq_len` steps. `inp_tensor` is modified in place.

    Encodings:
        "oh"  - one-hot via tf.one_hot, using the field's width as depth.
        "cl"  - bulk_encode_time_value with the field's CLOCK_DIMS entry.
        "raw" - the values themselves, as a single feature column.

    Raises:
        Exception: If a field has any other encoding type.
    """
    for k in DATA_KEY_ORDER:
        width = FIELD_DIMS_IN[k]
        offset = FIELD_STARTS_IN[k]
        enc_type = INP_ENCODINGS[k]

        if enc_type == "raw":
            encoded = np.expand_dims(seq[k], 1)
        elif enc_type == "cl":
            period = CLOCK_DIMS[k]
            encoded = bulk_encode_time_value(seq[k], period)
        elif enc_type == "oh":
            encoded = tf.one_hot(seq[k], width).numpy()
        else:
            raise Exception(f"Got invalid enc_type: {enc_type}")

        feat_slice = slice(offset, offset + width)
        inp_tensor[seq_i, :seq_len, feat_slice] = encoded




def seq_to_targ_tensor(seq, tar_tensor, seq_i, seq_len):
    """Encode one sequence and write it into row `seq_i` of `tar_tensor`.

    For every field in DATA_KEY_ORDER the column `seq[field]` is encoded
    according to TAR_ENCODINGS[field] and stored in the feature slice that
    starts at FIELD_STARTS_TAR[field] and spans FIELD_DIMS_TAR[field] entries,
    for the first `seq_len` steps. `tar_tensor` is modified in place.

    Encodings:
        "cl-i" - the values modulo the field's CLOCK_DIMS entry, as a single
                 feature column.
        "raw"  - the values themselves, as a single feature column.

    Raises:
        Exception: If a field has any other encoding type.
    """
    for k in DATA_KEY_ORDER:
        width = FIELD_DIMS_TAR[k]
        offset = FIELD_STARTS_TAR[k]
        enc_type = TAR_ENCODINGS[k]

        if enc_type == "raw":
            encoded = np.expand_dims(seq[k], 1)
        elif enc_type == "cl-i":
            period = CLOCK_DIMS[k]
            encoded = np.expand_dims(seq[k] % period, 1)
        else:
            raise Exception(f"Got invalid enc_type: {enc_type}")

        feat_slice = slice(offset, offset + width)
        tar_tensor[seq_i, :seq_len, feat_slice] = encoded

        
# Per-field lookup tables: the *_DIMS_* dicts give each field's width and the
# *_STARTS_* dicts its start offset along the feature axis (that is how the two
# encoders above index with them). They are assigned after those functions,
# which is fine because the names are only looked up when the functions run.
# NOTE(review): the *_NET pair is not used in this notebook — presumably the
# network-output layout; confirm in field_config.py.
FIELD_DIMS_IN, FIELD_STARTS_IN, FIELD_DIMS_TAR, FIELD_STARTS_TAR, FIELD_DIMS_NET, FIELD_STARTS_NET = get_field_info(ds_suffix)

In [15]:
# Number of rows to allocate; count_seqs_in_df applies the same chunking rule
# (full chunks plus a long-enough trailing chunk) as the encoding loop below.
n_seqs = count_seqs_in_df(df)
n_steps = max_seq_len
# Feature width per timestep = sum of the per-field widths.
n_feat_inp = sum(FIELD_DIMS_IN.values())
n_feat_tar = sum(FIELD_DIMS_TAR.values())

# Zero-initialised buffers: steps beyond a sequence's own length are never
# written by the encoders and therefore stay 0.
inp_tensor = np.zeros((n_seqs, n_steps, n_feat_inp))
tar_tensor = np.zeros((n_seqs, n_steps, n_feat_tar))

inp_tensor.shape, tar_tensor.shape

((14354, 80, 26), (14354, 80, 7))

In [16]:
seq_i = 0
rows_per_acct = {}  # account_id -> list of row indices (first tensor axis) for that account
alert_every = 2000
# NOTE(review): `attribute` is not read anywhere in this cell — the
# conditioning value below is taken from the raw "age" column, not "age_sc".
# Confirm which of the two is intended.
attribute = "age_sc"

# Start from fresh buffers so that re-running this cell is safe. The
# concatenate at the end rebinds inp_tensor to an array with n_steps + 1 steps;
# encoding into that array on a second run would overwrite the conditioning
# step and then prepend yet another one.
inp_tensor = np.zeros((n_seqs, n_steps, n_feat_inp))
tar_tensor = np.zeros((n_seqs, n_steps, n_feat_tar))
attributes = np.zeros(n_seqs)

start_time = time.time()
for acct_id, group in df.groupby("account_id"):
    rows_per_acct[acct_id] = []
    n_trs = len(group)

    # Walk the account's transactions in chunks of max_seq_len; the final
    # chunk may be shorter and is kept only if it has >= min_seq_len rows.
    for i in range(n_trs // max_seq_len + 1):
        start = i*max_seq_len
        seq_len = min(max_seq_len, n_trs - start)

        if seq_len >= min_seq_len:
            seq = group.iloc[start:start+seq_len]
            seq_to_inp_tensor(seq, inp_tensor, seq_i, seq_len)
            seq_to_targ_tensor(seq, tar_tensor, seq_i, seq_len)
            # Every sequence of an account carries that account's first "age" value.
            attributes[seq_i] = group["age"].iloc[0]

            rows_per_acct[acct_id].append(seq_i)
            seq_i += 1

            if seq_i % alert_every == 0:
                print(f"Finished encoding {seq_i} of {n_seqs} seqs")

# The buffers were sized from count_seqs_in_df; if fewer sequences were encoded
# the trailing rows would silently remain all-zero.
assert seq_i == n_seqs, f"Encoded {seq_i} sequences but allocated {n_seqs}"

# Add conditioning info (attribute) to first timestep of inp: the attribute is
# repeated across all n_feat_inp features and prepended as an extra step.
inp_tensor = np.concatenate([np.repeat(attributes[:, None, None], n_feat_inp, axis=2),
                             inp_tensor],
                             axis=1)
print(f"Took {time.time() - start_time:.2f} secs")

Finished encoding 2000 of 14354 seqs
Finished encoding 4000 of 14354 seqs
Finished encoding 6000 of 14354 seqs
Finished encoding 8000 of 14354 seqs
Finished encoding 10000 of 14354 seqs
Finished encoding 12000 of 14354 seqs
Finished encoding 14000 of 14354 seqs
Took 97.21 secs


In [17]:
# Sanity check: inp_tensor has one step more than tar_tensor because the
# conditioning step was prepended to it; attributes holds one value per sequence.
inp_tensor.shape, tar_tensor.shape, attributes.shape

((14354, 81, 26), (14354, 80, 7), (14354,))

## Save

In [18]:
# np.save appends ".npy" to a file name that does not already end with it, so
# with ds_suffix = "-czech" these write e.g. stored_data/inp_tensor--czech.npy.
np.save(f"stored_data/inp_tensor-{ds_suffix}", inp_tensor)
np.save(f"stored_data/tar_tensor-{ds_suffix}", tar_tensor)
np.save(f"stored_data/attributes-{ds_suffix}", attributes)

In [19]:
# Persist the account_id -> sequence-row-indices mapping built by the encoding
# loop, so saved tensor rows can be traced back to their account.
with open(f"stored_data/rows_per_acct-{ds_suffix}.pickle", "wb") as f:
    pickle.dump(rows_per_acct, f) 

In [22]:
# Reload the saved arrays as a round-trip check. The paths are built from
# ds_suffix, mirroring the np.save calls, so they keep pointing at the files
# just written when the dataset suffix changes (np.save added the ".npy").
inp_tensor = np.load(f"stored_data/inp_tensor-{ds_suffix}.npy")
tar_tensor = np.load(f"stored_data/tar_tensor-{ds_suffix}.npy")
attributes = np.load(f"stored_data/attributes-{ds_suffix}.npy")