# Intro

This notebook takes a pre-processed dataframe and encodes the data so it can be used to train Banksformer.  

The encoded data will be a tensor of shape (n_samples, max_seq_len, feats_per_step).

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import time

import pickle
from datetime import date
import tensorflow as tf

## Setup

In [3]:
# Load the project's data encoder; its `field_info` mapping is read by the
# following cells (sequence-length limits and per-field layout tables).
from my_lib.encoding import load_data_encoder

data_encoder = load_data_encoder()
data_encoder

<my_lib.encoding.DataEncoder at 0x7fa721ebb490>

In [4]:
# Sequence-length limits stored in the encoder's field_info; they are used
# further down to cut each account's rows into sequences.
max_seq_len, min_seq_len = (
    data_encoder.field_info["training_max_seq_len"],
    data_encoder.field_info["training_min_seq_len"],
)

In [5]:
# Read the pre-processed transactions table and display it.
df = pd.read_csv("stored_data/final_df.csv")
df

Unnamed: 0.1,Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,client_id,age,...,year,dtme,td,log_amount,log_amount_sc,td_sc,age_sc,k_symbol_num,operation_num,type_num
0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,_nan,1,29,...,1995,7,0.0,3.000434,2.823750,0.000000,1.745524,0,0,0
1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,_nan,1,29,...,1995,17,20.0,3.565848,3.355869,3.298201,1.745524,0,1,0
2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,_nan,1,29,...,1995,7,10.0,4.100405,3.858949,1.649100,1.745524,0,0,0
3,162681,1,950430,CREDIT,_nan,19.2,17298.2,INTEREST CREDITED,1,29,...,1995,0,7.0,1.305351,1.228484,1.154370,1.745524,1,2,0
4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,_nan,1,29,...,1995,18,13.0,3.565848,3.355869,2.143831,1.745524,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,1033141,11382,981202,DEBIT,CASH WITHDRAWAL,25600.0,41114.4,_nan,13998,46,...,1998,29,2.0,4.408257,4.148672,0.329820,2.768763,0,3,1
1056316,1040574,11382,981210,CREDIT,COLLECTION FROM ANOTHER BANK,46248.0,87362.4,_nan,13998,46,...,1998,21,8.0,4.665102,4.390393,1.319280,2.768763,0,1,0
1056317,1050362,11382,981225,DEBIT,CASH WITHDRAWAL,6300.0,81062.4,_nan,13998,46,...,1998,6,15.0,3.799409,3.575677,2.473651,2.768763,0,3,1
1056318,1053037,11382,981231,CREDIT,_nan,311.3,81373.6,INTEREST CREDITED,13998,46,...,1998,0,6.0,2.494572,2.347677,0.989460,2.768763,1,2,0


In [6]:
# Names of the `data_encoder.field_info` entries that are exposed as
# notebook-level globals (the encoding helpers below read them by name).
vars_to_load =  ['DATA_KEY_ORDER', 'CLOCK_DIMS', 'INP_ENCODINGS', 'TAR_ENCODINGS']
vars_to_load += ['FIELD_DIMS_IN', 'FIELD_STARTS_IN', 'FIELD_DIMS_TAR', 'FIELD_STARTS_TAR', 'FIELD_DIMS_NET', 'FIELD_STARTS_NET']

for var in vars_to_load:
    # Echo the equivalent assignment so the cell output still lists every binding.
    cmd = f'{var} = data_encoder.field_info["{var}"]'
    print(cmd)
    # Bind the value directly in the notebook namespace instead of exec()-ing
    # source text built from a string; the resulting globals are the same.
    globals()[var] = data_encoder.field_info[var]

DATA_KEY_ORDER = data_encoder.field_info["DATA_KEY_ORDER"]
CLOCK_DIMS = data_encoder.field_info["CLOCK_DIMS"]
INP_ENCODINGS = data_encoder.field_info["INP_ENCODINGS"]
TAR_ENCODINGS = data_encoder.field_info["TAR_ENCODINGS"]
FIELD_DIMS_IN = data_encoder.field_info["FIELD_DIMS_IN"]
FIELD_STARTS_IN = data_encoder.field_info["FIELD_STARTS_IN"]
FIELD_DIMS_TAR = data_encoder.field_info["FIELD_DIMS_TAR"]
FIELD_STARTS_TAR = data_encoder.field_info["FIELD_STARTS_TAR"]
FIELD_DIMS_NET = data_encoder.field_info["FIELD_DIMS_NET"]
FIELD_STARTS_NET = data_encoder.field_info["FIELD_STARTS_NET"]


In [7]:



def count_seqs_in_df(df):
    """Count the sequences that `df` will be split into.

    Rows are grouped by ``account_id``. Every account contributes one sequence
    per complete run of ``max_seq_len`` rows, plus one more when the rows left
    over after those runs number at least ``min_seq_len``. Both limits are read
    from the notebook-level variables of the same name.

    Args:
        df: dataframe with an ``account_id`` column.

    Returns:
        Total number of full plus partial sequences.
    """
    rows_per_account = df.groupby("account_id")["account_id"].count()

    whole_chunks = rows_per_account // max_seq_len
    leftover_rows = rows_per_account - whole_chunks*max_seq_len

    n_full_seqs = sum(whole_chunks)
    n_part_seqs = sum(leftover_rows >= min_seq_len)

    return n_full_seqs + n_part_seqs






def seq_to_inp_tensor(seq, inp_tensor, seq_i, seq_len):
    """Encode one sequence into row ``seq_i`` of ``inp_tensor`` (modified in place).

    For every key ``k`` in ``DATA_KEY_ORDER`` the column ``seq[k]`` is encoded
    according to ``INP_ENCODINGS[k]``:

    * ``"oh"``  -- ``tf.one_hot`` with depth ``FIELD_DIMS_IN[k]``
    * ``"cl"``  -- ``bulk_encode_time_value`` with ``CLOCK_DIMS[k]`` as max value
    * ``"raw"`` -- the values passed through ``np.expand_dims(..., 1)``

    The result is written to the first ``seq_len`` steps of the row, in the
    feature slice that starts at ``FIELD_STARTS_IN[k]`` and is
    ``FIELD_DIMS_IN[k]`` wide.

    Raises:
        Exception: if a field's encoding type is none of the three above.
    """
    for k in DATA_KEY_ORDER:
        width = FIELD_DIMS_IN[k]
        offset = FIELD_STARTS_IN[k]
        enc_type = INP_ENCODINGS[k]

        # Reject unknown encodings up front so the branches below are exhaustive.
        if enc_type not in ("oh", "cl", "raw"):
            raise Exception(f"Got invalid enc_type: {enc_type}")

        if enc_type == "oh":
            encoded = tf.one_hot(seq[k], width).numpy()
        elif enc_type == "cl":
            clock_max = CLOCK_DIMS[k]
            encoded = bulk_encode_time_value(seq[k], clock_max)
        else:
            encoded = np.expand_dims(seq[k], 1)

        inp_tensor[seq_i, :seq_len, offset:offset + width] = encoded




def seq_to_targ_tensor(seq, tar_tensor, seq_i, seq_len):
    """Encode one sequence into row ``seq_i`` of ``tar_tensor`` (modified in place).

    For every key ``k`` in ``DATA_KEY_ORDER`` the column ``seq[k]`` is prepared
    according to ``TAR_ENCODINGS[k]``:

    * ``"cl-i"`` -- the values modulo ``CLOCK_DIMS[k]``
    * ``"raw"``  -- the values unchanged

    then passed through ``np.expand_dims(..., 1)`` and written to the first
    ``seq_len`` steps of the row, in the feature slice that starts at
    ``FIELD_STARTS_TAR[k]`` and is ``FIELD_DIMS_TAR[k]`` wide.

    Raises:
        Exception: if a field's encoding type is neither of the two above.
    """
    for k in DATA_KEY_ORDER:
        width = FIELD_DIMS_TAR[k]
        offset = FIELD_STARTS_TAR[k]
        enc_type = TAR_ENCODINGS[k]

        if enc_type == "raw":
            values = seq[k]
        elif enc_type == "cl-i":
            modulus = CLOCK_DIMS[k]
            values = seq[k] % modulus
        else:
            raise Exception(f"Got invalid enc_type: {enc_type}")

        # Both valid encodings store a single column per step.
        tar_tensor[seq_i, :seq_len, offset:offset + width] = np.expand_dims(values, 1)

        

### Create the tensors used for training & validation

In [8]:
# Array sizes: one row per sequence, max_seq_len steps, and a feature width
# equal to the summed per-field dims.
n_seqs = count_seqs_in_df(df)
n_steps = max_seq_len
n_feat_inp, n_feat_tar = (sum(dims.values()) for dims in (FIELD_DIMS_IN, FIELD_DIMS_TAR))

# Zero-filled: the encoders only write the first seq_len steps of each row,
# so the remaining steps stay 0.
inp_tensor, tar_tensor = (
    np.zeros((n_seqs, n_steps, width)) for width in (n_feat_inp, n_feat_tar)
)

inp_tensor.shape, tar_tensor.shape

((14354, 80, 27), (14354, 80, 9))

In [9]:
from my_lib.encoding import bulk_encode_time_value


seq_i = 0
rows_per_acct = {}
alert_every = 2000
# NOTE(review): `attribute` is never read in this cell -- the loop below takes
# the conditioning value from the raw "age" column, not "age_sc". Confirm which
# column is intended before relying on either.
attribute = "age_sc"

# Start from a fresh input tensor so this cell can be re-run safely: the
# concatenate at the bottom rebinds `inp_tensor` with one extra leading
# timestep, and encoding into that array again would overwrite the
# conditioning step and shift every sequence by one position.
inp_tensor = np.zeros((n_seqs, n_steps, n_feat_inp))

attributes = np.zeros(n_seqs)
start_time = time.time()
for acct_id, group in df.groupby("account_id"):
    rows_per_acct[acct_id] = []
    n_trs = len(group)

    # One iteration per complete chunk of max_seq_len rows, plus one for the
    # remainder (which is skipped below when shorter than min_seq_len).
    for i in range(n_trs // max_seq_len + 1):
        start = i*max_seq_len
        seq_len = min(max_seq_len, n_trs - start)

        if seq_len >= min_seq_len:
            seq = group.iloc[start:start+seq_len]
            seq_to_inp_tensor(seq, inp_tensor, seq_i, seq_len)
            seq_to_targ_tensor(seq, tar_tensor, seq_i, seq_len)
            # Conditioning value for this sequence: the account's first "age" entry.
            attributes[seq_i] = group["age"].iloc[0]

            rows_per_acct[acct_id].append(seq_i)
            seq_i += 1

            if seq_i % alert_every == 0:
                print(f"Finished encoding {seq_i} of {n_seqs} seqs")

# Every pre-allocated row must have been filled; otherwise the tensors would
# silently end with all-zero sequences.
assert seq_i == n_seqs, f"Encoded {seq_i} sequences but {n_seqs} rows were allocated"

# Add conditioning info (attribute) to first timestep of inp
inp_tensor = np.concatenate([np.repeat(attributes[:, None, None], n_feat_inp, axis=2), 
                             inp_tensor], 
                             axis=1)
print(f"Took {time.time() - start_time:.2f} secs")

2022-04-13 19:42:00.584096: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Finished encoding 2000 of 14354 seqs
Finished encoding 4000 of 14354 seqs
Finished encoding 6000 of 14354 seqs
Finished encoding 8000 of 14354 seqs
Finished encoding 10000 of 14354 seqs
Finished encoding 12000 of 14354 seqs
Finished encoding 14000 of 14354 seqs
Took 43.63 secs


In [10]:
# inp_tensor now has one more timestep than tar_tensor: the conditioning step
# was concatenated in front of it along axis 1 in the previous cell.
inp_tensor.shape, tar_tensor.shape, attributes.shape

((14354, 81, 27), (14354, 80, 9), (14354,))

### Split to train, cv datasets

In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces the same train/cv partition.
SPLIT_SEED = 42

# np.arange(n_seqs) is split alongside the tensors so that inds_tr / inds_cv
# record which sequence index ended up in which subset.
inp_tensor_tr, inp_tensor_cv, inds_tr, inds_cv, tar_tensor_tr, tar_tensor_cv = train_test_split(
    inp_tensor, np.arange(n_seqs), tar_tensor, test_size=0.2, random_state=SPLIT_SEED)

## Save

In [12]:
# (path, array) pairs, written in this order. np.save appends ".npy" to a
# path that does not already end with it.
arrays_to_save = [
    ("stored_data/inp_tensor", inp_tensor),
    ("stored_data/tar_tensor", tar_tensor),
    ("stored_data/attributes", attributes),
    ("stored_data/inp_tensor_tr", inp_tensor_tr),
    ("stored_data/tar_tensor_tr", tar_tensor_tr),
    ("stored_data/inp_tensor_cv", inp_tensor_cv),
    ("stored_data/tar_tensor_cv", tar_tensor_cv),
    ("stored_data/inds_tr", inds_tr),
    ("stored_data/inds_cv", inds_cv),
]

for save_path, array in arrays_to_save:
    np.save(save_path, array)

In [13]:
# Persist the account_id -> list of sequence row indices mapping built while encoding.
with open("stored_data/rows_per_acct.pickle", "wb") as pickle_file:
    pickle.dump(rows_per_acct, pickle_file)