# Notes

Your DataFrame needs the following columns, with one row per transaction:  
 - amount (real-valued transaction amount)  
 - datetime (must be parseable into datetimes by the `parse_dates` argument of `pandas.read_csv`)  
 - account_id (categorical or integer; used to associate transactions with accounts)   

To set the datetime field in the Czech dataset, use the following:  

```python
from datetime import datetime
czech_date_parser = lambda x: datetime.strptime(str(x), "%y%m%d")
df["datetime"] = df["date"].apply(czech_date_parser)
```

In [1]:
from my_lib.encoding import DataEncoder
import pandas as pd

# Things to set

In [2]:
# Input CSV and the name of its timestamp column (parsed when the file is read).
real_dataset = "data/tr_by_acct_w_age.csv"
datetime_field = "datetime"

# Categorical fields handed to the DataEncoder.
CAT_FIELDS = [
    "k_symbol_num",
    "operation_num",
    "type_num",
]

# Sequence-length bounds that are stored in the field configuration.
training_min_seq_len, training_max_seq_len = 20, 80

# Run to create config object

In [3]:
# Load the transactions, letting pandas parse the timestamp column while reading.
df = pd.read_csv(
    real_dataset,
    parse_dates=[datetime_field],
)
df.head()

Unnamed: 0.1,Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,client_id,age,datetime
0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,,1,29,1995-03-24
1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,,1,29,1995-04-13
2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,,1,29,1995-04-23
3,162681,1,950430,CREDIT,,19.2,17298.2,INTEREST CREDITED,1,29,1995-04-30
4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,,1,29,1995-05-13


In [4]:
# Build the project encoder for the categorical fields and run it over the frame.
# NOTE(review): the return value of fit_transform is discarded, yet the preview
# below shows extra columns on `df` (e.g. log_amount_sc, td_sc, *_num), so the
# call presumably mutates `df` in place -- confirm against my_lib.encoding.
data_encoder = DataEncoder(CAT_FIELDS)
data_encoder.fit_transform(df)
df.head()

Unnamed: 0.1,Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,client_id,age,...,year,dtme,td,log_amount,log_amount_sc,td_sc,age_sc,k_symbol_num,operation_num,type_num
0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,_nan,1,29,...,1995,7,0.0,3.000434,2.82375,0.0,1.745524,0,0,0
1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,_nan,1,29,...,1995,17,20.0,3.565848,3.355869,3.298201,1.745524,0,1,0
2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,_nan,1,29,...,1995,7,10.0,4.100405,3.858949,1.6491,1.745524,0,0,0
3,162681,1,950430,CREDIT,_nan,19.2,17298.2,INTEREST CREDITED,1,29,...,1995,0,7.0,1.305351,1.228484,1.15437,1.745524,1,2,0
4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,_nan,1,29,...,1995,18,13.0,3.565848,3.355869,2.143831,1.745524,0,1,0


In [5]:
# Field groups used to assemble the field configuration.
DATE_FIELDS = [
    "dow",
    "month",
    "day",
    "dtme",
    "td_sc",
]
CONT_FIELDS = ["log_amount_sc"]
# NOTE(review): presumably fields that are model inputs but never outputs -- confirm.
NON_OUT_FIELDS = ["bal_sc"]

In [6]:
# Separator string stored in the field configuration as TCODE_SEP.
TCODE_SEP = "__"

# Canonical field order: categorical fields, then date fields, then continuous fields.
DATA_KEY_ORDER = [*CAT_FIELDS, *DATE_FIELDS, *CONT_FIELDS]

In [7]:
# Loss shared by the four calendar fields.
date_loss = "scce"

# Loss type per field. Insertion order is kept as-is because the mapping is
# printed later in the notebook.
LOSS_TYPES = dict.fromkeys(("day", "dtme", "dow", "month"), date_loss)
LOSS_TYPES.update(td_sc="pdf", log_amount_sc="pdf")
# "tcode_num": "scce"  -- entry is disabled (commented out)


In [8]:

# Encoding legend
#   cl   - clock encoding (2d)
#   oh   - one-hot encoding
#   raw  - no encoding
#   cl-i - clock integer: transforms [1, 2, ..., n] -> [1, 2, ..., n-1, 0]

# How each field is encoded on the input side.
INP_ENCODINGS = {
    **dict.fromkeys(("day", "dtme", "dow", "month"), "cl"),
    **dict.fromkeys(("td_sc", "log_amount_sc"), "raw"),
    "tcode_num": "oh",
}

# How each field is encoded on the target side.
TAR_ENCODINGS = {
    **dict.fromkeys(("day", "dtme", "dow", "month"), "cl-i"),
    **dict.fromkeys(("td_sc", "log_amount_sc", "tcode_num"), "raw"),
}







In [9]:
# Number of positions on the "clock" for each cyclic date field.
CLOCK_DIMS = {
    "day": 31,
    "dtme": 31,
    "dow": 7,
    "month": 12,
}

# Register every categorical field: "scce" loss, one-hot input, raw target.
for cat_field in CAT_FIELDS:
    LOSS_TYPES[cat_field] = "scce"
    INP_ENCODINGS[cat_field] = "oh"
    TAR_ENCODINGS[cat_field] = "raw"

# Per-field widths of the input vector, the target vector and the network output.
FIELD_DIMS_IN, FIELD_DIMS_TAR, FIELD_DIMS_NET = {}, {}, {}

# Width contributed by each encoding type. One-hot has no fixed width, so it is
# None here; the categorical fields receive their real width in a later cell
# from data_encoder.get_n_cats.
ENCODING_INP_DIMS_BY_TYPE = {"cl": 2, "oh": None, "raw": 1}
ENCODING_TAR_DIMS_BY_TYPE = {"cl-i": 1, "raw": 1}

for key in DATA_KEY_ORDER:
    FIELD_DIMS_IN[key] = ENCODING_INP_DIMS_BY_TYPE[INP_ENCODINGS[key]]

    target_encoding = TAR_ENCODINGS[key]
    FIELD_DIMS_TAR[key] = ENCODING_TAR_DIMS_BY_TYPE[target_encoding]

    if target_encoding not in ("raw", "cl-i"):
        raise Exception(f"Error getting network dim for field = {key}")

    # Raw targets get 2 network outputs (presumably the parameters of the
    # predicted distribution -- confirm); clock-integer targets get one output
    # per clock position.
    FIELD_DIMS_NET[key] = 2 if target_encoding == "raw" else CLOCK_DIMS[key]
    
    

In [10]:

print("DATA_KEY_ORDER is", DATA_KEY_ORDER)
print("LOSS_TYPES are:", ", ".join([f"{x} - {y}" for x,y in LOSS_TYPES.items()]))
# NOTE(review): the message below points at field_config.py, but the settings it
# refers to are defined in the cells above -- confirm which is intended.
print("If this is not correct, edit field_config.py and re-run notebook")


def _cumulative_starts(dims, key_order):
    """Return the start offset of every field when laid out in `key_order`.

    Args:
        dims: mapping of field name -> width of that field.
        key_order: iterable of field names in layout order.

    Returns:
        dict mapping each field name to the sum of the widths of all fields
        that precede it in `key_order` (so the first field starts at 0).

    Raises:
        KeyError: if a field in `key_order` has no entry in `dims`.
    """
    starts = {}
    offset = 0
    for key in key_order:
        starts[key] = offset
        offset += dims[key]
    return starts


# Replace the placeholder widths of the categorical fields with their actual
# category counts. Their LOSS_TYPES / INP_ENCODINGS / TAR_ENCODINGS entries are
# already registered in the cell that defines CLOCK_DIMS, so they are not
# re-assigned here.
for field in CAT_FIELDS:
    n = data_encoder.get_n_cats(field)
    FIELD_DIMS_IN[field] = n
    FIELD_DIMS_NET[field] = n

# Start offsets of each field within the input, target and network-output vectors.
FIELD_STARTS_IN = _cumulative_starts(FIELD_DIMS_IN, DATA_KEY_ORDER)
FIELD_STARTS_TAR = _cumulative_starts(FIELD_DIMS_TAR, DATA_KEY_ORDER)
FIELD_STARTS_NET = _cumulative_starts(FIELD_DIMS_NET, DATA_KEY_ORDER)



    







DATA_KEY_ORDER is ['k_symbol_num', 'operation_num', 'type_num', 'dow', 'month', 'day', 'dtme', 'td_sc', 'log_amount_sc']
LOSS_TYPES are: day - scce, dtme - scce, dow - scce, month - scce, td_sc - pdf, log_amount_sc - pdf, k_symbol_num - scce, operation_num - scce, type_num - scce
If this is not correct, edit field_config.py and re-run notebook


In [11]:
# Names of the notebook-level configuration variables to bundle into field_info.
for_fi = ['CAT_FIELDS', 'DATE_FIELDS', 'CONT_FIELDS', 'NON_OUT_FIELDS', 'LOSS_TYPES', 'TCODE_SEP', 'DATA_KEY_ORDER']
for_fi += ['FIELD_DIMS_IN', 'FIELD_STARTS_IN', 'FIELD_DIMS_TAR', 'FIELD_STARTS_TAR', 'FIELD_DIMS_NET', 'FIELD_STARTS_NET']
for_fi += ['training_min_seq_len', 'training_max_seq_len', 'CLOCK_DIMS', 'INP_ENCODINGS', 'TAR_ENCODINGS']

# Maps each variable name to the object it is currently bound to.
field_info = {}

for var in for_fi:
    # Log which binding is being recorded (same text the cell has always printed).
    print(f"field_info['{var}'] = {var}")
    # Look the variable up by name in the notebook namespace rather than
    # building an assignment string and exec()-ing it.
    if var not in globals():
        raise NameError(f"name '{var}' is not defined")
    field_info[var] = globals()[var]


field_info['CAT_FIELDS'] = CAT_FIELDS
field_info['DATE_FIELDS'] = DATE_FIELDS
field_info['CONT_FIELDS'] = CONT_FIELDS
field_info['NON_OUT_FIELDS'] = NON_OUT_FIELDS
field_info['LOSS_TYPES'] = LOSS_TYPES
field_info['TCODE_SEP'] = TCODE_SEP
field_info['DATA_KEY_ORDER'] = DATA_KEY_ORDER
field_info['FIELD_DIMS_IN'] = FIELD_DIMS_IN
field_info['FIELD_STARTS_IN'] = FIELD_STARTS_IN
field_info['FIELD_DIMS_TAR'] = FIELD_DIMS_TAR
field_info['FIELD_STARTS_TAR'] = FIELD_STARTS_TAR
field_info['FIELD_DIMS_NET'] = FIELD_DIMS_NET
field_info['FIELD_STARTS_NET'] = FIELD_STARTS_NET
field_info['training_min_seq_len'] = training_min_seq_len
field_info['training_max_seq_len'] = training_max_seq_len
field_info['CLOCK_DIMS'] = CLOCK_DIMS
field_info['INP_ENCODINGS'] = INP_ENCODINGS
field_info['TAR_ENCODINGS'] = TAR_ENCODINGS


In [12]:
# Attach the assembled field configuration to the encoder so that it is
# pickled together with it when the encoder is written out below.
data_encoder.field_info = field_info

# Write output data 

In [13]:
import os
import pickle

# Directories created by this notebook, relative to the current working directory.
folders = ['generated_data',
 'stored_data',
 'training_history',
 'data',]


for folder in folders:
    # exist_ok=True keeps the cell re-runnable and avoids the check-then-create
    # race of os.path.exists() + os.mkdir(). Unlike the exists() check, it also
    # fails loudly (FileExistsError) if the path is taken by a regular file.
    os.makedirs(folder, exist_ok=True)

In [14]:
# Save the encoded frame without the pandas index column.
df.to_csv("stored_data/final_df.csv", index=False)

In [15]:



# Persist the encoder. The former guard that raised when DataEncoder.pickle
# already existed is disabled, so an existing pickle is overwritten.
encoder_path = "stored_data/DataEncoder.pickle"
with open(encoder_path, "wb") as out_file:
    pickle.dump(data_encoder, out_file)
print("Wrote encoding info to", encoder_path)

Wrote encoding info to stored_data/DataEncoder.pickle
