In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import calendar

df = pd.read_csv('../DATA/tr_by_acct_w_age.csv')


def czech_date_parser(x):
    """Parse a single YYMMDD value (e.g. 950324) into a ``datetime``.

    Kept for ad-hoc, single-value use; the column itself is parsed in one
    vectorised call below.
    """
    return datetime.strptime(str(x), "%y%m%d")


# Vectorised parse of the YYMMDD "date" column (same format string as
# czech_date_parser, but without a Python-level call per row).
df["datetime"] = pd.to_datetime(df["date"].astype(str), format="%y%m%d")
# pd.to_datetime maps null-like strings to NaT instead of raising, so fail
# fast here to keep the strictness of the per-row strptime parse.
assert df["datetime"].notna().all(), "unparseable value in 'date' column"

stamp = df["datetime"].dt
# Calendar fields are wrapped so they can be used as zero-based category
# indices: December becomes month 0 and the 31st becomes day 0.
df["month"] = stamp.month % 12
df["day"] = stamp.day % 31
# dayofweek is already in 0..6 (Monday == 0), so no wrap is needed.
df["dow"] = stamp.dayofweek
df["year"] = stamp.year

# td - whole days elapsed since the previous row of the same account.
# The first row of every account has no predecessor (NaT) and is set to 0.
# NOTE(review): diff() uses row order, so this presumes rows are already in
# chronological order within each account - confirm against the CSV.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is a chained in-place update that pandas warns
# about and that has no effect once copy-on-write is enabled.
df["td"] = (
    df.groupby("account_id")["datetime"]
    .diff()
    .dt.days
    .fillna(0.0)
)


# dtme - days till month end
df["dtme"] = (df["datetime"].dt.days_in_month - df["datetime"].dt.day) % 31

# raw_amount - signed amount: positive for CREDIT rows, negative otherwise.
df['raw_amount'] = df['amount'].where(df['type'] == 'CREDIT', -df['amount'])


cat_code_fields = ['type', 'operation', 'k_symbol']
TCODE_SEP = "__"

# tcode is the string form of every field in "cat_code_fields", joined in
# list order with TCODE_SEP (missing values therefore appear as "nan").
first_field, *other_fields = cat_code_fields
tcode = df[first_field].astype(str).str.cat(
    [df[name].astype(str) for name in other_fields],
    sep=TCODE_SEP,
)

df["tcode"] = tcode

# Log-compress the amount; the +1 keeps a zero amount finite (log10(1) == 0).
df["log_amount"] = np.log10(df["amount"] + 1)

# Each *_SCALE constant is the standard deviation of its source column.
ATTR_SCALE, LOG_AMOUNT_SCALE, TD_SCALE = (
    df[column].std() for column in ("age", "log_amount", "td")
)

# Scaled columns are the raw column divided by its standard deviation
# (no mean-centring is applied).
df["age_sc"] = df["age"].div(ATTR_SCALE)
df["log_amount_sc"] = df["log_amount"].div(LOG_AMOUNT_SCALE)
df["td_sc"] = df["td"].div(TD_SCALE)

# For every categorical field build:
#   field_mappings["<FIELD>_TO_NUM"]  label -> integer code
#   field_mappings["NUM_TO_<FIELD>"]  integer code -> label
# and add an integer-coded "<field>_num" column to df.
field_mappings = {}
for field in cat_code_fields:
    # Normalise the labels FIRST: prefix "nan" and blank with '_' so they are
    # always interpreted as strings. Doing this before building the lookups
    # keeps the mapping keys identical to the values left in df[field]
    # (previously the keys held a float NaN while the column held "_nan").
    labels = df[field].astype(str)
    labels = labels.mask(labels.isin(["nan", ""]), "_" + labels)
    df[field] = labels

    # Codes are assigned in order of first appearance in the column.
    cat_to_num = {label: code for code, label in enumerate(labels.unique())}

    # Store both directions under upper-cased keys.
    field_mappings[f"{field}_to_num".upper()] = cat_to_num
    field_mappings[f"num_to_{field}".upper()] = {
        code: label for label, code in cat_to_num.items()
    }

    df[field + "_num"] = labels.map(cat_to_num)




# TCODE_TO_NUM = dict([(tc, i) for i, tc in enumerate(df['tcode'].unique())])
# NUM_TO_TCODE = dict([(i, tc) for i, tc in enumerate(df['tcode'].unique())])

# df['tcode' + "_num"] = df['tcode'].apply(lambda x: TCODE_TO_NUM[x])
# START_DATE = df["datetime"].min()

In [2]:
# List the lookup tables that were registered in field_mappings.
field_mappings.keys()

dict_keys(['TYPE_TO_NUM', 'NUM_TO_TYPE', 'OPERATION_TO_NUM', 'NUM_TO_OPERATION', 'K_SYMBOL_TO_NUM', 'NUM_TO_K_SYMBOL'])

In [6]:
# Number of distinct k_symbol categories (the missing-value category counts as one).
len(field_mappings['K_SYMBOL_TO_NUM'])

9

In [3]:
# Show the frame with all engineered columns added above.
df

Unnamed: 0.1,Unnamed: 0,column_a,account_id,date,type,operation,amount,balance,k_symbol,age,...,td,dtme,raw_amount,age_sc,log_amount,log_amount_sc,td_sc,type_num,operation_num,k_symbol_num
0,0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,_nan,29,...,0.0,7,1000.0,1.745524,3.000434,2.823750,0.000000,0,0,0
1,1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,_nan,29,...,20.0,17,3679.0,1.745524,3.565848,3.355869,3.298201,0,1,0
2,2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,_nan,29,...,10.0,7,12600.0,1.745524,4.100405,3.858949,1.649100,0,0,0
3,3,162681,1,950430,CREDIT,_nan,19.2,17298.2,INTEREST CREDITED,29,...,7.0,0,19.2,1.745524,1.305351,1.228484,1.154370,0,2,1
4,4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,_nan,29,...,13.0,18,3679.0,1.745524,3.565848,3.355869,2.143831,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,1056315,1033141,11382,981202,DEBIT,CASH WITHDRAWAL,25600.0,41114.4,_nan,46,...,2.0,29,-25600.0,2.768763,4.408257,4.148672,0.329820,1,3,0
1056316,1056316,1040574,11382,981210,CREDIT,COLLECTION FROM ANOTHER BANK,46248.0,87362.4,_nan,46,...,8.0,21,46248.0,2.768763,4.665102,4.390393,1.319280,0,1,0
1056317,1056317,1050362,11382,981225,DEBIT,CASH WITHDRAWAL,6300.0,81062.4,_nan,46,...,15.0,6,-6300.0,2.768763,3.799409,3.575677,2.473651,1,3,0
1056318,1056318,1053037,11382,981231,CREDIT,_nan,311.3,81373.6,INTEREST CREDITED,46,...,6.0,0,311.3,2.768763,2.494572,2.347677,0.989460,0,2,1


In [3]:
import sys
# NOTE(review): machine-specific absolute path; the "lib" package imported
# below is resolved from this directory, so the notebook will not run
# elsewhere without editing this line.
sys.path.insert(0, '/users/fs2/hmehri/pythonproject/Thesis/synthetic')

from lib.prepare_data import preprocess_data_czech
from lib.field_info import FieldInfo
from lib.field_info_v2 import FieldInfo_v2
from lib.tensor_encoder import TensorEncoder
import pandas as pd

2023-12-27 18:55:52.058776: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [4]:
raw_data = pd.read_csv('../DATA/tr_by_acct_w_age.csv')

# preprocess_data_czech returns the processed frame together with the scale
# constants, the start date and the tcode lookup tables.
(
    data,
    LOG_AMOUNT_SCALE,
    TD_SCALE,
    ATTR_SCALE,
    START_DATE,
    TCODE_TO_NUM,
    NUM_TO_TCODE,
) = preprocess_data_czech(raw_data)

# Keep only the columns used further down in the notebook.
selected_data_columns = data[[
    'account_id',
    'age',
    'age_sc',
    'tcode',
    'tcode_num',
    'datetime',
    'month',
    'dow',
    'day',
    'td',
    'dtme',
    'log_amount',
    'log_amount_sc',
    'td_sc',
]]
# Work on an independent copy so later edits do not touch `data`.
df = selected_data_columns.copy()


In [3]:
# Show the column subset selected from the preprocessed data.
df

Unnamed: 0,account_id,age,age_sc,tcode,tcode_num,datetime,month,dow,day,td,dtme,log_amount,log_amount_sc,td_sc
0,1,29,1.745524,CREDIT__CREDIT IN CASH__nan,0,1995-03-24,3,4,24,0.0,7,3.000434,2.823750,0.000000
1,1,29,1.745524,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1,1995-04-13,4,3,13,20.0,17,3.565848,3.355869,3.298201
2,1,29,1.745524,CREDIT__CREDIT IN CASH__nan,0,1995-04-23,4,6,23,10.0,7,4.100405,3.858949,1.649100
3,1,29,1.745524,CREDIT__nan__INTEREST CREDITED,2,1995-04-30,4,6,30,7.0,0,1.305351,1.228484,1.154370
4,1,29,1.745524,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1,1995-05-13,5,5,13,13.0,18,3.565848,3.355869,2.143831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,11382,46,2.768763,DEBIT__CASH WITHDRAWAL__nan,3,1998-12-02,0,2,2,2.0,29,4.408257,4.148672,0.329820
1056316,11382,46,2.768763,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1,1998-12-10,0,3,10,8.0,21,4.665102,4.390393,1.319280
1056317,11382,46,2.768763,DEBIT__CASH WITHDRAWAL__nan,3,1998-12-25,0,4,25,15.0,6,3.799409,3.575677,2.473651
1056318,11382,46,2.768763,CREDIT__nan__INTEREST CREDITED,2,1998-12-31,0,3,0,6.0,0,2.494572,2.347677,0.989460


In [5]:

# Sequence-length bounds handed to the encoder.
max_seq_len, min_seq_len = 80, 20

# Number of distinct transaction codes produced by preprocessing.
n_tcodes = len(TCODE_TO_NUM)

# Field layout using the 'dateonehot' configuration.
fieldInfo = FieldInfo_v2('dateonehot')

encoder = TensorEncoder(df, fieldInfo, max_seq_len, min_seq_len)
encoder.encode()


2023-12-27 18:56:35.858074: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-12-27 18:56:35.859177: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-12-27 18:56:35.892129: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:3b:00.0 name: Quadro RTX 8000 computeCapability: 7.5
coreClock: 1.77GHz coreCount: 72 deviceMemorySize: 47.45GiB deviceMemoryBandwidth: 625.94GiB/s
2023-12-27 18:56:35.892289: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:af:00.0 name: Quadro RTX 8000 computeCapability: 7.5
coreClock: 1.77GHz coreCount: 72 deviceMemorySize: 47.45GiB deviceMemoryBandwidth: 625.94GiB/s
2023-12-27 18:56:35.892305: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-12-27 18:5

Finished encoding 2000 of 14354 seqs
Finished encoding 4000 of 14354 seqs
Finished encoding 6000 of 14354 seqs
Finished encoding 8000 of 14354 seqs
Finished encoding 10000 of 14354 seqs
Finished encoding 12000 of 14354 seqs
Finished encoding 14000 of 14354 seqs
Took 25.80 secs


In [6]:
# Shape of the encoded input tensor.
encoder.inp_tensor.shape

(14354, 81, 99)

In [7]:
# Per-field input width, encoding name and start offset as held by fieldInfo.
fieldInfo.FIELD_DIMS_IN, fieldInfo.INP_ENCODINGS, fieldInfo.FIELD_STARTS_IN

({'tcode_num': 16,
  'dow': 7,
  'month': 12,
  'day': 31,
  'dtme': 31,
  'td_sc': 1,
  'log_amount_sc': 1},
 {'day': 'oh_day',
  'dtme': 'oh_dtme',
  'dow': 'oh_dow',
  'month': 'oh_month',
  'td_sc': 'raw',
  'log_amount_sc': 'raw',
  'tcode_num': 'oh_tcode'},
 {'tcode_num': 0,
  'dow': 16,
  'month': 23,
  'day': 35,
  'dtme': 66,
  'td_sc': 97,
  'log_amount_sc': 98})

In [6]:
import numpy as np
import time
# Scratch buffers sized from the field layout, plus bookkeeping variables.
n_seqs = 5
n_feat_inp = sum(fieldInfo.FIELD_DIMS_IN.values())
n_feat_tar = sum(fieldInfo.FIELD_DIMS_TAR.values())
inp_tensor = np.zeros((n_seqs, max_seq_len, n_feat_inp))
tar_tensor = np.zeros((n_seqs, max_seq_len, n_feat_tar))
attributes = np.zeros(n_seqs)
seq_i = 0
rows_per_acct = {}
alert_every = 2000
start_time = time.time()

# Only the first window of the first account is ever inspected, so fetch
# that one group directly instead of looping and breaking out immediately.
first_account = next(iter(df.groupby("account_id")), None)
if first_account is not None:
    acct_id, group = first_account
    rows_per_acct[acct_id] = []
    i = 0
    n_trs = len(group)
    start = i * max_seq_len
    seq_len = min(max_seq_len, n_trs - start)
    if seq_len >= min_seq_len:
        seq = group.iloc[start:start + seq_len]
        # Print the window, one of its columns, and their types.
        for shown in (seq, type(seq), seq['dow'], type(seq['dow'])):
            print(shown)

# Per-window encoding steps that are currently disabled:
            # seq_to_inp_tensor(group.iloc[start:start + seq_len], seq_i, seq_len)
            # self.seq_to_targ_tensor(group.iloc[start:start + seq_len], seq_i, seq_len)
            # self.attributes[seq_i] = group["age_sc"].iloc[0]
            # rows_per_acct[acct_id].append(seq_i)
            # seq_i += 1
            # if seq_i % alert_every == 0:
            #     print(f"Finished encoding {seq_i} of {self.n_seqs} seqs")

    account_id  age    age_sc                                         tcode  \
0            1   29  1.745524                   CREDIT__CREDIT IN CASH__nan   
1            1   29  1.745524     CREDIT__COLLECTION FROM ANOTHER BANK__nan   
2            1   29  1.745524                   CREDIT__CREDIT IN CASH__nan   
3            1   29  1.745524                CREDIT__nan__INTEREST CREDITED   
4            1   29  1.745524     CREDIT__COLLECTION FROM ANOTHER BANK__nan   
..         ...  ...       ...                                           ...   
75           1   29  1.745524                   DEBIT__CASH WITHDRAWAL__nan   
76           1   29  1.745524  DEBIT__CASH WITHDRAWAL__PAYMENT ON STATEMENT   
77           1   29  1.745524                CREDIT__nan__INTEREST CREDITED   
78           1   29  1.745524  DEBIT__REMITTANCE TO ANOTHER BANK__HOUSEHOLD   
79           1   29  1.745524     CREDIT__COLLECTION FROM ANOTHER BANK__nan   

    tcode_num   datetime  month  dow  day    td  dt

In [None]:
import tensorflow as tf
import numpy as np
def bulk_encode_time_value(val, max_val):
    """Map periodic values onto the unit circle.

    Every entry of ``val`` is converted to the angle ``2*pi/max_val * val``
    and returned as a ``(sin, cos)`` pair.

    Args:
        val: array of values to encode; the pairs are stacked along axis 1,
            so a 1-D input of length n yields an array of shape (n, 2).
        max_val: length of one full period.

    Returns:
        numpy array holding the sine and cosine of each angle.
    """
    angle = 2 * np.pi / max_val * val
    return np.stack([np.sin(angle), np.cos(angle)], axis=1)

# Number of positions on each cyclical feature's "clock".
CLOCK_DIMS = {"day": 31, "dtme": 31, "dow": 7, "month": 12}
Reord_CLOCKS = {}
# For every feature, the (sin, cos) coordinates of each of its positions
# 0 .. size-1, stored as a float32 tensor.
CLOCKS = {
    name: tf.constant(bulk_encode_time_value(np.arange(size), size), dtype=tf.float32)
    for name, size in CLOCK_DIMS.items()
}

In [5]:
def clock_to_probs(pt, pts):
    """Turn distances from ``pt`` to each row of ``pts`` into weights summing to 1.

    Each row's weight is the inverse of the sum over axis 1 of
    ``(row - pt + 0.01) ** 2``, divided by the total of all such weights.

    Args:
        pt: a single point; presumably a (sin, cos) pair - confirm with callers.
        pts: candidate points, one per row (e.g. an entry of ``CLOCKS``).

    Returns:
        The normalised inverse-squared-offset weights, one per row of ``pts``.
    """
    # The small constant is added to every offset component before squaring.
    eps = tf.constant(0.01, dtype=tf.float32)
    offsets = tf.constant(pts) - pt
    inv_sq = 1 / np.sum(tf.square(offsets + eps), axis=1)
    return inv_sq / np.sum(inv_sq)

In [7]:
# Weights of every 'day' clock position relative to the point (0, 1),
# which is the first row of CLOCKS['day'].
clock_to_probs([0,1], CLOCKS['day'])

0.99999994

: 

In [4]:
# Inspect the precomputed (sin, cos) tables for each cyclical feature.
CLOCKS

{'day': <tf.Tensor: shape=(31, 2), dtype=float32, numpy=
 array([[ 0.        ,  1.        ],
        [ 0.20129852,  0.9795299 ],
        [ 0.39435586,  0.9189578 ],
        [ 0.5712682 ,  0.82076347],
        [ 0.7247928 ,  0.68896693],
        [ 0.84864426,  0.528964  ],
        [ 0.9377521 ,  0.34730524],
        [ 0.98846835,  0.15142778],
        [ 0.99871653, -0.05064917],
        [ 0.9680771 , -0.25065252],
        [ 0.89780456, -0.44039416],
        [ 0.7907757 , -0.61210597],
        [ 0.6513725 , -0.7587581 ],
        [ 0.48530197, -0.8743466 ],
        [ 0.29936314, -0.95413923],
        [ 0.10116832, -0.99486935],
        [-0.10116832, -0.99486935],
        [-0.29936314, -0.95413923],
        [-0.48530197, -0.8743466 ],
        [-0.6513725 , -0.7587581 ],
        [-0.7907757 , -0.61210597],
        [-0.89780456, -0.44039416],
        [-0.9680771 , -0.25065252],
        [-0.99871653, -0.05064917],
        [-0.98846835,  0.15142778],
        [-0.9377521 ,  0.34730524],
       