## Setup

This notebook can train a model to generate synthetic data.  
Ensure that 'ds_suffix' matches the one used to generate the dataset (under "Set input dataset" and in the create_dataset notebook).  
Parameters for generating data (seq_len, number of seqs) are set under "Set generate info" near the top and are used near the bottom (under "Generate Full dataset").

In [1]:
import logging
import os
import sys
import time
import datetime

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf
import seaborn as sns

In [2]:
logging.getLogger('tensorflow').setLevel(logging.ERROR)  # suppress warnings

## Set generate info

In [3]:
seq_len = 80
n_seqs_to_generate = None    # if None do same as # of seqs in dataset with ds_suffix (below)
age_distribution = None      # dict mapping age->frequency, if None use same as dataset 

### Set input dataset and nb_id (Must match the info in the notebook used to train the Transformer)

In [4]:
from my_lib.field_config import *
ds_suffix = "vfdata"
nb_id = "vf1"


DATA_KEY_ORDER is ['td_sc', 'month', 'day', 'dow', 'tcode_num', 'log_amount_sc']
If this is not correct, edit my_lib/field_config.py and re-run notebook


## Load data

### Set activations

In [5]:
# Any fields not here will have activation=None
ACTIVATIONS = {
    "td_sc": "relu",
    "log_amount_sc": "relu"
}

# Training

## Loss

In [6]:
from tensorflow.keras.losses import CategoricalCrossentropy, MeanSquaredError, SparseCategoricalCrossentropy


# All three losses are built with reduction='none', i.e. Keras does not apply its
# default batch-level reduction to the returned loss values.

# Sparse categorical cross-entropy for predictions given as raw logits.
loss_scce_logit = SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# Sparse categorical cross-entropy for predictions given as probabilities.
loss_scce_probit = SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

# Mean squared error; referenced by td_loss_fns in the "Training Loop" cell.
loss_mse = MeanSquaredError(reduction='none')



def create_padding_mask(seq):
    """Flag padded time steps of a batch of sequences.

    A step counts as padding when the sum of its features (axis 2) is exactly
    zero. Flagged steps are 1.0, all others 0.0.
    """
    feature_totals = tf.reduce_sum(seq, axis=2)
    is_padding = tf.cast(tf.math.equal(feature_totals, 0), tf.float32)

    # Two singleton axes are inserted so the mask broadcasts against
    # the attention logits.
    return is_padding[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    """Return a (size, size) mask that is 1 strictly above the diagonal and 0 elsewhere."""
    # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle  # (seq_len, seq_len)

In [7]:
from my_lib.encoding import bulk_encode_time_value

# Offset added to every coordinate difference in clock_to_probs before squaring.
EPS_CLOCKP = 0.01

# For each clock field k with `val` possible values, store the encodings of
# 0 .. val-1 (as produced by bulk_encode_time_value) as a float32 tensor.
CLOCKS = {}
for k, val in CLOCK_FIELDS.items():
    CLOCKS[k] = tf.constant(bulk_encode_time_value(np.arange(val), val), dtype=tf.float32)

def clock_to_probs(pt, pts):
    """Weight every reference point in `pts` by its closeness to `pt`.

    The weight of a reference point is the inverse of the sum over axis 1 of
    (difference + EPS_CLOCKP) squared; the weights are then normalised so
    that they sum to 1.

    pt  -- a single encoded point (presumably a 2-component clock encoding -- TODO confirm)
    pts -- reference points, one per row
    """
    
    ds = tf.constant(pts) - pt
    # NOTE(review): the epsilon is added *before* squaring, so a coordinate
    # difference of exactly -EPS_CLOCKP contributes zero -- confirm this is intended.
    sq_ds = np.sum(tf.square(ds+EPS_CLOCKP), axis=1)
    raw_ps = 1/ sq_ds   
    
    return raw_ps / np.sum(raw_ps)



def clock_to_onehot(k, vals):
    """Convert clock encodings of field `k` into per-value probabilities.

    The last axis of `vals` holds one encoding per entry. All leading axes are
    preserved; the last axis is replaced by the output of clock_to_probs
    evaluated against the reference encodings CLOCKS[k].
    """
    leading_dims = vals.shape[:-1]
    flat_points = tf.reshape(vals, (-1, vals.shape[-1]))

    per_point_probs = np.array(
        [clock_to_probs(point, CLOCKS[k]) for point in flat_points]
    )
    return per_point_probs.reshape(*leading_dims, -1)



In [8]:
# Dictionary passed to the Transformer constructor as `config=` in the
# "Training Loop" cell.
config = {}

# Initial architecture hyperparameters. num_layers_dec, d_model, dff and
# num_heads are reassigned in the "Training Loop" cell before the model is built.
num_layers_enc = 2
num_layers_dec = 1
d_model = 8
dff = 32
num_heads = 2
# NOTE(review): the model below is built with `rate=dr`; dropout_rate is not
# referenced again in the visible cells.
dropout_rate = 0.1

# Field ordering / layout constants (presumably provided by the star import
# from my_lib.field_config -- TODO confirm) plus the activations chosen above.
config["PRE_DATE_ORDER"] = PRE_DATE_ORDER
config["DATE_ORDER"] = DATE_ORDER
config["POST_DATE_ORDER"] = POST_DATE_ORDER
config["FIELD_STARTS"] = FIELD_STARTS
config["FIELD_DIMS"] = FIELD_DIMS
config["ACTIVATIONS"] = ACTIVATIONS

In [9]:
# n_feat_inp is passed to the Transformer as `inp_dim` and is the width of each
# encoded step built in generate_seqs.
# NOTE(review): 69 presumably equals the summed widths in FIELD_DIMS and 6 the
# number of fields -- confirm against my_lib/field_config.py when fields change.
n_feat_inp = 69
n_feat_tar = 6

In [10]:
# NOTE(review): this resets age_distribution and therefore discards any value
# chosen under "Set generate info" above -- confirm that this is intended.
age_distribution = None

In [11]:
# Load the per-sequence attribute array once. It is always bound here so that
# later cells can use `attributes` regardless of which defaults are filled in.
attributes = np.load(f"stored_data/attributes-{ds_suffix}.npy")

# Default: generate as many sequences as the source dataset contains.
if n_seqs_to_generate is None:
    n_seqs_to_generate = len(attributes)

# Default: reuse the dataset's empirical distribution (value -> relative frequency).
if age_distribution is None:
    ages_seen, age_counts = np.unique(attributes, return_counts=True)
    age_distribution = dict(zip(ages_seen, age_counts / len(attributes)))

## Training Loop 

In [12]:
from my_lib.BanksformerGen import Transformer

# EARLY_STOP is used below as `max_to_keep` of the CheckpointManager.
EARLY_STOP = 2
# NOTE(review): EPOCHS is not referenced in the visible cells of this notebook.
EPOCHS = 80

# opt_name only appears in the run identifier; dr is passed to the model as `rate`.
opt_name = "adam"
dr = 0.1
                
# dff and d_model are overwritten by the loop variables of the same name below;
# num_layers_dec is used as set here.
dff = 128
num_layers_dec = 4
d_model = 128


# Every model built by the loop below is appended to all_models.
all_models = []
for_df = []


def to_num(x):
    """Convert `x` to an int when possible, otherwise to a float.

    Raises whatever `float(x)` raises when `x` is not numeric at all.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures fall back to float; a bare `except`
        # would also swallow KeyboardInterrupt / SystemExit.
        return float(x)

    
def id_str_to_folder(id_str):
    """Make a run identifier usable as a folder name by turning each '.' into '__'."""
    pieces = id_str.split(".")
    return "__".join(pieces)
# NOTE(review): beta is not referenced again in the visible cells.
beta = 1


# Per-field loss weights, keyed by field name.
# NOTE(review): the lines that would attach these to the model are commented
# out in the loop below, so they appear to be unused here -- confirm.
LOSS_WEIGHTS = {'balance': 0.25,
 'td_sc':1.,
 'year': 0.5,
 'month': 0.15,
 'day': 0.25,
 'dow': 0.1,
 'tcode_num': 1.,
 'log_amount_sc': 2.}

lws = [LOSS_WEIGHTS]

# (loss function, display name) pairs tried for the time-delta field; the name
# becomes part of the run identifier.
# td_loss_fns = [(poisson_loss, "poisson_loss"), (expon_loss, "expon_loss"), (loss_mse, "loss_mse")]
td_loss_fns = [(loss_mse, "loss_mse")]

# Hyperparameter grid (currently a single combination). For every combination a
# Transformer is built, registered in all_models and, if a checkpoint exists for
# its identifier, restored from it. No training is performed in this cell.
for i in range(1):
    for dff in [64]:
        for td_loss_fn, name in td_loss_fns:
            for d_model in [64]:
                for num_heads in [4]:
                
                    loss_td = td_loss_fn

                
                    # Timestamp so the printed log shows when each configuration started.
                    print(datetime.datetime.now().strftime("%H:%M"))


                    transformer = Transformer(
                        num_layers_enc=num_layers_enc, num_layers_dec=num_layers_dec,
                        d_model=d_model,
                        num_heads=num_heads,
                        dff=dff,
                        maximum_position_encoding=256,
                       net_info = FIELD_DIMS.items(), 
                        inp_dim = n_feat_inp,
                        final_dim= max(n_feat_tar, n_feat_inp),
                        config=config,
                        rate=dr)
                    
                    optimizer = tf.keras.optimizers.Adam()
                    transformer.optimizer =  optimizer
                    
                    
#                     transformer.loss_function = loss_function
#                     transformer.LOSS_WEIGHTS = LOSS_WEIGHTS

                    # The identifier encodes the hyperparameters; together with nb_id it
                    # determines the checkpoint folder, so it must match the training run.
                    id_str = f"num_layers_dec_{num_layers_dec}-d_model_{d_model}-num_heads_{num_heads}-i_{i}\
-dr_{dr}-dff_{dff}-opt_{opt_name}-l_{name}_lw2"
                    
                    print("Begin running", id_str)
                    transformer.id_str = id_str


                    all_models.append(transformer)
                    transformer.compile()
                    
                    
                    # Checkpoint objects are stored on the model itself.
                    transformer.checkpoint_path = f"./checkpoints/{id_str_to_folder(transformer.id_str)}--{nb_id}"
                    transformer.ckpt = tf.train.Checkpoint(transformer=transformer,
                                               optimizer=optimizer)
                    transformer.ckpt_manager = tf.train.CheckpointManager(transformer.ckpt, 
                                                                          transformer.checkpoint_path, max_to_keep=EARLY_STOP)
                    
                    # Restore the newest checkpoint if there is one; otherwise only report
                    # the problem (the model then keeps its freshly initialised weights).
                    if transformer.ckpt_manager.latest_checkpoint:
                        transformer.ckpt.restore(transformer.ckpt_manager.latest_checkpoint)
                        print('Latest checkpoint restored!!')    
                        continue
                    else:
                        print("Error - could not load", id_str)


20:09
Begin running num_layers_dec_4-d_model_64-num_heads_4-i_0-dr_0.1-dff_64-opt_adam-l_loss_mse_lw2
Latest checkpoint restored!!


### Results

In [13]:
import os
from pathlib import Path

sorted(Path("generation_results/").iterdir(), key=os.path.getmtime)

[PosixPath('generation_results/.ipynb_checkpoints'),
 PosixPath('generation_results/.DS_Store'),
 PosixPath('generation_results/df15_58.csv'),
 PosixPath('generation_results/df18_17.csv'),
 PosixPath('generation_results/df20_04.csv')]

In [14]:
[(x, time.ctime(os.path.getmtime(x))) for x in sorted(Path("generation_results/").iterdir(), key=os.path.getmtime)]

[(PosixPath('generation_results/.ipynb_checkpoints'),
  'Sun May 16 18:28:18 2021'),
 (PosixPath('generation_results/.DS_Store'), 'Mon May 31 09:58:44 2021'),
 (PosixPath('generation_results/df15_58.csv'), 'Fri Jun  4 15:58:09 2021'),
 (PosixPath('generation_results/df18_17.csv'), 'Fri Jun  4 18:17:45 2021'),
 (PosixPath('generation_results/df20_04.csv'), 'Fri Jun  4 20:04:51 2021')]

In [15]:
# Most recently modified results table. Only CSV files are considered, so other
# directory entries (.DS_Store, .ipynb_checkpoints, ...) can never be picked up
# and handed to pd.read_csv.
df_path = sorted(Path("generation_results/").glob("*.csv"), key=os.path.getmtime)[-1]
df_path

PosixPath('generation_results/df20_04.csv')

In [16]:
result_df = pd.read_csv(df_path) #.sort_values(by="val loss")
with pd.option_context('display.max_colwidth', None, "display.max_rows", None, "display.max_columns", None):
    display(result_df)

Unnamed: 0.1,Unnamed: 0,num_layers_dec,d_model,num_heads,i,dr,beta,dff,loss name,val loss,opt name,id_str
0,0,4,64,4,0,0.1,1,64,loss_mse,6.342586,adam,num_layers_dec_4-d_model_64-num_heads_4-i_0-dr_0.1-dff_64-opt_adam-l_loss_mse_lw2


In [17]:
# The best model is the one with the LOWEST validation loss (idxmax would
# select the worst one).
best_ind = result_df["val loss"].idxmin()

In [18]:
transformer = all_models[best_ind]

# Generate

In [19]:
MAX_YEARS_SPAN = 10

## Set up

In [20]:
tf.keras.layers.Dense

tensorflow.python.keras.layers.core.Dense

In [21]:
FIELD_DIMS.items()

dict_items([('td_sc', 1), ('month', 2), ('day', 2), ('dow', 2), ('tcode_num', 61), ('log_amount_sc', 1)])

In [22]:
from my_lib.encoding import encode_time_value
#, decode_time_value

# Reference clock encodings keyed by period length (7 = day of week,
# 31 = day of month, 12 = month): row v holds encode_time_value(v, period).
# Built directly instead of exec()-ing generated source; the progress message
# is kept unchanged.
clocks = {}
for max_val in [7, 31, 12]:
    print("Running", f"clocks[{max_val}] = np.array([encode_time_value(val, {max_val}) for val in range({max_val})])")
    clocks[max_val] = np.array([encode_time_value(val, max_val) for val in range(max_val)])

clocks.keys()

Running clocks[7] = np.array([encode_time_value(val, 7) for val in range(7)])
Running clocks[31] = np.array([encode_time_value(val, 31) for val in range(31)])
Running clocks[12] = np.array([encode_time_value(val, 12) for val in range(12)])


dict_keys([7, 31, 12])

In [23]:
# from scipy.special import factorial
from scipy.stats import norm


#############  Signatures of pdf/pmfs are (pred, real), this is opposite of losses  #############

# def pmf_poisson(l, k):
#     return l**k * np.exp(-l) / factorial(k)


# def expon_pdf(l, x):
#     return l * np.exp(-l*x)


def norm_pdf(mean, x):
    """Density at `x` of a normal distribution with the given mean and unit scale.

    Argument order is (predicted, observed), like the other pdf/pmf helpers here.
    """
    centred = norm(loc=mean)
    return centred.pdf(x)


# Distribution used when scoring candidate values in raw_dates_to_reencoded.
pmf = norm_pdf

In [24]:
START_DATE = data_encoder.START_DATE

# START_DATE may be stored as a "YYYY-MM-DD" string; normalise it to a date.
if isinstance(START_DATE, str):
    START_DATE = datetime.datetime.strptime(START_DATE, "%Y-%m-%d").date()

# The generation window spans MAX_YEARS_SPAN years from START_DATE.
try:
    END_DATE = START_DATE.replace(year=START_DATE.year + MAX_YEARS_SPAN)
except ValueError:
    # START_DATE is Feb 29 and the target year is not a leap year.
    END_DATE = START_DATE.replace(year=START_DATE.year + MAX_YEARS_SPAN, day=28)

# Every calendar day in [START_DATE, END_DATE).
ALL_DATES = [START_DATE + datetime.timedelta(i) for i in range((END_DATE - START_DATE).days)]

# One row per day: (month % 12, day % 31, weekday % 7, day index, year).
# The modulo maps December and the 31st to 0.
AD = np.array([(d.month % 12, d.day % 31, d.weekday() % 7, i, d.year) for i, d in enumerate(ALL_DATES)])

In [25]:
FIELD_STARTS

{'td_sc': 0,
 'month': 1,
 'day': 3,
 'dow': 5,
 'tcode_num': 7,
 'log_amount_sc': 68}

In [26]:
from my_lib.transformer_core import create_masks

    
def reencode_net_prediction(net_name, predictions):
    """Turn the raw output of one field network back into input encoding.

    * 'balance', 'td_sc', 'dss', 'log_amount_sc': returned unchanged.
    * 'month', 'day', 'dow': every 2-component encoding is replaced by the
      nearest reference clock encoding.
    * 'tcode_num': a softmax is taken over axis 2, one code is sampled per
      row from the resulting probabilities, and the samples are returned
      one-hot encoded.

    Raises Exception for any other net_name.
    """
    clock_sizes = {'month': 12, 'day': 31, 'dow': 7}
    n_rows = predictions.shape[0]

    if net_name in ('balance', 'td_sc', 'dss', "log_amount_sc"):
        return predictions

    if net_name in clock_sizes:
        return bulk_nearest_clock_enc(predictions, max_val=clock_sizes[net_name])

    if net_name != "tcode_num":
        raise Exception(f"Got invalid net_name: {net_name}")

    n_codes = ONE_HOT_DIMS["tcode_num"]
    code_ids = np.arange(n_codes)
    probs = tf.nn.softmax(predictions, axis=2).numpy().reshape(-1, data_encoder.n_tcodes)
    # One independent draw per flattened (sequence, step) row.
    sampled = [np.random.choice(code_ids, p=row) for row in probs]
    picked = np.reshape(sampled, newshape=(n_rows, -1))
    return tf.one_hot(picked, depth=n_codes)

# Length of each month, indexed so that 0 = December, 1 = January, ... 11 = November.
# Entry i is the last day of the month preceding month i+1 of 1990 (a non-leap
# year, so February is 28). Used by bulk_decode to resolve a decoded day of 0.
days_per_month = np.array([(datetime.date(1990, month, 1) - datetime.timedelta(1)).day for month in range(1,13)]) # 0 = dec


@np.vectorize
def get_short_name(tcode):
    """Element-wise lookup of `tcode` in the `short_names` mapping."""
    label = short_names[tcode]
    return label

# @np.vectorize
# def get_date_str(yyyy, mm, dd):
#     return f"{yyyy}/{mm:02d}/{dd:02d}"

@np.vectorize
def get_date_str(mm, dd):
    """Element-wise 'MM/DD' string with both parts zero-padded to two digits."""
    return "{:02d}/{:02d}".format(mm, dd)


def bulk_decode(seqs, start_dates, return_single_df=False, return_df_list=False):
    """Decode a batch of encoded sequences into readable transaction fields.

    seqs        -- tensor indexed [sequence, step, feature]. Step 0 is the
                   conditioning row that carries the scaled age in every
                   feature; the remaining steps are encoded transactions.
                   `.numpy()` is called on a slice, so an eager TF tensor is
                   expected.
    start_dates -- one start date per sequence; decoded dates are the start
                   date plus the cumulative days passed.
    return_df_list   -- return a list with one DataFrame per sequence.
    return_single_df -- return one DataFrame with all sequences stacked.

    Without either flag a tuple of arrays is returned, in the order
    (amount, tcode_nums, date_fields, days_passed, age, date, account_id).
    """
    ages = seqs[:, 0, :] * data_encoder.ATTR_SCALE
    seqs = seqs[:, 1:, :]
    # Every feature of the conditioning row must hold the same age. Each adjacent
    # pair is checked: summing the differences telescopes to (last - first) and
    # lets deviations in different rows cancel out.
    assert np.all(np.diff(ages, axis=-1) == 0), f"Bad formating, expected all entries same in each row, got {ages}"

    # Amounts are stored as scaled log10(amount + 1).
    amts = seqs[:, :, FIELD_STARTS["log_amount_sc"]].numpy() * data_encoder.LOG_AMOUNT_SCALE
    amts = 10 ** amts
    amts = np.round(amts - 1.0, 2)

    n_seqs, n_steps = amts.shape
    account_ids = np.repeat(np.arange(n_seqs)[:,None], n_steps, axis=1)

    days_passed = np.round(seqs[:, :, FIELD_STARTS["td_sc"]] *data_encoder.TD_SCALE ).astype(int)

    months = bulk_nearest_clock_ind(seqs[:, :, FIELD_STARTS["month"]: FIELD_STARTS["month"] +2], 12)

    days = bulk_nearest_clock_ind(seqs[:, :, FIELD_STARTS["day"]: FIELD_STARTS["day"] +2], 31)
    # Clock index 0 stands for the last day of the month / for December.
    days[days==0] = days_per_month[months[days==0]]
    months[months==0] = 12 # needs to be done after days (above)
    date_fields = get_date_str(months, days)

    # Absolute dates: start date plus the running total of days passed.
    dpc = np.cumsum(days_passed, axis=1)
    dates = np.array([[start_dates[i] + datetime.timedelta(int(d)) for d in dpc[i]]for i in range(len(start_dates))])

    tcode_inds = np.argmax(seqs[:, :, FIELD_STARTS["tcode_num"]: FIELD_STARTS["tcode_num"] + FIELD_DIMS["tcode_num"]], axis=-1)

    # Round before the integer cast: after the scale / unscale round trip an age
    # may sit just below its integer value and a plain cast would truncate it.
    ages = np.round(np.repeat(ages[:, 0:1], amts.shape[1], axis=1)).astype(int)

    return_vals = amts, tcode_inds, date_fields, days_passed, ages, dates, account_ids
    return_lbls = "amount", "tcode_nums", "date_fields", "days_passed", "age", "date", "account_id"

    if return_df_list:
        return [pd.DataFrame.from_records(zip(*x), columns=return_lbls) for x in zip(*return_vals)]

    if return_single_df:
        return pd.DataFrame.from_records(list(zip(*[x.reshape(-1) for x in return_vals])), columns=return_lbls)

    return return_vals



def nearest_clock_ind(enc, max_val):
    """Index of the reference encoding in clocks[max_val] closest to `enc`.

    Closeness is the squared Euclidean distance, summed over axis 1.
    """
    offsets = clocks[max_val] - enc
    return np.argmin(np.sum(offsets**2, axis=1))


def nearest_clock_enc(enc, max_val):
    """Reference encoding in clocks[max_val] closest to `enc` (squared Euclidean distance)."""
    reference = clocks[max_val]
    squared_dist = np.sum((reference - enc)**2, axis=1)
    best = np.argmin(squared_dist)
    return reference[best]


def bulk_nearest_clock_ind(encs, max_val):
    """Apply nearest_clock_ind to every 2-component encoding in `encs`.

    The result keeps the first axis of `encs`; everything else is flattened
    into the second axis.
    """
    n_rows = encs.shape[0]
    flat_encs = tf.reshape(encs, shape=(-1, 2))
    nearest = [nearest_clock_ind(one_enc, max_val) for one_enc in flat_encs]
    return np.array(nearest).reshape((n_rows, -1))


def bulk_nearest_clock_enc(encs, max_val):
    """Snap every 2-component encoding in `encs` to its nearest reference encoding.

    The result has shape (encs.shape[0], -1, 2).
    """
    n_rows = encs.shape[0]
    flat_encs = tf.reshape(encs, shape=(-1, 2))
    snapped = [nearest_clock_enc(one_enc, max_val) for one_enc in flat_encs]
    return np.array(snapped).reshape((n_rows, -1, 2))

In [27]:
def generate_seqs(length, ages, start_dates, greedy_dates = False, return_single_df=False, return_df_list=False):
    """Autoregressively generate `length` steps for a batch of sequences.

    ages         -- one age per sequence; stored (scaled) in the conditioning row.
    start_dates  -- one start date per sequence.
    greedy_dates -- forwarded to call_to_generate (greedy vs. sampled date decoding).
    return_single_df / return_df_list -- forwarded to bulk_decode; at most one may be set.

    Returns (decoded output of bulk_decode, encoded sequences, list of raw
    predictions per step preceded by the conditioning row).
    Raises Exception when both return flags are set.
    """
    if return_single_df and return_df_list:
        raise Exception("At most one of: 'return_single_df' and 'return_df_list' can be true")

    # Offset in days of each sequence's current date from START_DATE.
    current_date_inds = np.array([(day - START_DATE).days for day in start_dates])

    # Conditioning row: the scaled age repeated across all input features.
    age_column = np.array(ages)[:, None, None]
    output = np.repeat(age_column, repeats=n_feat_inp, axis=2) / data_encoder.ATTR_SCALE

    raw_preds = [output]
    date_info = None

    for _ in range(length):
        combined_mask, dec_padding_mask = create_masks(output)

        # NOTE(review): the `training` argument is passed as True during
        # generation -- confirm this is intended.
        step_result = call_to_generate(transformer, output,
                                       True,
                                       combined_mask,
                                       dec_padding_mask, current_date_inds, date_info,
                                       greedy_dates =greedy_dates)
        _, _, raw_ps, current_date_inds, enc_preds, date_info = step_result

        raw_preds.append(raw_ps)

        # Append the newly generated step to the running sequences.
        new_step = tf.reshape(tf.constant(enc_preds), shape=(-1,1, n_feat_inp))
        output = tf.concat([output, new_step], axis=1)

    decoded = bulk_decode(output, start_dates, return_single_df, return_df_list)
    return decoded, output, raw_preds




## Forward pass through transformer
# 
# Returns: preds, attn_w, raw_preds, updated date inds, encoded last-step preds, pred_date
# the returned preds have multiple timesteps, but we only 
# care about the last (it's the only new one)
def call_to_generate(transformer, tar, training,
           look_ahead_mask, dec_padding_mask, start_inds, prev_date_info=None, greedy_dates = True):
    """Run one generation step and re-encode the prediction for every field.

    transformer     -- model providing `decoder`, `final_layer`, one sub-network
                       attribute per field name, and the field orderings
                       `pre_date_order`, `date_fields`, `post_date_order`.
    tar             -- the sequences generated so far.
    training, look_ahead_mask, dec_padding_mask -- forwarded to the decoder.
    start_inds      -- current date index of every sequence.
    prev_date_info  -- the `pred_date` value returned by the previous step, or None.
    greedy_dates    -- forwarded to raw_dates_to_reencoded as `greedy_decode`.

    Returns a 6-tuple: (re-encoded predictions per field, attention weights,
    raw network outputs per field, start_inds advanced by the chosen number of
    days, the last-step encodings of all fields concatenated and given a step
    axis, the date encodings to pass as `prev_date_info` to the next step).
    """
    

    ### Pass through decoder stack ###
    dec_output, attention_weights = transformer.decoder(
        tar, training, look_ahead_mask, dec_padding_mask)


    final_output = transformer.final_layer(dec_output) 

    
    
    ### Predict each field  ###
    preds = {}
    raw_preds = {}
    encoded_preds = []
    
    
    ## Pre date fields 
    # Each field network sees the decoder output plus the re-encoded predictions
    # of all fields handled before it (appended along the feature axis).
    for net_name in transformer.pre_date_order:  
        
        pred = transformer.__getattribute__(net_name)(final_output)
        raw_preds[net_name] = pred
        
        pred = reencode_net_prediction(net_name, pred) # keeps time step
        preds[net_name] = pred
        
        
        encoded_preds.append(pred[:,-1,:])
        final_output = tf.concat([final_output, pred], axis=2)
        
        
    ## Date fields
    # NOTE(review): date_parts is never used.
    date_parts = {}
    # All date networks read the same input; their raw outputs are only combined below.
    for net_name in transformer.date_fields:  
        
        pred = transformer.__getattribute__(net_name)(final_output)
        raw_preds[net_name] = pred
        
    # Combine info from all predicted date fields (day, month, dow, td)
    # `inds` is the number of days each sequence advances in this step.
    pred_date, inds = raw_dates_to_reencoded(raw_preds, start_inds, greedy_decode =greedy_dates)
    preds["date"] = pred_date

    
    encoded_preds.append(pred_date[:,-1,:])
    
    
    # pred_date covers only the newest step (raw_dates_to_reencoded reads the last
    # time step), while final_output covers every step of `tar`. Prepending the
    # date encodings of the earlier steps makes the step axes match for the
    # concatenation below.
    if not prev_date_info is None:   # For first step may be None, or a starting date
        pred_date = tf.concat([prev_date_info, pred_date], axis=1)
        
        

    final_output = tf.concat([final_output, pred_date], axis=2)  
    
          
    ## Post date fields
    for net_name in transformer.post_date_order:  
#         print(net_name)
        pred = transformer.__getattribute__(net_name)(final_output)
#         print(pred.shape)
        raw_preds[net_name] = pred
        
        pred = reencode_net_prediction(net_name, pred)
        preds[net_name] = pred
        
        encoded_preds.append(pred[:,-1,:])
        final_output = tf.concat([final_output, pred], axis=-1)   
        
    
#     print("start_inds + inds \n", start_inds + inds)
#     print("\n\npred_date\n", pred_date)
#     print("\n"*5)
        
    return preds, attention_weights, raw_preds, start_inds + inds, tf.expand_dims(tf.concat(encoded_preds, axis=1), axis=1), pred_date


In [28]:
# Lower bound applied to the predicted (scaled) time delta before it is used as
# the mean of the days-passed distribution.
PMF_EPS = 1e-6

# Takes raw predictions (info about predicted day, month, dow, and days passed) and start inds
# (indicate the current date for each of the seqs).
# Computes a number of days passed for each based on inputs (either greedily or with sampling).
# Returns the encoding of the new dates (old dates + days passed) and the days passed.
def raw_dates_to_reencoded(raw, start_inds,  max_days = 100, greedy_decode=False):
    """Choose the next date of every sequence and encode it.

    Each of the next `max_days` calendar days (starting at the current date) is
    scored by the product of the month, day and day-of-week probabilities and
    the `pmf` of the number of days passed given the predicted time delta.

    raw           -- dict of raw network outputs; the last time step of
                     "month", "day", "dow" and "td_sc" is used.
    start_inds    -- row index into AD of every sequence's current date.
    max_days      -- number of candidate days considered.
    greedy_decode -- take the best-scoring day instead of sampling in
                     proportion to the scores.

    Returns (tensor with a singleton step axis holding the scaled days passed
    followed by the month, day and day-of-week encodings of the new date, the
    number of days passed for each sequence).
    """
    all_ps = [clock_to_onehot(k, raw[k][:,-1]) for k in ["month", "day", "dow"]]

    timesteps = np.zeros(len(start_inds)).astype(int)

    for i, (month_ps, day_ps, dow_ps, l_pred, si) in enumerate(zip(*all_ps, raw["td_sc"][:,-1].numpy(), start_inds)):

        # Candidate days: rows of AD from the current date onwards.
        window = AD[si:si+max_days]
        ps = month_ps[window[:, 0]]*day_ps[window[:, 1]]*dow_ps[window[:, 2]] * \
                pmf(max(PMF_EPS, l_pred)*data_encoder.TD_SCALE, window[:, 3]-si )

        if greedy_decode:
            timesteps[i] = np.argmax(ps)
        else:
            # Sample among the candidates that actually exist: close to the end
            # of AD the window holds fewer than max_days rows, and drawing from
            # max_days would make np.random.choice fail on the size mismatch.
            timesteps[i] = np.random.choice(len(ps), p=ps/sum(ps))

    inds = start_inds + timesteps

    date_encoding = tf.concat([
        tf.expand_dims(timesteps.astype(np.float32)/ data_encoder.TD_SCALE, axis=1),
        bulk_encode_time_value(AD[inds, 0], 12),
        bulk_encode_time_value(AD[inds, 1], 31),
        bulk_encode_time_value(AD[inds, 2], 7)
    ], axis=1)

    return tf.expand_dims(date_encoding, axis=1), timesteps

## Generate example

In [29]:
AD[[0,3], :]

array([[   6,    1,    0,    0, 2020],
       [   6,    4,    3,    3, 2020]])

In [30]:
data_encoder.n_tcodes

61

In [31]:
# Small example run: two sequences of 25 steps for ages 75 and 25, starting on
# START_DATE and on the following day. Dates are sampled (not greedy) and one
# DataFrame per sequence is returned.
seqs_dfs, seqs, raw = generate_seqs(length= 25, 
                          ages=[75, 25], 
                          start_dates=[START_DATE, START_DATE+datetime.timedelta(days=1)], 
                          greedy_dates=False,
                          return_df_list=True)

In [32]:
np.repeat(np.arange(2)[:,None], 25, axis=1)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]])

In [33]:
seqs_dfs[0]

Unnamed: 0,amount,tcode_nums,date_fields,days_passed,age,date,account_id
0,52.450001,0,06/01,0,75,2020-06-01,0
1,31.790001,0,06/01,0,75,2020-06-01,0
2,48.439999,0,06/04,3,75,2020-06-04,0
3,41.919998,0,06/06,2,75,2020-06-06,0
4,100.400002,13,06/08,2,75,2020-06-08,0
5,34.459999,39,06/10,2,75,2020-06-10,0
6,43.610001,0,06/10,0,75,2020-06-10,0
7,41.220001,0,06/11,1,75,2020-06-11,0
8,58.689999,3,06/13,2,75,2020-06-13,0
9,43.459999,0,06/14,1,75,2020-06-14,0


# Generate Full dataset

In [34]:
transformer.id_str

'num_layers_dec_4-d_model_64-num_heads_4-i_0-dr_0.1-dff_64-opt_adam-l_loss_mse_lw2'

In [35]:
# One start date per generated sequence, drawn uniformly from the 365 days
# beginning at START_DATE.
# NOTE(review): no random seed is set in the visible cells, so this draw (and
# everything generated from it) differs between runs.
start_dates = np.random.choice([START_DATE + datetime.timedelta(i) for i in range(365)], size=n_seqs_to_generate)
start_dates

array([datetime.date(2021, 4, 16), datetime.date(2021, 3, 4),
       datetime.date(2021, 4, 10), ..., datetime.date(2021, 4, 23),
       datetime.date(2021, 4, 20), datetime.date(2021, 4, 26)],
      dtype=object)

In [36]:
# Draw one age per generated sequence according to `age_distribution`
# (age -> frequency), so that a user-supplied distribution is honoured. When
# the distribution was derived from the dataset this matches drawing uniformly
# from the dataset's ages. Frequencies are normalised so that they need not sum to 1.
age_values = np.array(list(age_distribution.keys()))
age_weights = np.array(list(age_distribution.values()), dtype=float)
seq_ages = np.random.choice(age_values, size=n_seqs_to_generate, p=age_weights / age_weights.sum())
seq_ages

array([66., 38., 40., ..., 60., 68., 35.])

In [37]:


# Resolve the output file first and make sure its folder exists, so that a
# missing directory cannot make the write fail after the long generation run.
save_as = f"generated_data/gen_{id_str_to_folder(transformer.id_str)}--{nb_id}-len_{seq_len}.csv"
os.makedirs(os.path.dirname(save_as), exist_ok=True)

# Generate the full dataset as a single DataFrame and time the run.
start = time.time()
full_df, seqs, raw = generate_seqs(length= seq_len, 
                                   ages=seq_ages, 
                                   start_dates= start_dates, 
                                   return_single_df=True )

print(f"took {time.time() - start} secs to generate")

full_df.to_csv(save_as)
print("Wrote df to", save_as)

took 1048.3750441074371 secs to generate
Wrote df to generated_data/gen_num_layers_dec_4-d_model_64-num_heads_4-i_0-dr_0__1-dff_64-opt_adam-l_loss_mse_lw2--vf1-len_80.csv


In [38]:
full_df

Unnamed: 0,amount,tcode_nums,date_fields,days_passed,age,date,account_id
0,145.759995,4,04/18,2,66,2021-04-18,0
1,115.199997,12,04/18,0,66,2021-04-18,0
2,90.959999,12,04/18,0,66,2021-04-18,0
3,49.650002,0,04/20,2,66,2021-04-20,0
4,58.180000,5,04/22,2,66,2021-04-22,0
...,...,...,...,...,...,...,...
324315,39.910000,16,07/12,1,35,2021-07-12,4053
324316,111.900002,12,07/12,0,35,2021-07-12,4053
324317,40.860001,0,07/12,0,35,2021-07-12,4053
324318,35.700001,0,07/14,2,35,2021-07-14,4053
