# Intro

This notebook prepares the original transaction dataframe for building a dataset of shape (n_samples, max_seq_len, feats_per_step).

This is the default version.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os

import pickle
from datetime import date

### Setup

In [2]:
# Dataset tag; it is interpolated into the name of the CSV written at the end
# of the notebook.
ds_suffix = "-czech"

# Sequence-length bounds. max_seq_len is the second dimension of the target
# dataset shape (n_samples, max_seq_len, feats_per_step).
# NOTE(review): min_seq_len is not used in the cells shown here; presumably it
# is the minimum number of steps a sequence must have — confirm with its consumer.
min_seq_len, max_seq_len = 20, 80

# Process

In [3]:
# Load the transaction table; per the rendered output it carries account_id,
# date (yymmdd integer), amount, balance, age and tcode columns, among others.
source_csv = "data/tr_by_acct_w_age.csv"
df = pd.read_csv(source_csv)
df

Unnamed: 0.1,Unnamed: 0,column_a,account_id,date,type,operation,amount,balance,k_symbol,age,tcode
0,0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,,29,cash_cr
1,1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,,29,bank_cr
2,2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,,29,cash_cr
3,3,162681,1,950430,CREDIT,,19.2,17298.2,INTEREST CREDITED,29,interest_cr
4,4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,,29,bank_cr
...,...,...,...,...,...,...,...,...,...,...,...
1056315,1056315,1033141,11382,981202,DEBIT,CASH WITHDRAWAL,25600.0,41114.4,,46,cash_db_nan
1056316,1056316,1040574,11382,981210,CREDIT,COLLECTION FROM ANOTHER BANK,46248.0,87362.4,,46,bank_cr
1056317,1056317,1050362,11382,981225,DEBIT,CASH WITHDRAWAL,6300.0,81062.4,,46,cash_db_nan
1056318,1056318,1053037,11382,981231,CREDIT,,311.3,81373.6,INTEREST CREDITED,46,interest_cr


##### Sort by account and date

In [4]:
# Order rows by account first and by date within each account, so that
# consecutive rows of one account are consecutive transactions.
sort_keys = ["account_id", "date"]
df = df.sort_values(sort_keys)

##### Date

In [5]:
from datetime import datetime


def czech_date_parser(x):
    """Parse a single ``yymmdd`` value (e.g. ``950324``) into a ``datetime``."""
    return datetime.strptime(str(x), "%y%m%d")


# Parse the whole column in one vectorized call instead of invoking strptime
# once per row; the format is the same "%y%m%d" used by czech_date_parser.
df["datetime"] = pd.to_datetime(df["date"].astype(str), format="%y%m%d")

# Calendar features derived from the parsed timestamp.
df["month"] = df["datetime"].dt.month
df["day"] = df["datetime"].dt.day
df["dow"] = df["datetime"].dt.dayofweek  # Monday=0 ... Sunday=6
df["year"] = df["datetime"].dt.year
df

Unnamed: 0.1,Unnamed: 0,column_a,account_id,date,type,operation,amount,balance,k_symbol,age,tcode,datetime,month,day,dow,year
0,0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,,29,cash_cr,1995-03-24,3,24,4,1995
1,1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,,29,bank_cr,1995-04-13,4,13,3,1995
2,2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,,29,cash_cr,1995-04-23,4,23,6,1995
3,3,162681,1,950430,CREDIT,,19.2,17298.2,INTEREST CREDITED,29,interest_cr,1995-04-30,4,30,6,1995
4,4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,,29,bank_cr,1995-05-13,5,13,5,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,1056315,1033141,11382,981202,DEBIT,CASH WITHDRAWAL,25600.0,41114.4,,46,cash_db_nan,1998-12-02,12,2,2,1998
1056316,1056316,1040574,11382,981210,CREDIT,COLLECTION FROM ANOTHER BANK,46248.0,87362.4,,46,bank_cr,1998-12-10,12,10,3,1998
1056317,1056317,1050362,11382,981225,DEBIT,CASH WITHDRAWAL,6300.0,81062.4,,46,cash_db_nan,1998-12-25,12,25,4,1998
1056318,1056318,1053037,11382,981231,CREDIT,,311.3,81373.6,INTEREST CREDITED,46,interest_cr,1998-12-31,12,31,3,1998


##### Time delta

In [6]:
# Days elapsed since the previous transaction of the same account.
# The first transaction of each account has no predecessor (diff yields NaT),
# so its delta is set to 0.0.
# The result is assigned back explicitly rather than filled in place on a
# column selection, which does not update the frame under pandas Copy-on-Write.
df["td"] = (
    df.groupby("account_id")["datetime"]
    .diff()
    .dt.days
    .fillna(0.0)
)
df

Unnamed: 0.1,Unnamed: 0,column_a,account_id,date,type,operation,amount,balance,k_symbol,age,tcode,datetime,month,day,dow,year,td
0,0,149432,1,950324,CREDIT,CREDIT IN CASH,1000.0,1000.0,,29,cash_cr,1995-03-24,3,24,4,1995,0.0
1,1,157404,1,950413,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,4679.0,,29,bank_cr,1995-04-13,4,13,3,1995,20.0
2,2,158832,1,950423,CREDIT,CREDIT IN CASH,12600.0,17279.0,,29,cash_cr,1995-04-23,4,23,6,1995,10.0
3,3,162681,1,950430,CREDIT,,19.2,17298.2,INTEREST CREDITED,29,interest_cr,1995-04-30,4,30,6,1995,7.0
4,4,167083,1,950513,CREDIT,COLLECTION FROM ANOTHER BANK,3679.0,20977.2,,29,bank_cr,1995-05-13,5,13,5,1995,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,1056315,1033141,11382,981202,DEBIT,CASH WITHDRAWAL,25600.0,41114.4,,46,cash_db_nan,1998-12-02,12,2,2,1998,2.0
1056316,1056316,1040574,11382,981210,CREDIT,COLLECTION FROM ANOTHER BANK,46248.0,87362.4,,46,bank_cr,1998-12-10,12,10,3,1998,8.0
1056317,1056317,1050362,11382,981225,DEBIT,CASH WITHDRAWAL,6300.0,81062.4,,46,cash_db_nan,1998-12-25,12,25,4,1998,15.0
1056318,1056318,1053037,11382,981231,CREDIT,,311.3,81373.6,INTEREST CREDITED,46,interest_cr,1998-12-31,12,31,3,1998,6.0


# Write

In [7]:
# Directory layout expected by the project. 'data' holds the input CSV and
# 'stored_data' receives the processed table written below.
folders = [
    "generated_data",
    "generated_data/parts",
    "stored_data",
    "models",
    "models/checkpoints",
    "models/info",
    "generation_results",
    "data",
    "my_lib",
]

# makedirs creates missing parents and, with exist_ok=True, is a no-op for
# directories that already exist, so the list order does not matter and there
# is no check-then-create race.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

In [8]:
# Persist the processed table without the index column.
# NOTE(review): with ds_suffix = "-czech" this resolves to
# "stored_data/final_df--czech.csv" (double dash) — confirm that downstream
# readers expect that exact name before changing it.
output_csv = f"stored_data/final_df-{ds_suffix}.csv"
df.to_csv(output_csv, index=False)