In [1]:
# Import the project-local ServingColumns module and immediately reload it so
# that the kernel re-executes the current ServingColumns.py instead of reusing
# a copy cached by an earlier import in the same session.
import importlib
import ServingColumns
# Left as the bare last expression on purpose: the notebook then displays the
# reloaded module object (including the file it was loaded from).
importlib.reload(ServingColumns)

<module 'ServingColumns' from '/home/nakyung/projects/BDAIFin/OFFLINE/transaction/ServingColumns.py'>

In [2]:
from pathlib import Path
import pandas as pd

from ServingColumns import fit_stage1_artifacts, build_stage1_dataset, save_json


# --- Stage-1 I/O locations (relative to the notebook's working directory) ---

# Raw train/test inputs that are read with pd.read_parquet.
TRAIN_IN = Path("../../DATA/dataset/train_stage1")
TEST_IN  = Path("../../DATA/dataset/test_stage1")

# JSON file that receives the artifacts fitted on the training split.
ART_PATH = Path("../../DATA/artifacts/stage1_artifacts.json")

# Destinations for the built stage-1 datasets.
# CAUTION: these differ from the inputs above only by letter case. On a
# case-insensitive filesystem they name the same entry, so writing the outputs
# would collide with (or clobber) the raw inputs. The guard below fails fast
# in that situation instead of discovering it after the expensive build.
TRAIN_OUT = Path("../../DATA/dataset/TRAIN_stage1")
TEST_OUT  = Path("../../DATA/dataset/TEST_stage1")


def ensure_distinct_paths(src, dst):
    """Raise ``ValueError`` when ``src`` and ``dst`` are the same filesystem entry.

    Parameters
    ----------
    src, dst : str or os.PathLike
        Input and output locations to compare.

    Raises
    ------
    ValueError
        If both paths exist and refer to the same file or directory.

    Notes
    -----
    If either path does not exist yet there is nothing to compare, so the
    call returns silently (``Path.samefile`` would raise on a missing path).
    """
    src, dst = Path(src), Path(dst)
    if src.exists() and dst.exists() and src.samefile(dst):
        raise ValueError(
            f"output path {dst} resolves to the same location as input path {src}; "
            "refusing to overwrite the raw input"
        )


for _src, _dst in ((TRAIN_IN, TRAIN_OUT), (TEST_IN, TEST_OUT)):
    ensure_distinct_paths(_src, _dst)

# Echo the absolute locations so a wrong working directory is obvious at once.
print("TRAIN_IN:", TRAIN_IN.resolve())
print("TEST_IN :", TEST_IN.resolve())
print("ART_PATH:", ART_PATH.resolve())
print("TRAIN_OUT:", TRAIN_OUT.resolve())
print("TEST_OUT :", TEST_OUT.resolve())


TRAIN_IN: /home/nakyung/projects/BDAIFin/DATA/dataset/train_stage1
TEST_IN : /home/nakyung/projects/BDAIFin/DATA/dataset/test_stage1
ART_PATH: /home/nakyung/projects/BDAIFin/DATA/artifacts/stage1_artifacts.json
TRAIN_OUT: /home/nakyung/projects/BDAIFin/DATA/dataset/TRAIN_stage1
TEST_OUT : /home/nakyung/projects/BDAIFin/DATA/dataset/TEST_stage1


In [3]:
# Read the raw stage-1 inputs from parquet: the train split first, then test.
df_train_raw, df_test_raw = (pd.read_parquet(src) for src in (TRAIN_IN, TEST_IN))

# Show (rows, columns) of each frame as a quick sanity check on the load.
print("train raw shape:", df_train_raw.shape)
print("test  raw shape:", df_test_raw.shape)


train raw shape: (5332979, 23)
test  raw shape: (932762, 23)


In [4]:
# Fit the stage-1 artifacts using the training split only.
artifacts = fit_stage1_artifacts(df_train_raw)

# Create the artifact directory if needed, then persist the artifacts as JSON.
ART_PATH.parent.mkdir(parents=True, exist_ok=True)
save_json(artifacts, str(ART_PATH))

# Summarise what was fitted; highrisk_mcc is reported as a count, not a list.
artifact_summary = (
    ("saved artifacts:", ART_PATH),
    ("base_rate:", artifacts.get("base_rate")),
    ("high_amount_thr:", artifacts.get("high_amount_thr")),
    ("highrisk_mcc:", len(artifacts.get("highrisk_mcc", []))),
    ("high_risk_days:", artifacts.get("high_risk_days")),
)
for label, value in artifact_summary:
    print(label, value)


saved artifacts: ../../DATA/artifacts/stage1_artifacts.json
base_rate: 0.0014481587120444314
high_amount_thr: 4.706281661987305
highrisk_mcc: 44
high_risk_days: [0, 4, 6]


In [5]:

# Build the stage-1 datasets for both splits from the same fitted artifacts.
df_train_s1 = build_stage1_dataset(df_train_raw, artifacts)
df_test_s1  = build_stage1_dataset(df_test_raw, artifacts)

# Both splits must expose the same columns in the same order.
train_cols = df_train_s1.columns.tolist()
test_cols = df_test_s1.columns.tolist()
assert train_cols == test_cols, "train/test stage1 columns mismatch!"

for split_label, frame in (
    ("train s1 shape:", df_train_s1),
    ("test  s1 shape:", df_test_s1),
):
    print(split_label, frame.shape)
print("stage1 cols:", train_cols)



  (amt_cumsum.to_numpy() / cnt_past.to_numpy()),
  (card_tx_1h_cumsum.to_numpy() / card_tx_cnt_past.to_numpy()),
  (amt_cumsum.to_numpy() / cnt_past.to_numpy()),
  (card_tx_1h_cumsum.to_numpy() / card_tx_cnt_past.to_numpy()),


train s1 shape: (5332979, 27)
test  s1 shape: (932762, 27)
stage1 cols: ['id', 'fraud', 'log_abs_amount', 'high_amount', 'amount_vs_client_avg_diff', 'amount_deviation', 'has_error', 'err_bad_cvv', 'err_bad_card_number', 'err_bad_expiration', 'card_error_last1', 'client_error_last1', 'card_fraud_last1', 'client_fraud_last1', 'card_fraud_last3', 'tx_hour', 'tx_month', 'hour_cos', 'is_highrisk_weekday', 'seconds_since_prev_tx', 'card_velocity_spike_ratio', 'mcc_highrisk_90', 'card_mcc_is_new', 'client_mcc_is_new', 'card_merchant_is_new', 'client_merchant_is_new', 'merchant_is_new_x_has_error']


In [6]:

# Ensure the parent directory of each output exists before anything is written.
for out_path in (TRAIN_OUT, TEST_OUT):
    out_path.parent.mkdir(parents=True, exist_ok=True)

train_out_file, test_out_file = TRAIN_OUT, TEST_OUT

# Persist both stage-1 frames as parquet, dropping the pandas index.
df_train_s1.to_parquet(train_out_file, index=False)
df_test_s1.to_parquet(test_out_file, index=False)

# In-memory footprint of each frame in MiB; this is not the on-disk file size.
mem_train, mem_test = (
    frame.memory_usage(deep=True).sum() / 1024**2
    for frame in (df_train_s1, df_test_s1)
)

for saved_to, mem_mb in ((train_out_file, mem_train), (test_out_file, mem_test)):
    print("saved:", saved_to, "| mem(MB):", round(mem_mb, 2))


saved: ../../DATA/dataset/TRAIN_stage1 | mem(MB): 264.47
saved: ../../DATA/dataset/TEST_stage1 | mem(MB): 46.26
