# Imports


In [8]:
%pip install google-cloud-storage



In [9]:
from google.colab import auth
auth.authenticate_user()
from google.cloud import storage
import os
import subprocess

In [10]:
import argparse
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import json
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm.notebook import tqdm

# Dataset




1.   **generated dataset**

- step
- action
- amount
- nameOrig
- oldBalanceOrig
- newBalanceOrig [DROP]
- nameDest
- oldBalanceDest [DROP]
- newBalanceDest [DROP]
- isFraud [DROP]
- isFlaggedFraud [DROP]
- isUnauthorizedOverdraft [DROP]


---


2.   **https://github.com/spendcastai/bernhackt-2025**

- transaction_id
- transaction_name
- amount
- date
- type
- status
- card_number
- category

---



Maybe consider as well https://www.kaggle.com/datasets/priyamchoksi/credit-card-transactions-dataset

# Hyperparameters

In [35]:
SEQUENCE_LENGTH = 50
FORECAST_HORIZON = 10
BATCH_SIZE = 2048

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_FEATURES = 14  # 2 categorical IDs + 12 numerical features
OUTPUT_FEATURES = 3 # amount, category_id, merchant_id
EMBEDDING_DIM = 128
HIDDEN_DIM = 512
NUM_LAYERS = 4
DROPOUT_PROB = 0.2
LEARNING_RATE = 0.003
NUM_EPOCHS = 10

#  Data-loading

In [12]:
def process_raw_data(df: pd.DataFrame):
    is_income = df['action'].str.contains("INCOME", case=False)

    df['amount'] = df['amount'] * is_income.replace({True: 1, False: -1})
    df['user_id'], user_vocab = pd.factorize(df['nameOrig'])
    df['category_id'], cat_vocab = pd.factorize(df['action'])
    df['merchant_id'], merch_vocab = pd.factorize(df['nameDest'])
    df = df.rename(columns={"oldBalanceOrig": "balance_before"})
    df = df.sort_values(by=['user_id', 'step'])
    df['time_delta'] = df.groupby('user_id')['step'].diff().fillna(0)
    df['time_delta_category'] = df.groupby(['user_id', 'category_id'])['step'].diff().fillna(0)
    df['time_delta_merchant'] = df.groupby(['user_id', 'merchant_id'])['step'].diff().fillna(0)
    expanding_mean = df.groupby(['user_id', 'merchant_id'])['amount'].expanding().mean()
    df['avg_amount_merchant'] = expanding_mean.reset_index(level=[0,1], drop=True)
    df['avg_amount_merchant'] = df.groupby(['user_id', 'merchant_id'])['avg_amount_merchant'].shift(1).fillna(0)

    df['datetime'] = pd.to_datetime(df['step'], unit='D', origin='2024-01-01')
    df['day_of_week'] = df['datetime'].dt.dayofweek
    df['month_of_year'] = df['datetime'].dt.month
    df['day_of_month'] = df['datetime'].dt.day
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_month_sin'] = np.sin(2 * np.pi * df['day_of_month'] / 31)
    df['day_of_month_cos'] = np.cos(2 * np.pi * df['day_of_month'] / 31)
    df['month_of_year_sin'] = np.sin(2 * np.pi * df['month_of_year'] / 12)
    df['month_of_year_cos'] = np.cos(2 * np.pi * df['month_of_year'] / 12)

    numerical_features = [
        'amount', 'balance_before', 'time_delta',
        'time_delta_category', 'time_delta_merchant', 'avg_amount_merchant',
        'day_of_week_sin', 'day_of_week_cos',
        'day_of_month_sin', 'day_of_month_cos',
        'month_of_year_sin', 'month_of_year_cos'
    ]

    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    features_to_keep = [
        'user_id', 'category_id', 'merchant_id'
    ] + numerical_features

    vocab_mappings = {
        'categories': list(cat_vocab),
        'merchants': list(merch_vocab)
    }

    return df[features_to_keep], vocab_mappings, scaler

In [13]:
def create_sequences(df: pd.DataFrame, sequence_length: int, forecast_horizon: int):
    input_features = [
        'amount', 'balance_before', 'category_id', 'merchant_id', 'time_delta',
        'time_delta_category', 'time_delta_merchant', 'avg_amount_merchant',
        'day_of_week_sin', 'day_of_week_cos',
        'day_of_month_sin', 'day_of_month_cos',
        'month_of_year_sin', 'month_of_year_cos'
    ]

    target_features = ['amount', 'category_id', 'merchant_id', 'time_delta']

    all_sequences_X = []
    all_sequences_y = []

    total_users = df['user_id'].nunique()
    for i, user_id in enumerate(df['user_id'].unique()):
        user_df = df[df['user_id'] == user_id]

        if len(user_df) < sequence_length + forecast_horizon:
            continue

        user_X_data = user_df[input_features].values
        user_y_data = user_df[target_features].values

        for j in range(len(user_df) - sequence_length - forecast_horizon + 1):
            start_idx = j
            mid_idx = j + sequence_length
            end_idx = mid_idx + forecast_horizon

            all_sequences_X.append(user_X_data[start_idx:mid_idx])
            all_sequences_y.append(user_y_data[mid_idx:end_idx])

    return all_sequences_X, all_sequences_y

In [14]:
bucket_name = 'bernhackt'
source_data_path = f'gs://{bucket_name}/t2.csv'
local_source_path = Path('t2.csv')
processed_data_path = f'gs://{bucket_name}/processed_data.zip'
local_zip_path = Path('processed_data.zip')
local_data_dir = Path('processed_data')

sequences_X_path = local_data_dir / 'sequences_X.npy'
sequences_y_path = local_data_dir / 'sequences_y.npy'
vocab_path = local_data_dir / 'vocab.json'
scaler_path = local_data_dir / 'scaler.pkl'

gcs_check_command = f"gcloud storage ls {processed_data_path}"
result = subprocess.run(gcs_check_command, shell=True, capture_output=True)

if result.returncode == 0:
    # Data exists
    print(f"✅ Found 'processed_data.zip' in GCS bucket '{bucket_name}'.")
    print("Downloading to the Colab environment...")
    !gcloud storage cp {processed_data_path} {local_zip_path}

    print("Unzipping data...")
    !unzip -qo {local_zip_path}
    print("Data is ready.")

else:
    # Data does not exists
    print(f"❌ 'processed_data.zip' not found in GCS bucket '{bucket_name}'.")
    print("Processing raw data from scratch...")

    !gcloud storage cp {source_data_path} {local_source_path}

    raw_df = pd.read_csv(local_source_path)
    processed_df, vocab_mappings, scaler = process_raw_data(raw_df)

    print("Creating sequences...")
    sequences_X_list, sequences_y_list = create_sequences(processed_df, SEQUENCE_LENGTH, FORECAST_HORIZON)

    print("Converting sequences to contiguous NumPy arrays. This may take a moment...")
    sequences_X = np.array(sequences_X_list, dtype=np.float32)
    sequences_y = np.array(sequences_y_list, dtype=np.float32)

    del sequences_X_list, sequences_y_list, processed_df, raw_df

    local_data_dir.mkdir(parents=True, exist_ok=True)

    print("Saving correctly formatted .npy files to the local Colab environment...")
    np.save(sequences_X_path, sequences_X)
    np.save(sequences_y_path, sequences_y)
    with open(vocab_path, 'w') as f:
        json.dump(vocab_mappings, f, indent=4)
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)

    del sequences_X, sequences_y

    print(f"Zipping the '{local_data_dir}' directory...")
    !zip -r {local_zip_path} {local_data_dir}

    print(f"Uploading '{local_zip_path}' to GCS for backup...")
    !gcloud storage cp {local_zip_path} {processed_data_path}
    print("Upload complete. Data is ready.")

✅ Found 'processed_data.zip' in GCS bucket 'bernhackt'.
Downloading to the Colab environment...
Copying gs://bernhackt/processed_data.zip to file://processed_data.zip

Average throughput: 76.3MiB/s
Unzipping data...
Data is ready.


In [15]:
with open(vocab_path, 'r') as f:
    vocab_mappings = json.load(f)

with open(scaler_path, 'rb') as f:
    scaler = pickle.load(f)

vocab_sizes = {
    'categories': len(vocab_mappings['categories']),
    'merchants': len(vocab_mappings['merchants'])
}

print("Vocab mappings and scaler loaded into memory.")

Vocab mappings and scaler loaded into memory.


In [36]:
class Dataset(Dataset):
    def __init__(self, x_path, y_path, vocab_sizes, indices):
        self.sequences_X = np.load(x_path, mmap_mode='r')
        self.sequences_y = np.load(y_path, mmap_mode='r')

        self.indices = indices

        self.vocab_size_cat = vocab_sizes['categories']
        self.vocab_size_merch = vocab_sizes['merchants']

        assert len(self.sequences_X) == len(self.sequences_y), "X and y sequences must have the same length."

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        x_item = np.copy(self.sequences_X[self.indices[idx]])
        y_item = np.copy(self.sequences_y[self.indices[idx]])

        category_id_in = x_item[:, 2]
        merchant_id_in = x_item[:, 3]

        if np.any(category_id_in >= self.vocab_size_cat) or np.any(merchant_id_in >= self.vocab_size_merch):
            raise IndexError(f"Data at index {self.indices[idx]} contains an out-of-bounds category or merchant ID.")

        return torch.from_numpy(x_item).float(), torch.from_numpy(y_item).float()

In [38]:
num_samples = np.load(sequences_X_path, mmap_mode='r').shape[0]
all_indices = np.arange(num_samples)
train_indices, val_indices = train_test_split(
    all_indices,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print(f"Training indices: {len(train_indices)}")
print(f"Validation indices: {len(val_indices)}")

train_dataset = Dataset(sequences_X_path, sequences_y_path, vocab_sizes, train_indices)
val_dataset = Dataset(sequences_X_path, sequences_y_path, vocab_sizes, val_indices)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

print(f"\nNumber of training batches: {len(train_dataloader)}")
print(f"Number of validation batches: {len(val_dataloader)}")

Training indices: 6020639
Validation indices: 1505160

Number of training batches: 2940
Number of validation batches: 735


# Pre-training

Architecture (LSTM)


In [19]:
class Encoder(nn.Module):
    def __init__(self, vocab_size_cat, vocab_size_merch, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.category_embedding = nn.Embedding(vocab_size_cat, embedding_dim)
        self.merchant_embedding = nn.Embedding(vocab_size_merch, embedding_dim)

        self.lstm = nn.LSTM(
            input_size=12 + (embedding_dim * 2),
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )

    def forward(self, x):
        numerical_feats = x[:, :, [0,1,4,5,6,7,8,9,10,11,12,13]]
        cat_ids = x[:, :, 2].long()
        merch_ids = x[:, :, 3].long()

        cat_embeds = self.category_embedding(cat_ids)
        merch_embeds = self.merchant_embedding(merch_ids)

        lstm_input = torch.cat((numerical_feats, cat_embeds, merch_embeds), dim=2)

        _, (hidden, cell) = self.lstm(lstm_input)
        return hidden, cell

In [20]:
class Decoder(nn.Module):
    def __init__(self, vocab_size_cat, vocab_size_merch, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.vocab_size_cat = vocab_size_cat
        self.vocab_size_merch = vocab_size_merch


        self.category_embedding = nn.Embedding(vocab_size_cat, embedding_dim)
        self.merchant_embedding = nn.Embedding(vocab_size_merch, embedding_dim)

        self.lstm = nn.LSTM(
            input_size=1 + (embedding_dim * 2),
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )

        self.fc_amount = nn.Linear(hidden_dim, 1)
        self.fc_category = nn.Linear(hidden_dim, vocab_size_cat)
        self.fc_merchant = nn.Linear(hidden_dim, vocab_size_merch)

    def forward(self, x_t, hidden, cell):
        x_t = x_t.unsqueeze(1)

        numerical_feat = x_t[:, :, 0].unsqueeze(-1)
        cat_ids = x_t[:, :, 1].long()
        merch_ids = x_t[:, :, 2].long()

        cat_embeds = self.category_embedding(cat_ids)
        merch_embeds = self.merchant_embedding(merch_ids)

        lstm_input = torch.cat((numerical_feat, cat_embeds, merch_embeds), dim=2)

        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))

        pred_amount = self.fc_amount(output.squeeze(1))
        pred_category = self.fc_category(output.squeeze(1))
        pred_merchant = self.fc_merchant(output.squeeze(1))

        return pred_amount, pred_category, pred_merchant, hidden, cell

In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        forecast_horizon = trg.shape[1]

        outputs_amount = torch.zeros(batch_size, forecast_horizon, 1).to(self.device)
        outputs_category = torch.zeros(batch_size, forecast_horizon, self.decoder.vocab_size_cat).to(self.device)
        outputs_merchant = torch.zeros(batch_size, forecast_horizon, self.decoder.vocab_size_merch).to(self.device)

        hidden, cell = self.encoder(src)

        decoder_input = trg[:, 0, :]

        for t in range(forecast_horizon):
            pred_amount, pred_category, pred_merchant, hidden, cell = self.decoder(decoder_input, hidden, cell)

            outputs_amount[:, t, :] = pred_amount
            outputs_category[:, t, :] = pred_category
            outputs_merchant[:, t, :] = pred_merchant

            teacher_force = random.random() < teacher_forcing_ratio

            if teacher_force:
                decoder_input = trg[:, t, :]
            else:
                top_category_id = pred_category.argmax(1).long()
                top_merchant_id = pred_merchant.argmax(1).long()

                pred_cat_embeds = self.decoder.category_embedding(top_category_id)
                pred_merch_embeds = self.decoder.merchant_embedding(top_merchant_id)

                decoder_input = torch.cat((pred_amount, top_category_id.unsqueeze(1).float(), top_merchant_id.unsqueeze(1).float()), dim=1)


        return outputs_amount, outputs_category, outputs_merchant

    def predict(self, src, forecast_horizon):
        self.eval()
        batch_size = src.shape[0]

        outputs_amount = torch.zeros(batch_size, forecast_horizon, 1).to(self.device)
        outputs_category = torch.zeros(batch_size, forecast_horizon, self.decoder.vocab_size_cat).to(self.device)
        outputs_merchant = torch.zeros(batch_size, forecast_horizon, self.decoder.vocab_size_merch).to(self.device)

        with torch.no_grad():
            hidden, cell = self.encoder(src)

            last_known_amount = src[:, -1, 0].unsqueeze(1)
            last_known_cat_id = src[:, -1, 2]
            last_known_merch_id = src[:, -1, 3]
            decoder_input = torch.cat([last_known_amount, last_known_cat_id.unsqueeze(1), last_known_merch_id.unsqueeze(1)], dim=1)

            for t in range(forecast_horizon):
                pred_amount, pred_category, pred_merchant, hidden, cell = self.decoder(decoder_input, hidden, cell)

                outputs_amount[:, t, :] = pred_amount
                outputs_category[:, t, :] = pred_category
                outputs_merchant[:, t, :] = pred_merchant

                # Create the input for the next time step from the current prediction
                top_category_id = pred_category.argmax(1).float()
                top_merchant_id = pred_merchant.argmax(1).float()
                decoder_input = torch.cat((pred_amount, top_category_id.unsqueeze(1), top_merchant_id.unsqueeze(1)), dim=1)

        return outputs_amount, outputs_category, outputs_merchant

In [58]:
def train_model(model, dataloader, optimizer, criterion_h, criterion_ce, device):
    model.train()
    epoch_loss = 0


    for i, (X_batch, y_batch) in enumerate(dataloader):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        pred_amount, pred_category, pred_merchant = model(X_batch, y_batch)

        loss_amount = criterion_h(pred_amount.squeeze(-1), y_batch[:, :, 0])
        loss_category = criterion_ce(
            pred_category.view(-1, pred_category.shape[-1]),
            y_batch[:, :, 1].view(-1).long()
        )
        loss_merchant = criterion_ce(
            pred_merchant.view(-1, pred_merchant.shape[-1]),
            y_batch[:, :, 2].view(-1).long()
        )

        total_loss = loss_amount * 0.0 + loss_category * 0.5 + loss_merchant * 0.4

        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
        optimizer.step()

        epoch_loss += total_loss.item()

        print(f"\rEpoch {epoch+1}/{NUM_EPOCHS} | Batch {int((i+1)/len(dataloader)*100)}% | Loss: {total_loss.item():.4f}", end="")

    avg_loss = epoch_loss / len(dataloader)

    return avg_loss

In [23]:
def save_model(model_state_dict, epoch, bucket_name):
    local_save_dir = Path('model')
    local_save_dir.mkdir(parents=True, exist_ok=True)
    local_model_path = local_save_dir / f'model_epoch_{epoch+1}.pth'
    torch.save(model_state_dict, local_model_path)
    print(f"Model saved locally to '{local_model_path}'")

    gcs_model_path = f'gs://{bucket_name}/models/model_epoch_{epoch+1}.pth'
    print(f"Uploading model to GCS: {gcs_model_path}")
    !gcloud storage cp {local_model_path} {gcs_model_path}
    print("Upload complete.")

In [59]:
def validate_model(model, dataloader, criterion_h, criterion_ce, device):
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch, y_batch, teacher_forcing_ratio=0)

            amount_pred, category_pred, merchant_pred = outputs
            amount_true, category_true, merchant_true = y_batch[:, :, 0], y_batch[:, :, 1].long(), y_batch[:, :, 2].long()

            loss_h = criterion_h(amount_pred.squeeze(-1), amount_true)
            loss_cat = criterion_ce(category_pred.view(-1, VOCAB_SIZE_CAT), category_true.view(-1))
            loss_merch = criterion_ce(merchant_pred.view(-1, VOCAB_SIZE_MERCH), merchant_true.view(-1))

            combined_loss = loss_h *0.0 + loss_cat * 0.5 + loss_merch *0.4
            total_loss += combined_loss.item()

    return total_loss / len(dataloader)

In [41]:
VOCAB_SIZE_CAT = vocab_sizes['categories']
VOCAB_SIZE_MERCH = vocab_sizes['merchants']

encoder = Encoder(VOCAB_SIZE_CAT, VOCAB_SIZE_MERCH, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT_PROB)
decoder = Decoder(VOCAB_SIZE_CAT, VOCAB_SIZE_MERCH, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT_PROB)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=2,
    threshold=0.01
)

criterion_h = nn.HuberLoss()
criterion_ce = nn.CrossEntropyLoss()

best_val_loss = float('inf')
epochs_no_improve = 0
patience = 3

print(f"\n--- Starting Training for {NUM_EPOCHS} Epochs ---")
for epoch in range(NUM_EPOCHS):
    avg_loss = train_model(model, train_dataloader, optimizer, criterion_h, criterion_ce, DEVICE)
    avg_val_loss = validate_model(model, val_dataloader, criterion_h, criterion_ce, DEVICE)
    scheduler.step(avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} -> Average Validation Loss: {avg_val_loss:.4f}")
    save_model(model.state_dict(), epoch, bucket_name)

    if epochs_no_improve >= patience:
        print(f"\n--- Early stopping triggered after {epoch+1} epochs. ---")
        break

print(f"Loading best model from epoch with validation loss: {best_val_loss:.4f}")



--- Starting Training for 10 Epochs ---
Epoch 1/10 | Batch 100% | Loss: 5.1980Epoch 1/10 -> Average Validation Loss: 5.4603
Model saved locally to 'model/model_epoch_1.pth'
Uploading model to GCS: gs://bernhackt/models/model_epoch_1.pth
Copying file://model/model_epoch_1.pth to gs://bernhackt/models/model_epoch_1.pth

Average throughput: 12.8MiB/s
Upload complete.
Epoch 2/10 | Batch 100% | Loss: 4.7094Epoch 2/10 -> Average Validation Loss: 5.2650
Model saved locally to 'model/model_epoch_2.pth'
Uploading model to GCS: gs://bernhackt/models/model_epoch_2.pth
Copying file://model/model_epoch_2.pth to gs://bernhackt/models/model_epoch_2.pth

Average throughput: 12.5MiB/s
Upload complete.
Epoch 3/10 | Batch 100% | Loss: 4.4337Epoch 3/10 -> Average Validation Loss: 5.1428
Model saved locally to 'model/model_epoch_3.pth'
Uploading model to GCS: gs://bernhackt/models/model_epoch_3.pth
Copying file://model/model_epoch_3.pth to gs://bernhackt/models/model_epoch_3.pth

Average throughput: 12.3M

In [60]:
for epoch in range(NUM_EPOCHS, 2 * NUM_EPOCHS):
    avg_loss = train_model(model, train_dataloader, optimizer, criterion_h, criterion_ce, DEVICE)
    avg_val_loss = validate_model(model, val_dataloader, criterion_h, criterion_ce, DEVICE)
    scheduler.step(avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} -> Average Validation Loss: {avg_val_loss:.4f}")
    save_model(model.state_dict(), epoch, bucket_name)

    if epochs_no_improve >= patience:
        print(f"\n--- Early stopping triggered after {epoch+1} epochs. ---")
        break

Epoch 11/10 | Batch 100% | Loss: 1.9467Epoch 11/10 -> Average Validation Loss: 2.1270
Model saved locally to 'model/model_epoch_11.pth'
Uploading model to GCS: gs://bernhackt/models/model_epoch_11.pth
Copying file://model/model_epoch_11.pth to gs://bernhackt/models/model_epoch_11.pth

Average throughput: 12.5MiB/s
Upload complete.
Epoch 12/10 | Batch 100% | Loss: 1.9315Epoch 12/10 -> Average Validation Loss: 2.1234
Model saved locally to 'model/model_epoch_12.pth'
Uploading model to GCS: gs://bernhackt/models/model_epoch_12.pth
Copying file://model/model_epoch_12.pth to gs://bernhackt/models/model_epoch_12.pth

Average throughput: 13.2MiB/s
Upload complete.
Epoch 13/10 | Batch 35% | Loss: 1.9147

KeyboardInterrupt: 

# Inference

In [61]:
def display_sequence(sequence_np, scaler, idx2cat, idx2merch, is_target=False):
    print("\n--- INPUT SEQUENCE (CONTEXT) ---" if not is_target else "\n--- ACTUAL TRANSACTIONS (FOR COMPARISON) ---")
    num_numerical_features = len(scaler.scale_)

    for i, step in enumerate(sequence_np):
        if not is_target:
            numerical_features_normalized = step[[0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]
            unscaled_numerical = scaler.inverse_transform(numerical_features_normalized.reshape(1, -1))[0]
            real_amount = unscaled_numerical[0]
            time_delta = unscaled_numerical[2]
            cat_id_norm = int(round(step[2]))
            merch_id_norm = int(round(step[3]))
        else:
            dummy_scaled_features = np.zeros((1, num_numerical_features))
            amount_index_in_scaler = 0
            time_delta_index_in_scaler = 2

            dummy_scaled_features[0, amount_index_in_scaler] = step[0]
            dummy_scaled_features[0, time_delta_index_in_scaler] = step[3]

            unscaled_features = scaler.inverse_transform(dummy_scaled_features)[0]
            real_amount = unscaled_features[amount_index_in_scaler]
            time_delta = unscaled_features[time_delta_index_in_scaler]

            cat_id_norm = int(round(step[1]))
            merch_id_norm = int(round(step[2]))


        category_name = idx2cat.get(cat_id_norm, "Unknown")
        merchant_name = idx2merch.get(merch_id_norm, "Unknown")

        print(f"Step {i+1:02d}: Amount: ${real_amount:10.2f} | Category: {category_name:<25} | Merchant: {merchant_name} | Time Delta: {time_delta:.2f}")
    print("-" * 70)

In [43]:
def predict_next_transactions(input_tensor, model, scaler, idx2cat, idx2merch, device, forecast_horizon):
    amount_preds, category_preds, merchant_preds = model.predict(input_tensor, forecast_horizon)

    predictions = []
    num_numerical_features = len(scaler.scale_)

    for i in range(forecast_horizon):
        norm_amount_val = amount_preds[:, i, :].item()
        dummy_row = np.zeros((1, num_numerical_features))
        dummy_row[0, 0] = norm_amount_val
        real_amount = scaler.inverse_transform(dummy_row)[0, 0]

        cat_idx = torch.argmax(category_preds[:, i, :], dim=1).item()
        category_name = idx2cat.get(cat_idx, "Unknown")
        merch_idx = torch.argmax(merchant_preds[:, i, :], dim=1).item()
        merchant_name = idx2merch.get(merch_idx, "Unknown")

        predictions.append(
            f"  - Step {i+1}: Amount: ${real_amount:.2f}, Category: '{category_name}', Merchant: '{merchant_name}'"
        )
    return "\n".join(predictions)

In [75]:
random_val_index = random.choice(val_indices)

input_sequence_np = np.load(sequences_X_path, mmap_mode='r')[random_val_index]
target_sequence_np = np.load(sequences_y_path, mmap_mode='r')[random_val_index]

input_tensor = torch.from_numpy(input_sequence_np).float().unsqueeze(0).to(DEVICE)

idx2cat = {i: name for i, name in enumerate(vocab_mappings['categories'])}
idx2merch = {i: name for i, name in enumerate(vocab_mappings['merchants'])}

display_sequence(input_sequence_np, scaler, idx2cat, idx2merch, is_target=False)

print(f"\n--- PREDICTED TRANSACTIONS (FORECAST HORIZON: {FORECAST_HORIZON}) ---")
predicted_transactions = predict_next_transactions(
    input_tensor,
    model,
    scaler,
    idx2cat,
    idx2merch,
    DEVICE,
    FORECAST_HORIZON
)
print(predicted_transactions)
print("-" * 70)

display_sequence(target_sequence_np, scaler, idx2cat, idx2merch, is_target=True)


--- INPUT SEQUENCE (CONTEXT) ---
Step 01: Amount: $   -250.00 | Category: HEALTHCARE_GENERAL        | Merchant: CSS | Time Delta: 2.00
Step 02: Amount: $   -124.45 | Category: GENERAL_EXPENSES_DAILY    | Merchant: Coop | Time Delta: 1.00
Step 03: Amount: $    -62.99 | Category: P2P_TRANSFER              | Merchant: Twint Split Bill | Time Delta: -0.00
Step 04: Amount: $   5320.99 | Category: INCOME_GENERAL            | Merchant: Local Bank Salary | Time Delta: 3.00
Step 05: Amount: $     -9.75 | Category: BANK_FEE                  | Merchant: Local Bank Bank Fee | Time Delta: 1.00
Step 06: Amount: $   -127.81 | Category: INSURANCE                 | Merchant: Swica Insurance | Time Delta: -0.00
Step 07: Amount: $   -160.00 | Category: SHOPPING_JEWELRY          | Merchant: Swiss Company | Time Delta: -0.00
Step 08: Amount: $    -28.35 | Category: GENERAL_EXPENSES_DAILY    | Merchant: Volg | Time Delta: 1.00
Step 09: Amount: $    -72.90 | Category: SHOPPING_BOOKS            | Merchant: O