In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch
import torch.nn as nn
import torch.optim as optim

pd.set_option('display.float_format', lambda x: '%.3f' % x)


In [2]:

# Load the data from a CSV file
df = pd.read_csv('../data/preprocessed_v1.csv')


usable_columns = ['day', 'month', 'quarter', 'is_weekend', 'day_of_week', 'scaled_amount', 'log_amount']


data_tensor = torch.tensor(df[usable_columns].values, dtype=torch.float)

In [3]:
import torch.nn as nn

class VAE(nn.Module):
    def __init__(self):
        super(VAE, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(7, 4),  # Encode_1: 8 -> 4
            nn.Tanh(),
            nn.Linear(4, 2),  # Encode_2: 4 -> 2
        )
        
        # Mean and standard deviation of the latent distribution
        self.mean_linear = nn.Linear(2, 1)
        self.log_var_linear = nn.Linear(2, 1)

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(1, 2),  # Decode_1: 1 -> 2
            nn.Tanh(),
            nn.Linear(2, 4),  # Decode_2: 2 -> 4
            nn.ReLU(),
            nn.Linear(4, 7),  # Decode_3: 4 -> 8
            nn.LeakyReLU()
        )
    def forward(self, x):
        # Encode the input data
        encoded = self.encoder(x)

        # Compute the mean and standard deviation of the latent distribution
        mean = self.mean_linear(encoded)
        log_var = self.log_var_linear(encoded)

        # Sample from the latent distribution
        std = torch.exp(0.5 * log_var)
        z = torch.normal(mean, std)

        # Decode the latent representation
        decoded = self.decoder(z)
        
        return decoded, mean, log_var

In [4]:
from sklearn.model_selection import train_test_split

df.sort_values(by='createdAt', ascending=True)

# Time-based train-test split
train_size = int(len(df) * 0.75)

X = df[usable_columns]
y = df['reported']

train_data = df[usable_columns].iloc[:train_size]
test_data = df[usable_columns].iloc[train_size:]

train_labels = df['reported'].iloc[:train_size]
test_labels = df['reported'].iloc[train_size:]

# Filter training data to only include normal transactions
normal_train_data = train_data[train_labels == 0]  # Assuming 0 means normal


train_tensor = torch.tensor(normal_train_data.values, dtype=torch.float32)
test_tensor = torch.tensor(test_data.values, dtype=torch.float32)




In [5]:
batch_size = 200
train_dataset = torch.utils.data.TensorDataset(train_tensor)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(test_tensor)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


lambda_reg = 1e-3

# Model initialization
model = VAE()

# Choice of loss
loss_type = 'MSE'  # Choose between 'MSE' and 'BCE'
if loss_type == 'MSE':
    reconstruction_loss = nn.MSELoss()  # Mean squared error loss
else:
    reconstruction_loss = nn.BCEWithLogitsLoss()  # Binary cross-

In [6]:
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping setup
best_loss = float('inf')
patience = 10
epochs_without_improvement = 0

# Training loop with early stopping
num_epochs = 100

loss_values = []
test_loss_values = []
reconstruction_errors = []


for epoch in range(num_epochs):
    reconstruction_errors = []
    epoch_loss = 0.0
    for batch in train_dataloader:
        inputs = batch[0]

        # Zero the optimizer's gradient buffer
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        mse_loss = reconstruction_loss(outputs, inputs)
        
        # 3. Activity Regularizer
        # Assuming the first layer of the encoder is model.encoder[0]
        activity_regularizer = lambda_reg * torch.norm(model.encoder[0](inputs), 1)
        
        # Combining the regularized loss
        total_loss = mse_loss + activity_regularizer

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        epoch_loss += total_loss.item()


    epoch_loss = epoch_loss / len(train_dataloader)

    # Save the loss value for plotting
    loss_values.append(epoch_loss)

    # Test the model
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for batch in test_dataloader:
            inputs = batch[0]
            outputs = model(inputs)
            mse_loss = reconstruction_loss(outputs, inputs)
            test_loss += mse_loss.item()

            batch_errors = torch.sum((outputs - inputs) ** 2, dim=1).numpy() # Sum of squared differences for each sample
            reconstruction_errors.extend(batch_errors)

        test_loss = test_loss / len(test_dataloader)
        test_loss_values.append(test_loss)

    model.train()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Test Loss: {test_loss:.4f}")


    # Early stopping check
    if epoch_loss + 1e-5 < best_loss:
        best_loss = epoch_loss
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement == patience:
        print("Early stopping due to no improvement in loss.")
        break


AttributeError: 'tuple' object has no attribute 'size'