In [1]:
import pandas as pd

In [5]:
import torch
print(torch.__version__)
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

2.8.0+cu126


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Read the transaction log and parse "Timestamp" with an explicit
# day-month-year format.
df = pd.read_csv("financial_anomaly_data.csv")
timestamps = pd.to_datetime(df["Timestamp"], format="%d-%m-%Y %H:%M")

# Swap the identifier and raw timestamp columns for two calendar features.
# NOTE(review): `hour` prints as a float in the anomaly table further down,
# which suggests some timestamps are missing — confirm against the raw file.
df = df.drop(columns=["TransactionID", "Timestamp"]).assign(
    hour=timestamps.dt.hour,
    dayofweek=timestamps.dt.dayofweek,
)

# Column groups routed to the numeric / categorical preprocessing branches.
numeric_features = [
    "Amount",
    "hour",
    "dayofweek",
]
categorical_features = [
    "AccountID",
    "Merchant",
    "TransactionType",
    "Location",
]

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)


# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time
# (handle_unknown="ignore").
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch; the branch outputs are
# concatenated in the order listed here (numeric first, then categorical).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

import numpy as np

# Fit the preprocessing on the training rows only, then apply the fitted
# transform to the held-out rows.
X_train, X_test = (
    preprocessor.fit_transform(train_df),
    preprocessor.transform(test_df),
)

# The transformer may hand back a sparse matrix; densify both splits when the
# training output exposes `toarray`.
if hasattr(X_train, "toarray"):
    X_train, X_test = X_train.toarray(), X_test.toarray()

# Cast to float32 for the torch model.
X_train, X_test = X_train.astype("float32"), X_test.astype("float32")

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (173952, 36)
Test shape: (43489, 36)


In [6]:
class TabularDataset(Dataset):
    """Label-free dataset that serves rows of a NumPy feature array.

    The array is converted once, at construction, into a float32 tensor kept
    on ``self.X``; indexing returns the corresponding row(s) of that tensor.
    """

    def __init__(self, X):
        """Wrap NumPy array ``X`` (samples along the first axis) as float32."""
        features = torch.from_numpy(X)
        self.X = features.to(torch.float32)

    def __len__(self):
        """Return the number of samples (size of the first axis)."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return the sample(s) selected by ``idx`` from the stored tensor."""
        return self.X[idx]

# Wrap both feature matrices so they can be batched by a DataLoader.
train_dataset, test_dataset = TabularDataset(X_train), TabularDataset(X_test)

# Training batches are reshuffled on every pass; test batches keep row order.
BATCH_SIZE = 256
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [8]:
class Autoencoder(nn.Module):
    """Fully connected, symmetric autoencoder.

    The encoder maps ``input_dim -> hidden_dims... -> latent_dim`` and the
    decoder mirrors it back to ``input_dim``. A ReLU sits between consecutive
    linear layers; neither the latent code nor the reconstruction is passed
    through an activation.

    Args:
        input_dim: Number of input (and reconstructed) features.
        hidden_dims: Widths of the encoder's hidden layers, outermost first.
            The decoder uses them in reverse. Defaults to ``(64, 32)``.
        latent_dim: Width of the bottleneck. Defaults to ``16``.

    With the default arguments the layer stack, the ``state_dict`` keys and
    the order in which layers are created are the same as the fixed
    64-32-16 architecture.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32), latent_dim=16):
        super().__init__()

        widths = (input_dim, *hidden_dims, latent_dim)
        # Build the encoder first, then the decoder, so parameters are
        # initialized in a stable order under a fixed seed.
        self.encoder = self._build_mlp(widths)
        self.decoder = self._build_mlp(widths[::-1])

    @staticmethod
    def _build_mlp(widths):
        """Chain Linear layers over ``widths`` with a ReLU between each pair."""
        layers = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position > 0:
                layers.append(nn.ReLU())
            layers.append(nn.Linear(fan_in, fan_out))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        z = self.encoder(x)
        out = self.decoder(z)
        return out

# Seed torch's global RNG before any weights are created so that a fresh
# Restart & Run All starts from the same initialization. Anything else that
# draws from the global RNG afterwards (presumably including the shuffled
# DataLoader, which is built without its own generator) becomes repeatable too.
SEED = 42
torch.manual_seed(SEED)

# The network width is taken from the preprocessed feature matrix.
input_dim = X_train.shape[1]
device = torch.device("cpu")
model = Autoencoder(input_dim)
model = model.to(device)

# Reconstruction objective: mean squared error between input and output.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [9]:
num_epochs = 50
n_train_samples = len(train_dataset)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        batch = batch.to(device)

        # Forward pass: the batch is both the input and the target.
        reconstruction = model(batch)
        loss = criterion(reconstruction, batch)

        # Clear stale gradients, backpropagate, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        running_loss += loss.item() * batch.shape[0]

    epoch_loss = running_loss / n_train_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.6f}")

Epoch 1/50, Loss: 0.066028
Epoch 2/50, Loss: 0.032519
Epoch 3/50, Loss: 0.028168
Epoch 4/50, Loss: 0.026993
Epoch 5/50, Loss: 0.026375
Epoch 6/50, Loss: 0.026082
Epoch 7/50, Loss: 0.025923
Epoch 8/50, Loss: 0.024694
Epoch 9/50, Loss: 0.022739
Epoch 10/50, Loss: 0.021222
Epoch 11/50, Loss: 0.020266
Epoch 12/50, Loss: 0.019947
Epoch 13/50, Loss: 0.019805
Epoch 14/50, Loss: 0.019776
Epoch 15/50, Loss: 0.019719
Epoch 16/50, Loss: 0.019706
Epoch 17/50, Loss: 0.019677
Epoch 18/50, Loss: 0.019676
Epoch 19/50, Loss: 0.019660
Epoch 20/50, Loss: 0.019633
Epoch 21/50, Loss: 0.019619
Epoch 22/50, Loss: 0.019596
Epoch 23/50, Loss: 0.019605
Epoch 24/50, Loss: 0.019567
Epoch 25/50, Loss: 0.019588
Epoch 26/50, Loss: 0.019552
Epoch 27/50, Loss: 0.019546
Epoch 28/50, Loss: 0.019532
Epoch 29/50, Loss: 0.019533
Epoch 30/50, Loss: 0.019524
Epoch 31/50, Loss: 0.019516
Epoch 32/50, Loss: 0.019497
Epoch 33/50, Loss: 0.019522
Epoch 34/50, Loss: 0.019480
Epoch 35/50, Loss: 0.019494
Epoch 36/50, Loss: 0.019485
E

In [10]:
# Switch to evaluation mode and score every training row by its mean squared
# reconstruction error (averaged over the feature axis).
model.eval()
with torch.no_grad():
    X_train_tensor = torch.from_numpy(X_train).to(device)
    recon_train = model(X_train_tensor)
    train_errors = (X_train_tensor - recon_train).pow(2).mean(dim=1)

# Anomaly cut-off: three standard deviations above the mean training error,
# stored as a plain Python float.
threshold = (train_errors.mean() + 3 * train_errors.std()).item()
print("Threshold:", threshold)


Threshold: 0.048860736191272736


In [11]:
# Score the held-out rows with the same per-row mean squared reconstruction
# error, without tracking gradients.
with torch.no_grad():
    X_test_tensor = torch.from_numpy(X_test).to(device)
    recon_test = model(X_test_tensor)
    test_errors = (X_test_tensor - recon_test).pow(2).mean(dim=1)

# Move the scores to host memory as a NumPy array for use with pandas.
test_errors_cpu = test_errors.cpu().numpy()

In [12]:
# Attach each test row's reconstruction error to a copy of the test frame
# (the scores are in the same row order as test_df).
test_df_anom = test_df.assign(recon_error=test_errors_cpu)

# Flag rows whose error exceeds the training-derived threshold (1 = anomaly).
test_df_anom["is_anomaly"] = test_df_anom["recon_error"].gt(threshold).astype(int)

# Show the 20 rows the model reconstructs worst.
top_anomalies = test_df_anom.sort_values(by="recon_error", ascending=False).iloc[:20]
print(top_anomalies)

       AccountID     Amount   Merchant TransactionType       Location  hour  \
213460      ACC5  689504.90  MerchantI        Purchase  San Francisco  13.0   
3448       ACC12  712076.97  MerchantA      Withdrawal          Tokyo  17.0   
211709      ACC1   65245.44  MerchantE        Transfer          Tokyo   8.0   
30329       ACC1   47852.25  MerchantE        Transfer          Tokyo   9.0   
38847       ACC1   48348.52  MerchantE        Transfer          Tokyo   7.0   
161837      ACC1    3693.67  MerchantE        Transfer          Tokyo  17.0   
8655        ACC1   66417.17  MerchantE        Transfer          Tokyo   8.0   
210555      ACC1   36803.36  MerchantE        Transfer          Tokyo  13.0   
99632       ACC1   80071.45  MerchantE        Transfer          Tokyo  12.0   
166886      ACC1     977.95  MerchantE        Transfer          Tokyo   5.0   
140274      ACC1   70281.01  MerchantE        Transfer          Tokyo  17.0   
69795       ACC1   82657.62  MerchantE        Transf