# Training Autoencoder

We need autoencoder to have representative embeddings of transactions data.

We later use them in vector database.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from model import TransactionAutoencoder, RefinedTransactionAutoencoder
import pandas as pd


df = pd.read_csv("data/datathon_data.csv")  # Update with your file path

# Define features
categorical_features = ['WAERS', 'BUKRS', 'KTOSL', 'PRCTR', 'BSCHL', 'HKONT']
numerical_features = ['DMBTR', 'WRBTR']

# Encode categorical features
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Scale numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Convert to tensor
transaction_tensor = torch.tensor(df[numerical_features + categorical_features].values, dtype=torch.float32)

# Create DataLoader
batch_size = 256
dataset = TensorDataset(transaction_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [2]:
# Initialize Model
input_dim = transaction_tensor.shape[1]
latent_dim = 8

metadata = {
    "input_dim": input_dim,
    "latent_dim": latent_dim
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
autoencoder = RefinedTransactionAutoencoder(input_dim, latent_dim).to(device)

In [3]:
# Define Optimizer and Loss Function
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)
loss_function = nn.MSELoss()

# Train Autoencoder
num_epochs = 20
loss_history = []

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in dataloader:
        batch_data = batch[0].to(device)

        optimizer.zero_grad()
        encoded, decoded = autoencoder(batch_data)
        loss = loss_function(decoded, batch_data)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    loss_history.append(avg_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.6f}")


Epoch [1/20], Loss: 66.917638
Epoch [2/20], Loss: 1.574322
Epoch [3/20], Loss: 1.338837
Epoch [4/20], Loss: 1.301682
Epoch [5/20], Loss: 1.203743
Epoch [6/20], Loss: 1.181005
Epoch [7/20], Loss: 1.137032
Epoch [8/20], Loss: 1.111493
Epoch [9/20], Loss: 1.093801
Epoch [10/20], Loss: 1.063889
Epoch [11/20], Loss: 1.028603
Epoch [12/20], Loss: 1.010198
Epoch [13/20], Loss: 0.998487
Epoch [14/20], Loss: 0.988024
Epoch [15/20], Loss: 0.987617
Epoch [16/20], Loss: 0.980257
Epoch [17/20], Loss: 0.996732
Epoch [18/20], Loss: 0.955008
Epoch [19/20], Loss: 0.985011
Epoch [20/20], Loss: 0.965712


In [4]:
# Extract embeddings
with torch.no_grad():
    transaction_embeddings = autoencoder.encoder(transaction_tensor.to(device)).cpu().numpy()

# Save embeddings
np.save("weights/refined_transaction_embeddings.npy", transaction_embeddings)
torch.save({"model_state": autoencoder.state_dict(), "metadata": metadata}, "weights/refined_autoencoder_with_metadata.pth")
print("Embeddings saved successfully!")

Embeddings saved successfully!


In [3]:
# Load metadata first
checkpoint = torch.load("weights/refined_autoencoder_with_metadata.pth")
metadata = checkpoint["metadata"]

# Ensure correct architecture
autoencoder = RefinedTransactionAutoencoder(input_dim=metadata["input_dim"], latent_dim=metadata["latent_dim"])

# Load model weights
autoencoder.load_state_dict(checkpoint["model_state"])
autoencoder.eval()

print("✅ Model and metadata loaded successfully.")

✅ Model and metadata loaded successfully.


  checkpoint = torch.load("weights/refined_autoencoder_with_metadata.pth")
