In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
import pandas as pd
df = pd.read_csv("datathon_data.csv")  # Update with your file path

# Define features
categorical_features = ['WAERS', 'BUKRS', 'KTOSL', 'PRCTR', 'BSCHL', 'HKONT']
numerical_features = ['DMBTR', 'WRBTR']

# Encode categorical features
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Scale numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Convert to tensor
transaction_tensor = torch.tensor(df[numerical_features + categorical_features].values, dtype=torch.float32)

# Create DataLoader
batch_size = 512
dataset = TensorDataset(transaction_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [2]:
# Define Autoencoder Model
class TransactionAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim=4):
        super(TransactionAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

# Initialize Model
input_dim = transaction_tensor.shape[1]
latent_dim = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
autoencoder = TransactionAutoencoder(input_dim, latent_dim).to(device)


In [3]:

# Define Optimizer and Loss Function
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)
loss_function = nn.MSELoss()

# Train Autoencoder
num_epochs = 20
loss_history = []

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in dataloader:
        batch_data = batch[0].to(device)

        optimizer.zero_grad()
        encoded, decoded = autoencoder(batch_data)
        loss = loss_function(decoded, batch_data)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    loss_history.append(avg_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.6f}")




Epoch [1/20], Loss: 32.197238
Epoch [2/20], Loss: 0.443333
Epoch [3/20], Loss: 0.292280
Epoch [4/20], Loss: 0.270495
Epoch [5/20], Loss: 0.251720
Epoch [6/20], Loss: 0.231855
Epoch [7/20], Loss: 0.216271
Epoch [8/20], Loss: 0.201184
Epoch [9/20], Loss: 0.190124
Epoch [10/20], Loss: 0.184369
Epoch [11/20], Loss: 0.179234
Epoch [12/20], Loss: 0.172729
Epoch [13/20], Loss: 0.163361
Epoch [14/20], Loss: 0.157597
Epoch [15/20], Loss: 0.153068
Epoch [16/20], Loss: 0.149302
Epoch [17/20], Loss: 0.146688
Epoch [18/20], Loss: 0.144455
Epoch [19/20], Loss: 0.141950
Epoch [20/20], Loss: 0.141141


In [4]:
# Extract embeddings
with torch.no_grad():
    transaction_embeddings = autoencoder.encoder(transaction_tensor.to(device)).cpu().numpy()

# Save embeddings
np.save("transaction_embeddings.npy", transaction_embeddings)

print("Embeddings saved successfully!")

Embeddings saved successfully!
