# Imports

In [104]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data.dataset import random_split
import torchvision.transforms as transforms
import time

# Data Collecting & vizualisation

In [105]:
# Define a custom dataset class for PyTorch
class SupplierDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sample = self.dataframe.iloc[idx]

        # Extract features
        labels = ['id', 'status']
        features = sample[labels].values # ignore name and address for now

        # Apply transformations (e.g., convert strings/categories to numerical values)
        if self.transform:
            features = self.transform(features, labels).astype(np.float32)
        # Convert to PyTorch tensor
        features = torch.tensor(features, dtype=torch.float32)

        return features, 0 # 0 is a dummy label

In [106]:
def transform_categorical(data, labels):
    # Convert categorical variables to numerical values (you can use more advanced encoding methods)
    status_mapping = {'draft': 0, 'val': 1, 'other': 2}
    status_index = labels.index('status')
    data[status_index] = status_mapping[data[status_index]]

    # You can implement similar transformations for other categorical variables

    return data

In [107]:
# Load the csv dataset from the csv file
dataset_path = 'datasets/fake_supplier_3.csv'
# Creating a DataFrame from the CSV data (replace this with your actual CSV file path)
df = pd.read_csv(dataset_path)
# Create an instance of the SupplierDataset with the specified transformations
supplier_dataset = SupplierDataset(dataframe=df, transform=transform_categorical)

for i in range(5):
    sample = supplier_dataset[i]
    print(f"supplier Sample {i + 1}:", sample)

supplier Sample 1: (tensor([1., 0.]), 0)
supplier Sample 2: (tensor([2., 1.]), 0)
supplier Sample 3: (tensor([3., 1.]), 0)
supplier Sample 4: (tensor([4., 0.]), 0)
supplier Sample 5: (tensor([5., 1.]), 0)


# Split the dataset into train and test sets

In [108]:
train_percentage = 0.2
batch_size = 2

# Split the dataset into training and validation sets
train_size = int(0.8 * len(supplier_dataset))
val_size = len(supplier_dataset) - train_size
train_dataset, val_dataset = random_split(supplier_dataset, [train_size, val_size])

# Create DataLoader instances for training and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classes and functions needed

In [109]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()

        # Encoder layers
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2_mean = nn.Linear(512, latent_dim)
        self.fc2_logvar = nn.Linear(512, latent_dim)

        # Decoder layers
        self.fc3 = nn.Linear(latent_dim, 512)
        self.fc4 = nn.Linear(512, input_dim)

    def encode(self, x):
        # Encoder forward pass
        x = F.relu(self.fc1(x))
        mean = self.fc2_mean(x)
        logvar = self.fc2_logvar(x)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        # Reparameterization trick for sampling from a normal distribution
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

    def decode(self, z):
        # Decoder forward pass
        z = F.relu(self.fc3(z))
        recon_x = torch.sigmoid(self.fc4(z))  # Assuming input features are normalized between 0 and 1
        return recon_x

    def forward(self, x):
        # Full forward pass of the VAE
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        recon_x = self.decode(z)
        return recon_x, mean, logvar

In [110]:
def loss_function(reconstructed_output, input_tensor, mu, log_var):
    print("reconstructed_output", reconstructed_output)
    print("input_tensor", input_tensor)
    BCE = nn.functional.mse_loss(reconstructed_output, input_tensor, reduction='sum')
    KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return BCE + KLD

In [111]:
def train(model, optimizer, train_dataset, epochs, device, x_dim=-1):
    model.train()
    startTotal = time.time()
    for epoch in range(epochs):
        print("\tEpisode", epoch + 1, "/", epochs)
        overall_loss = 0
        start = time.time()
        for batch_idx, (input_tensor, _) in enumerate(train_dataset):
            print("\t\tBatch", batch_idx + 1, "/", len(train_dataset))
            input_tensor = input_tensor.view(batch_size, x_dim).to(device)

            optimizer.zero_grad()

            reconstructed_output, mean, log_var = model(input_tensor)
            loss = loss_function(reconstructed_output, input_tensor, mean, log_var)
            
            overall_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        end = time.time()
        print("\tEpisode Result", "\tAverage Loss: ", overall_loss/(batch_idx*batch_size), "\tStep Time: ", end - start, "s", "\tTotal Time: ", end - startTotal, "s","\n\n")
    return overall_loss

# Create model and train it

In [112]:
# create model and optimizer
input_dim = 2 # corresponds to the number of features in the dataset
latent_dim = 2 # corresponds to the number of latent variables
model = VAE(input_dim, latent_dim).to(device)
optimizer = Adam(model.parameters(), lr=1e-2)

In [113]:
# train model
train(model, optimizer, train_loader, epochs=500, device=device)

	Episode 1 / 500
		Batch 1 / 2
reconstructed_output tensor([[0.7389, 0.4600],
        [0.7257, 0.4972]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[2., 1.],
        [6., 1.]])
		Batch 2 / 2
reconstructed_output tensor([[0.5143, 0.8420],
        [0.4013, 0.8643]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[3., 1.],
        [4., 0.]])
	Episode Result 	Average Loss:  38.782981872558594 	Step Time:  0.010970592498779297 s 	Total Time:  0.010970592498779297 s 


	Episode 2 / 500
		Batch 1 / 2
reconstructed_output tensor([[0.8194, 0.5663],
        [0.8748, 0.5585]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[3., 1.],
        [2., 1.]])
		Batch 2 / 2
reconstructed_output tensor([[0.9996, 0.9976],
        [0.9960, 0.9833]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[6., 1.],
        [4., 0.]])
	Episode Result 	Average Loss:  27.384371757507324 	Step Time:  0.008971214294433594 s 	Total Time:  0.02097344398498535 s 


	Episode 3 / 500
		Batch 1 / 2
reconstructed_outp

		Batch 2 / 2
reconstructed_output tensor([[1.0000, 0.9999],
        [1.0000, 0.9911]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[4., 0.],
        [2., 1.]])
	Episode Result 	Average Loss:  20.00180435180664 	Step Time:  0.006979703903198242 s 	Total Time:  0.34407711029052734 s 


	Episode 45 / 500
		Batch 1 / 2
reconstructed_output tensor([[0.9999, 0.9917],
        [1.0000, 0.9858]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[2., 1.],
        [3., 1.]])
		Batch 2 / 2
reconstructed_output tensor([[0.9999, 0.9921],
        [0.9997, 0.9793]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[4., 0.],
        [6., 1.]])
	Episode Result 	Average Loss:  19.99622631072998 	Step Time:  0.00797891616821289 s 	Total Time:  0.35205602645874023 s 


	Episode 46 / 500
		Batch 1 / 2
reconstructed_output tensor([[0.9999, 0.9791],
        [0.9999, 0.9948]], grad_fn=<SigmoidBackward0>)
input_tensor tensor([[6., 1.],
        [2., 1.]])
		Batch 2 / 2
reconstructed_output tensor([[1.0000,

40.000163078308105

In [114]:
# Generate a new sample
def generate_sample(model, latent_dim):
    sample = torch.randn(1, latent_dim)
    return model.decode(sample).detach().numpy()

# Print or use the generated supplier data as needed
print("Generated Supplier Data:")
for i in range(5):
    print(generate_sample(model, latent_dim))

Generated Supplier Data:
[[1.         0.99995494]]
[[0.9999995  0.99999976]]
[[0.99999845 0.99999344]]
[[1. 1.]]
[[1. 1.]]
