# Imports

In [10]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
import torchvision.transforms as transforms
import time

# Data collection & visualisation

In [11]:
# Define a custom dataset class for PyTorch
class SupplierDataset(Dataset):
    """Row-wise PyTorch dataset over a pandas DataFrame.

    Each item is a ``(features, 0)`` pair: ``features`` is a 1-D ``float32``
    tensor built from the selected columns of one row, and ``0`` is a dummy
    label.

    Args:
        dataframe: Source table, one row per sample.
        transform: Optional callable ``transform(values, column_names)`` that
            returns numeric values for the row (e.g. encodes categories).
        feature_columns: Names of the columns read from each row. Defaults to
            ``('id', 'status')`` (name and address are ignored for now).

    Raises:
        KeyError: At construction time, if a requested column is not present
            in ``dataframe``.
    """

    DEFAULT_FEATURE_COLUMNS = ('id', 'status')

    def __init__(self, dataframe, transform=None, feature_columns=None):
        if feature_columns is None:
            feature_columns = self.DEFAULT_FEATURE_COLUMNS
        # Stored as a list because transforms may call ``labels.index(...)``.
        self.feature_columns = list(feature_columns)

        # Fail early with a readable message instead of on the first item access.
        missing = [name for name in self.feature_columns if name not in dataframe.columns]
        if missing:
            raise KeyError(
                f"feature columns {missing} not found in dataframe; "
                f"available columns: {list(dataframe.columns)}"
            )

        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(features, 0)`` for the row at integer position ``idx``."""
        row = self.dataframe.iloc[idx]

        # Copy so an in-place transform can never write back into the frame.
        features = row[self.feature_columns].to_numpy(copy=True)

        # Apply transformations (e.g., convert strings/categories to numerical values)
        if self.transform:
            features = self.transform(features, list(self.feature_columns))

        # Cast unconditionally: a mixed-dtype row is an object array, which
        # torch cannot convert directly.
        features = np.asarray(features, dtype=np.float32)

        return torch.tensor(features, dtype=torch.float32), 0  # 0 is a dummy label

In [None]:
def is_date_column(dataset, column):
    """Report whether any row of ``dataset`` holds a non-zero value in ``column``.

    NOTE(review): despite the name, the only check performed is
    ``value != 0``; nothing here verifies that the value is actually a date.
    Confirm the intended encoding (presumably 0 marks "no date") with callers.

    Args:
        dataset: Object supporting ``len()`` and integer indexing; each item
            is in turn indexed with ``column``.
        column: Key or position of the field to inspect in each row.

    Returns:
        bool: ``True`` if at least one row has a value different from 0,
        otherwise ``False`` (including for an empty ``dataset``).
    """
    # any() gives an explicit False when nothing matches (previously None).
    return any(dataset[i][column] != 0 for i in range(len(dataset)))


In [21]:
def separate_columns_types(dataset):
    """Split a DataFrame into four sub-frames according to column dtype.

    Classification uses the pandas dtype predicates rather than string
    comparison, so timezone-aware or non-nanosecond datetimes count as dates
    and pandas string-dtype columns count as text. Anything that is not
    categorical, datetime or text falls into the numerical bucket.

    Args:
        dataset: ``pandas.DataFrame`` to partition.

    Returns:
        tuple: ``(text, date, categorical, numerical)`` DataFrames, each
        holding the matching columns in their original order. A bucket with
        no columns is an empty-column DataFrame.
    """
    dtype_checks = pd.api.types

    text_columns = []
    date_columns = []
    categorical_columns = []
    numerical_columns = []

    for column, dtype in dataset.dtypes.items():
        # Categorical is tested first so it can never be mistaken for text.
        if isinstance(dtype, pd.CategoricalDtype):
            categorical_columns.append(column)
        elif dtype_checks.is_datetime64_any_dtype(dtype):
            date_columns.append(column)
        elif dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype):
            text_columns.append(column)
        else:
            numerical_columns.append(column)

    return (
        dataset[text_columns],
        dataset[date_columns],
        dataset[categorical_columns],
        dataset[numerical_columns],
    )

In [16]:
def transform_categorical(data, labels):
    """Encode the ``status`` entry of one row as an integer code.

    The row is modified in place and also returned. Statuses that are not
    listed in the mapping are encoded with the ``'other'`` code instead of
    raising ``KeyError``.

    Args:
        data: Mutable sequence of row values, aligned with ``labels``.
        labels: Column names for ``data``; must contain ``'status'`` and
            provide an ``index`` method (e.g. a list).

    Returns:
        The same ``data`` object with the status value replaced by its code.

    Raises:
        ValueError: If ``'status'`` is not present in ``labels``.
    """
    # Convert categorical variables to numerical values (more advanced
    # encoding methods can replace this lookup).
    status_mapping = {'draft': 0, 'val': 1, 'other': 2}
    status_index = labels.index('status')

    # Unknown statuses share the catch-all 'other' bucket.
    data[status_index] = status_mapping.get(data[status_index], status_mapping['other'])

    # Similar transformations for other categorical variables can be added here.

    return data

In [22]:
# Location of the invoice CSV, relative to the notebook's working directory
dataset_path = 'datasets/fake_invoice_1000.csv'
# Read the whole file into a DataFrame
df = pd.read_csv(dataset_path)

# Bucket the columns by dtype, then preview the first rows of every bucket
text_columns, date_columns, categorical_columns, numerical_columns = separate_columns_types(df)
column_previews = [
    ("Text columns:\n", text_columns),
    ("Date columns:\n", date_columns),
    ("Categorical columns:\n", categorical_columns),
    ("Numerical columns:\n", numerical_columns),
]
for heading, frame in column_previews:
    print(heading, frame.head())

# Wrap the raw frame in a SupplierDataset that encodes the status category.
# NOTE(review): the recorded output of this cell ends in
# KeyError "['id'] not in index" -- the dataset reads an 'id' column that this
# CSV does not appear to have. Confirm which columns should be used.
supplier_dataset = SupplierDataset(dataframe=df, transform=transform_categorical)

# Show the first five items as a quick sanity check
for i in range(5):
    sample = supplier_dataset[i]
    print(f"supplier Sample {i + 1}:", sample)

Text columns:
   invoice_code invoice_date      customer_name               customer_email  \
0          j8*   10/24/2022  Sidoney Vasechkin  ycathrall0@wunderground.com   
1          xP*    4/30/2022       Tore Ledwith       lhellens1@blogspot.com   
2          fV*    7/20/2022    Reggy Godlonton  lyantsev2@timesonline.co.uk   
3          rC*   12/24/2022     Ermin Rowswell           lberndtssen3@ow.ly   
4          E_*     9/6/2022  Juliet Applegarth      lchadbourn4@cornell.edu   

    customer_address customer_city customer_state customer_postal_code  \
0  139 Old Gate Park         Pacho            NaN               254008   
1  76491 Acker Plaza       Ilandža            NaN                  NaN   
2  7 Stoughton Point        Cluses             B9          74311 CEDEX   
3  00 Warrior Street       Neepawa             MB                  H3Z   
4      92 Debs Drive      Nusajaya            JHR                79632   

  customer_country payment_method  ...  updated_at    due_date   

KeyError: "['id'] not in index"

# Split the dataset into training and validation sets

In [None]:
# Fraction of the samples used for training; the remainder is held out for
# validation. (This constant was previously 0.2 and unused, while the split
# itself hard-coded 0.8 -- the effective 80/20 split is unchanged.)
train_percentage = 0.8
batch_size = 2
# Fixed seed so the train/validation membership is reproducible across runs.
split_seed = 42

# Split the dataset into training and validation sets
train_size = int(train_percentage * len(supplier_dataset))
val_size = len(supplier_dataset) - train_size
train_dataset, val_dataset = random_split(
    supplier_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Create DataLoader instances for training and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classes and functions needed

In [None]:
class VAE(nn.Module):
    """Variational auto-encoder with one hidden layer in encoder and decoder.

    Args:
        input_dim: Number of features per sample.
        latent_dim: Size of the latent vector.
        hidden_dim: Width of the hidden layer shared by encoder and decoder.
            Defaults to 512.
    """

    def __init__(self, input_dim, latent_dim, hidden_dim=512):
        super().__init__()

        # Encoder layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mean = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder layers
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Map inputs to the mean and log-variance of the latent distribution."""
        hidden = F.relu(self.fc1(x))
        mean = self.fc2_mean(hidden)
        logvar = self.fc2_logvar(hidden)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        """Sample ``mean + eps * std`` with ``eps ~ N(0, I)``.

        Sampling this way keeps the result differentiable with respect to
        ``mean`` and ``logvar`` (the reparameterization trick).
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

    def decode(self, z):
        """Map latent vectors back to feature space.

        The final sigmoid bounds every output to (0, 1), which assumes the
        input features are normalized to that range.
        """
        hidden = F.relu(self.fc3(z))
        recon_x = torch.sigmoid(self.fc4(hidden))
        return recon_x

    def forward(self, x):
        """Run encode -> sample -> decode.

        Returns:
            tuple: ``(recon_x, mean, logvar)``.
        """
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        recon_x = self.decode(z)
        return recon_x, mean, logvar

In [None]:
def loss_function(reconstructed_output, input_tensor, mu, log_var):
    """Compute the VAE training loss: reconstruction error plus KL divergence.

    Args:
        reconstructed_output: Decoder output, same shape as ``input_tensor``.
        input_tensor: Original inputs.
        mu: Latent means produced by the encoder.
        log_var: Latent log-variances produced by the encoder.

    Returns:
        Scalar tensor: summed squared reconstruction error plus the KL
        divergence between ``N(mu, exp(log_var))`` and the standard normal,
        summed over all elements.
    """
    # The reconstruction term is a summed MSE (not a binary cross-entropy).
    reconstruction_loss = nn.functional.mse_loss(
        reconstructed_output, input_tensor, reduction='sum'
    )
    kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return reconstruction_loss + kl_divergence

In [None]:
def train(model, optimizer, train_dataset, epochs, device, x_dim=-1, verbose_batches=False):
    """Train ``model`` for a number of epochs and report the loss per epoch.

    Args:
        model: Module whose forward pass returns
            ``(reconstruction, mean, log_var)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_dataset: Iterable of ``(input_tensor, label)`` batches, e.g. a
            ``DataLoader``; labels are ignored.
        epochs: Number of passes over ``train_dataset``.
        device: Device the input batches are moved to.
        x_dim: Per-sample feature size used when reshaping each batch; the
            default ``-1`` flattens every sample.
        verbose_batches: When true, also print a line for every batch.

    Returns:
        The summed loss of the last epoch (``0`` if no batch was processed).
    """
    model.train()
    startTotal = time.time()
    # Defined up front so zero epochs still returns a value.
    overall_loss = 0
    for epoch in range(epochs):
        print("\tEpisode", epoch + 1, "/", epochs)
        overall_loss = 0
        samples_seen = 0
        start = time.time()
        for batch_idx, (input_tensor, _) in enumerate(train_dataset):
            if verbose_batches:
                print("\t\tBatch", batch_idx + 1, "/", len(train_dataset))

            # Reshape with the actual batch size so a smaller final batch
            # (and any loader batch size) works.
            current_batch = input_tensor.size(0)
            input_tensor = input_tensor.view(current_batch, x_dim).to(device)

            optimizer.zero_grad()

            reconstructed_output, mean, log_var = model(input_tensor)
            loss = loss_function(reconstructed_output, input_tensor, mean, log_var)

            overall_loss += loss.item()
            samples_seen += current_batch

            loss.backward()
            optimizer.step()
        end = time.time()
        # Average over the samples actually processed; NaN marks an empty epoch.
        average_loss = overall_loss / samples_seen if samples_seen else float("nan")
        print("\tEpisode Result", "\tAverage Loss: ", average_loss, "\tStep Time: ", end - start, "s", "\tTotal Time: ", end - startTotal, "s","\n\n")
    return overall_loss

# Create model and train it

In [None]:
# Build the VAE and the optimizer that will update its weights
input_dim = 2   # number of features in each dataset sample
latent_dim = 2  # number of latent variables
model = VAE(input_dim=input_dim, latent_dim=latent_dim)
model = model.to(device)
optimizer = Adam(params=model.parameters(), lr=1e-2)

In [None]:
# Fit the model on the training batches; the cell displays the last epoch's summed loss
train(
    model=model,
    optimizer=optimizer,
    train_dataset=train_loader,
    epochs=500,
    device=device,
)

In [None]:
# Generate a new sample
def generate_sample(model, latent_dim, num_samples=1):
    """Decode latent vectors drawn from a standard normal distribution.

    Args:
        model: Module exposing ``decode(z)``.
        latent_dim: Size of each latent vector.
        num_samples: How many latent vectors to draw. Defaults to 1.

    Returns:
        numpy.ndarray: Decoded samples, one row per latent vector.
    """
    # Draw the latent sample on the model's own device/dtype so decoding also
    # works when the model lives on the GPU; fall back to torch defaults for
    # a parameter-free model.
    reference = next(model.parameters(), None)
    if reference is None:
        latent = torch.randn(num_samples, latent_dim)
    else:
        latent = torch.randn(
            num_samples, latent_dim, device=reference.device, dtype=reference.dtype
        )

    # No gradients are needed for generation.
    with torch.no_grad():
        decoded = model.decode(latent)

    # .cpu() is required before .numpy() for tensors that live on the GPU.
    return decoded.detach().cpu().numpy()

# Decode five independent draws from the prior and print each one
print("Generated Supplier Data:")
sample_count = 5
for i in range(sample_count):
    decoded_row = generate_sample(model, latent_dim)
    print(decoded_row)