# Imports

In [189]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
import torchtext as tt
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import time

# Data Collecting & vizualisation

In [190]:
# Define a custom dataset class for PyTorch
class SupplierDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Get the element at the given index
        element = self.dataframe.iloc[idx]
        
        # Flatten the element into a 1D array by looping through the columns and adding them to the array
        flatten_element = []
        for column in element:
            # list
            if isinstance(column, list):
                for item in column:
                    flatten_element.append(item)
            # numpy array
            elif isinstance(column, np.ndarray):
                for item in column:
                    flatten_element.append(item)
            # sub tensor
            elif isinstance(column, torch.Tensor):
                for item in column:
                    flatten_element.append(item)
            else:
                flatten_element.append(column)
        
        # Convert the element to a PyTorch tensor
        tensor = torch.tensor(flatten_element, dtype=torch.float32)
        
        #print(f"shape of tensor: {tensor.shape} and type: {type(tensor)}")
        
        # Apply the transform if one is given
        if self.transform:
            return self.transform(tensor)
        else:
            return tensor

In [191]:
def separate_columns_types(dataset):
    # Separate the text columns from the other columns
    text_columns = ["invoice_code", "customer_name","customer_email","customer_address","customer_city","customer_state","customer_postal_code",
                    "customer_country","notes","created_by","updated_by","shipping_address","shipping_city","shipping_state",
                    "shipping_postal_code","shipping_country"]
    
    date_columns = ["invoice_date","payment_due_date","created_at","updated_at","due_date","paid_date"]

    categorical_columns = ["payment_method","status","currency","payment_reference"]

    numerical_columns = ["invoice_number","subtotal","tax_rate","tax_amount","discount_rate","discount_amount","total","exchange_rate"]  

    # Check if there is a column not in one of the above lists
    for column in dataset.columns:
        if column not in text_columns and column not in date_columns and column not in categorical_columns and column not in numerical_columns:
            print("Column not in any list: " + column) 
            
    return dataset[text_columns], dataset[date_columns], dataset[categorical_columns], dataset[numerical_columns]

In [192]:
def transform_categorical(dataset):

    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()
    classes = {}
    
    # Apply a one-hot encoding to all columns of the tensor
    for column in dataset.columns:

        # Convert the strings to unique numerical indices
        unique_classes, indices = np.unique(dataset[column], return_inverse=True)
        classes[column] = unique_classes
        #print("Column: " + column + " - Classes: " + str(unique_classes))

        # Convert the indices to a PyTorch tensor
        tensor_data = torch.tensor(indices)

        # Apply one-hot encoding using torch.nn.functional.one_hot
        one_hot_encoded = F.one_hot(tensor_data)
        
        # Convert the one-hot encoding tensor to a NumPy array
        one_hot_array = one_hot_encoded.numpy()

        # Add the one-hot encoded array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(one_hot_array)

    return transformed_dataset, classes

In [193]:
def transform_numerical(dataset):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    # Apply a min-max normalization to all columns of the tensor
    for column in dataset.columns:
        # Convert the column to a PyTorch tensor
        tensor_data = torch.tensor(dataset[column], dtype=torch.float32)

        # Range of values in the column
        min_value = torch.min(tensor_data)
        max_value = torch.max(tensor_data)

        # Normalize the values in the column between 0 and 1en
        normalized = (tensor_data - min_value) / (max_value - min_value)

        # Apply min-max normalization using torch.nn.functional.normalize
        #normalized = F.normalize(tensor_data)

        # Convert the normalized tensor to a NumPy array
        normalized_array = normalized.numpy()

        # Add the normalized array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(normalized_array)

    return transformed_dataset

In [194]:
def transform_text(dataset):
    # Prepare embeddings model
    embedding_dim = 100
    glove = GloVe(name='6B', dim=embedding_dim)
    tokenizer = get_tokenizer('basic_english')

    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    # Apply a word embedding encoding to all columns of the tensor using torchtext
    for column in dataset.columns:
        # Convert the column to a string list
        texts = dataset[column].astype(str).tolist()

        # Convert the text to a list of tokens
        tokens = [tokenizer(t) for t in texts]

        # Convert the tokens to a list of indices
        encoded_data = []
        for token in tokens:
            token_encoded = []
            for word in token:
                if word in glove.stoi:
                    token_encoded.append(glove.stoi[word])
                else:
                    token_encoded.append(0)
            encoded_data.append(token_encoded)

        # Convert the indices to a PyTorch tensor
        if len(encoded_data) <= 0:
            continue

        non_empty_sequences = [torch.tensor(seq) for seq in encoded_data if len(seq) > 0]

        # Pad the sequences to the same length
        padded_sequences = pad_sequence(non_empty_sequences)

        # Add the encoded array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = padded_sequences.tolist() # TODO: fix size mismatch error
        
    return transformed_dataset

In [195]:
def transform_date(dataset):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    # Apply a timestamp convertion to all columns of the tensor
    for column in dataset.columns:
        # Apply timestamp convertion using pandas.to_datetime
        timestamp = pd.to_datetime(dataset[column]).astype(np.int64) // 10**9

        # Add the timestamp to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(timestamp)

    return transformed_dataset

In [196]:
# Load the csv dataset from the csv file
dataset_path = 'datasets/fake_invoice_1000.csv'
# Creating a DataFrame from the CSV data (replace this with your actual CSV file path)
df = pd.read_csv(dataset_path)

# Process the dataset before creating the SupplierDataset
text_columns, date_columns, categorical_columns, numerical_columns = separate_columns_types(df)
print(f"Text columns: {text_columns.shape}\n"),                 #print(text_columns.head())
print(f"Date columns: {date_columns.shape}\n"),                 #print(date_columns.head())
print(f"Categorical columns: {categorical_columns.shape}\n"),   #print(categorical_columns.head())
print(f"Numerical columns: {numerical_columns.shape}\n"),       #print(numerical_columns.head())

Text columns: (1000, 16)

Date columns: (1000, 6)

Categorical columns: (1000, 4)

Numerical columns: (1000, 8)



(None,)

In [197]:
# Apply transformations to the categorical columns
categorical_colums_treated, classes = transform_categorical(categorical_columns)
print(f"Categorical columns after transformation: {categorical_colums_treated.shape}\n"),   print(categorical_colums_treated.head(), "\n\n")
numerical_columns_treated = transform_numerical(numerical_columns)
print(f"Numerical columns after transformation: {numerical_columns_treated.shape}\n"),       print(numerical_columns_treated.head(), "\n\n")
#text_columns_treated = transform_text(text_columns)
#print(f"Text columns after transformation: {text_columns_treated.shape}\n"),                 print(text_columns_treated.head(), "\n\n")
date_columns_treated = transform_date(date_columns)
print(f"Date columns after transformation: {date_columns_treated.shape}\n"),                 print(date_columns_treated.head(), "\n\n")

# Concatenate the transformed columns
#text_columns_treated, 
df_treated = pd.concat([date_columns_treated, categorical_colums_treated, numerical_columns_treated], axis=1)
print(f"Final dataset: {df_treated.shape}\n"), print(df_treated.head(), "\n\n")

# Check if there is nan values or empty strings
print("Is there null values => ", df_treated.isnull().values.any())
print("Is there nan values => ", df_treated.isna().values.any())
print("Is there empty strings => ", df_treated.isin(['']).values.any())

# Create an instance of the SupplierDataset
supplier_dataset = SupplierDataset(dataframe=df_treated)

Categorical columns after transformation: (1000, 4)

  payment_method        status  \
0      [0, 1, 0]  [0, 0, 0, 1]   
1      [1, 0, 0]  [0, 0, 1, 0]   
2      [1, 0, 0]  [0, 1, 0, 0]   
3      [1, 0, 0]  [1, 0, 0, 0]   
4      [0, 0, 1]  [0, 0, 1, 0]   

                                            currency payment_reference  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [0, 1, 0]  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [0, 1, 0]  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [1, 0, 0]  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...         [0, 1, 0]  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [1, 0, 0]   


Numerical columns after transformation: (1000, 8)

   invoice_number  subtotal  tax_rate  tax_amount  discount_rate  \
0        0.248694  0.250331      0.15    0.038610           0.86   
1        0.771873  0.368514      0.05    0.018907           0.30   
2        0.330406  0.185498      0.20    0.038

# Split the dataset into train and test sets

In [198]:
train_percentage = 0.2
batch_size = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split the dataset into training and validation sets
train_size = int(0.8 * len(supplier_dataset))
val_size = len(supplier_dataset) - train_size
train_dataset, val_dataset = random_split(supplier_dataset, [train_size, val_size])

# Create DataLoader instances for training and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Print some statistics about the dataset
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Batch size: {batch_size}")

# Print 5 samples from the training dataset
print("\nTraining samples:" + "\n")
e = iter(train_loader)
element = next(e)
print(f"shape of element: {len(element)} and type: {type(element)}")
print(f"first element :\n"+ str(element) + "\n end of first element")

Training samples: 800
Validation samples: 200
Batch size: 50

Training samples:

shape of element: 50 and type: <class 'torch.Tensor'>
first element :
tensor([[1.6670e+09, 1.6572e+09, 1.6505e+09,  ..., 2.4229e-01, 3.4952e-01,
         2.8601e-01],
        [1.6553e+09, 1.6667e+09, 1.6590e+09,  ..., 7.3515e-01, 3.8267e-01,
         1.1385e-01],
        [1.6511e+09, 1.6475e+09, 1.6658e+09,  ..., 1.6955e-01, 3.3012e-01,
         6.2358e-01],
        ...,
        [1.6629e+09, 1.6573e+09, 1.6464e+09,  ..., 2.3176e-01, 5.8376e-01,
         5.6955e-01],
        [1.6421e+09, 1.6430e+09, 1.6440e+09,  ..., 8.3412e-02, 4.5946e-01,
         7.3320e-01],
        [1.6561e+09, 1.6475e+09, 1.6550e+09,  ..., 6.9297e-03, 3.2556e-01,
         5.8167e-02]])
 end of first element


# Classes and functions needed

In [199]:
class Encoder(nn.Module):
    def __init__(self, input_size, latent_size):
        super(Encoder, self).__init__()
        
        # Define dynamic layers for the encoder
        # You can experiment with different architectures
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        
        # Parameters for the mean and log-variance of the latent space
        self.fc_mean = nn.Linear(128, latent_size)
        self.fc_logvar = nn.Linear(128, latent_size)
        
    def forward(self, x):
        print(f"encoder x as input: {x}")
        x = self.fc1(x)
        print(f"encoder x after fc1: {x}")
        x = F.relu(x)
        print(f"encoder x after relu1: {x}")
        x = self.fc2(x)
        print(f"encoder x after fc2: {x}")
        x = F.relu(x)
        print(f"encoder x after relu2: {x}")
        
        # Calculate mean and log-variance for the latent space
        mean = self.fc_mean(x)
        logvar = self.fc_logvar(x)
        print(f"encoder mean: {mean}\n encoder logvar: {logvar}")
        
        return mean, logvar

In [200]:
class Decoder(nn.Module):
    def __init__(self, latent_size, output_size):
        super(Decoder, self).__init__()
        
        # Define dynamic layers for the decoder
        self.fc1 = nn.Linear(latent_size, 128)
        self.fc2 = nn.Linear(128, 256)
        
        # Output layer for reconstruction
        self.fc_out = nn.Linear(256, output_size)
        
    def forward(self, z):
        z = self.fc1(z)
        z = F.relu(z)
        z = self.fc2(z)
        z = F.relu(z)
        
        # Output layer for reconstruction
        x_recon = torch.sigmoid(self.fc_out(z))  # Assuming the data is normalized to [0, 1]
        
        return x_recon

In [201]:
class VAE(nn.Module):
    def __init__(self, input_size, latent_size):
        super(VAE, self).__init__()
        
        # Create instances of the Encoder and Decoder
        self.encoder = Encoder(input_size, latent_size)
        self.decoder = Decoder(latent_size, input_size)
        
    def reparameterize(self, mean, logvar):
        # Reparameterization trick for sampling from a normal distribution
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std
    
    def forward(self, x):
        # Forward pass through the encoder
        mean, logvar = self.encoder.forward(x)
        print(f"forward mean : {mean} and logvar : {logvar}")
        
        # Sample from the latent space using the reparameterization trick
        z = self.reparameterize(mean, logvar)
        
        # Forward pass through the decoder
        x_recon = self.decoder.forward(z)
        
        return x_recon, mean, logvar

In [202]:
def loss_function(recon_x, x, mu, logvar, beta=1.0):

    # Reconstruction Loss (e.g., Mean Squared Error or Binary Cross Entropy)
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')  # Replace with appropriate loss function

    # KL Divergence Loss
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    # Combine the Reconstruction Loss and KL Divergence Loss with the weighting factor (beta)
    total_loss = reconstruction_loss + beta * kl_divergence
        
    return total_loss, reconstruction_loss, kl_divergence

In [203]:
def checker(recon_data, mu, logvar):
    # Recon data
    if(recon_data is None):
        print("recon_data is None")
        return False
    if(recon_data.shape[0] != batch_size):
        print("recon_data shape is not batch_size")
        return False
       # check for nan values
    if(torch.isnan(recon_data).any()):
        print("recon_data has nan values")
        return False
    # check for inf values
    if(torch.isinf(recon_data).any()):
        print("recon_data has inf values")
        return False
    # check for negative values
    if((recon_data < 0).any()):
        print("recon_data has negative values")
        return False
    # check for values greater than 1
    if((recon_data > 1).any()):
        print("recon_data has values greater than 1")
        return False
    # check for null values
    if(recon_data.isnull().values.any()):
        print("recon_data has null values")
        return False
    
    # mu
    if(mu is None):
        print("mu is None")
        return False
    if(mu.shape[0] != batch_size):
        print("mu shape is not batch_size")
        return False
    # check for nan values
    if(torch.isnan(mu).any()):
        print("mu has nan values")
        return False
    # check for inf values
    if(torch.isinf(mu).any()):
        print("mu has inf values")
        return False
    # check for null values
    if(mu.isnull().values.any()):
        print("mu has null values")
        return False
    
    # logvar
    if(logvar is None):
        print("logvar is None")
        return False
    if(logvar.shape[0] != batch_size):
        print("logvar shape is not batch_size")
        return False
    # check for nan values
    if(torch.isnan(logvar).any()):
        print("logvar has nan values")
        return False
    # check for inf values
    if(torch.isinf(logvar).any()):
        print("logvar has inf values")
        return False
    # check for null values
    if(logvar.isnull().values.any()):
        print("logvar has null values")
        return False
    
    return True   
      

In [204]:
def train(vae, train_loader, num_epochs=10, learning_rate=1e-3, beta=1.0, device='cuda', with_checker=False):
    # Move the model to the specified device (cuda or cpu)
    vae.to(device)
    
    # Define the optimizer
    optimizer = optim.Adam(vae.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0.0
        recon_loss = 0.0
        kl_loss = 0.0
        
        for batch_idx, data in enumerate(train_loader):
            # Move the batch to the specified device
            data = data.to(device)
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass through the VAE
            recon_data, mu, logvar = vae.forward(data)
            
            # Check the model return values
            if(with_checker):
                if(not checker(recon_data, mu, logvar)):
                    return
            
            # Calculate the VAE loss
            loss, recon_loss_batch, kl_loss_batch = loss_function(recon_data, data, mu, logvar, beta=beta)
            
            # Backward pass and optimization step
            loss.backward()
            optimizer.step()
            
            # Update the running total of losses
            total_loss += loss.item()
            recon_loss += recon_loss_batch.item()
            kl_loss += kl_loss_batch.item()
            
            # Print logs every N batches (you can adjust this value)
            log_interval = 100
            if batch_idx % log_interval == 0 and batch_idx > 0:
                avg_loss = total_loss / log_interval
                avg_recon_loss = recon_loss / log_interval
                avg_kl_loss = kl_loss / log_interval
                
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], '
                      f'Avg. Loss: {avg_loss:.4f}, Avg. Recon Loss: {avg_recon_loss:.4f}, Avg. KL Loss: {avg_kl_loss:.4f}')
                
                total_loss = 0.0
                recon_loss = 0.0
                kl_loss = 0.0
                
        # Print epoch-level logs
        print(f'Epoch [{epoch+1}/{num_epochs}], Avg. Total Loss: {total_loss:.4f}')
        
    print('Training complete.')

# Create model and train it

In [205]:
# create model and optimizer
# Get the input dimension from the training dataset
input_dim =  len(train_dataset[0])
# Get the latent dimension from the input dimension
latent_dim = input_dim // 2
model = VAE(input_dim, latent_dim).to(device)

In [206]:
print(f"input_dim: {input_dim} and latent_dim: {latent_dim}")
print(f"model: {model}")

input_dim: 127 and latent_dim: 63
model: VAE(
  (encoder): Encoder(
    (fc1): Linear(in_features=127, out_features=256, bias=True)
    (fc2): Linear(in_features=256, out_features=128, bias=True)
    (fc_mean): Linear(in_features=128, out_features=63, bias=True)
    (fc_logvar): Linear(in_features=128, out_features=63, bias=True)
  )
  (decoder): Decoder(
    (fc1): Linear(in_features=63, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=256, bias=True)
    (fc_out): Linear(in_features=256, out_features=127, bias=True)
  )
)


In [207]:
# train model
train(model, train_loader, num_epochs=1, device=device, with_checker=False)

encoder x as input: tensor([[1.6483e+09, 1.6543e+09, 1.6500e+09,  ..., 5.2375e-01, 3.1124e-01,
         7.6877e-01],
        [1.6591e+09, 1.6434e+09, 1.6515e+09,  ..., 9.2551e-02, 1.0994e-01,
         2.4345e-01],
        [1.6536e+09, 1.6526e+09, 1.6674e+09,  ..., 1.5573e-01, 1.5755e-01,
         2.2551e-01],
        ...,
        [1.6413e+09, 1.6664e+09, 1.6650e+09,  ..., 6.0683e-03, 2.6947e-01,
         6.0840e-01],
        [1.6624e+09, 1.6624e+09, 1.6565e+09,  ..., 7.4392e-01, 3.8067e-01,
         2.5852e-01],
        [1.6604e+09, 1.6598e+09, 1.6457e+09,  ..., 1.5979e-01, 2.8648e-01,
         2.2513e-01]])
encoder x after fc1: tensor([[-1.1434e+07, -1.6814e+06,  4.6976e+07,  ..., -1.7721e+07,
          4.4391e+08, -2.5648e+07],
        [-1.2452e+07, -1.2754e+06,  4.7272e+07,  ..., -1.8209e+07,
          4.4424e+08, -2.5056e+07],
        [-1.0817e+07, -6.8521e+05,  4.7066e+07,  ..., -1.5569e+07,
          4.4390e+08, -2.5909e+07],
        ...,
        [-1.0599e+07, -1.3428e+06,  4.754

encoder x as input: tensor([[1.6578e+09, 1.6686e+09, 1.6455e+09,  ..., 2.8773e-01, 3.4739e-01,
         6.4166e-01],
        [1.6568e+09, 1.6603e+09, 1.6584e+09,  ..., 4.4091e-01, 2.6346e-01,
         5.9533e-01],
        [1.6432e+09, 1.6623e+09, 1.6718e+09,  ..., 5.2283e-01, 6.3452e-01,
         7.2144e-01],
        ...,
        [1.6438e+09, 1.6699e+09, 1.6661e+09,  ..., 5.3618e-01, 5.8553e-01,
         7.1775e-01],
        [1.6679e+09, 1.6648e+09, 1.6670e+09,  ..., 2.7912e-01, 6.4750e-01,
         6.5720e-01],
        [1.6694e+09, 1.6698e+09, 1.6415e+09,  ..., 3.7842e-01, 4.3516e-01,
         3.4019e-01]])
encoder x after fc1: tensor([[-12159560.,        nan,        nan,  ..., -16477332.,        nan,
         -26754662.],
        [-13634268.,        nan,        nan,  ..., -16712091.,        nan,
         -24605324.],
        [-10210633.,        nan,        nan,  ..., -14753873.,        nan,
         -24513886.],
        ...,
        [ -9989273.,        nan,        nan,  ..., -1432918

In [208]:
# Generate a new sample
def generate_sample(model, latent_dim):
    sample = torch.randn(1, latent_dim)
    return model.decoder.forward(sample).detach().numpy()

# Print or use the generated supplier data as needed
print("Generated Supplier Data:")
for i in range(5):
    print(generate_sample(model, latent_dim))

Generated Supplier Data:
[[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan]]
[[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan