# Imports

In [188]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
import torchtext as tt
from torchtext.data import Field, BucketIterator, TabularDataset
import time

ImportError: cannot import name 'Field' from 'torchtext.data' (d:\document\GitHub\POC_GAN_Test_Data_Generation\IvaEnv\Lib\site-packages\torchtext\data\__init__.py)

# Data Collecting & vizualisation

In [180]:
# Define a custom dataset class for PyTorch
class SupplierDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sample = self.dataframe.iloc[idx]

        # Extract features
        labels = ['id', 'status']
        features = sample[labels].values # ignore name and address for now

        # Apply transformations (e.g., convert strings/categories to numerical values)
        if self.transform:
            features = self.transform(features, labels).astype(np.float32)
        # Convert to PyTorch tensor
        features = torch.tensor(features, dtype=torch.float32)

        return features, 0 # 0 is a dummy label

In [181]:
def separate_columns_types(dataset):
    # Separate the text columns from the other columns
    text_columns = ["invoice_code","customer_name","customer_email","customer_address","customer_city","customer_state","customer_postal_code",
                    "customer_country","notes","created_by","updated_by","shipping_address","shipping_city","shipping_state",
                    "shipping_postal_code","shipping_country"]
    
    date_columns = ["invoice_date","payment_due_date","created_at","updated_at","due_date","paid_date"]

    categorical_columns = ["payment_method","status","currency","payment_reference"]

    numerical_columns = ["invoice_number","subtotal","tax_rate","tax_amount","discount_rate","discount_amount","total","exchange_rate"]  

    # Check if there is a column not in one of the above lists
    for column in dataset.columns:
        if column not in text_columns and column not in date_columns and column not in categorical_columns and column not in numerical_columns:
            print("Column not in any list: " + column) 
            
    return dataset[text_columns], dataset[date_columns], dataset[categorical_columns], dataset[numerical_columns]

In [182]:
def transform_categorical(dataset):

    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()
    classes = {}
    
    # Apply a one-hot encoding to all columns of the tensor
    for column in dataset.columns:

        # Convert the strings to unique numerical indices
        unique_classes, indices = np.unique(dataset[column], return_inverse=True)
        classes[column] = unique_classes
        #print("Column: " + column + " - Classes: " + str(unique_classes))

        # Convert the indices to a PyTorch tensor
        tensor_data = torch.tensor(indices)

        # Apply one-hot encoding using torch.nn.functional.one_hot
        one_hot_encoded = F.one_hot(tensor_data)
        
        # Convert the one-hot encoding tensor to a NumPy array
        one_hot_array = one_hot_encoded.numpy()

        # Add the one-hot encoded array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(one_hot_array)

    return transformed_dataset, classes

In [183]:
def transform_numerical(dataset):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    # Apply a min-max normalization to all columns of the tensor
    for column in dataset.columns:
        # Convert the column to a PyTorch tensor
        tensor_data = torch.tensor(dataset[column], dtype=torch.float32)

        # Range of values in the column
        min_value = torch.min(tensor_data)
        max_value = torch.max(tensor_data)

        # Normalize the values in the column between 0 and 1en
        normalized = (tensor_data - min_value) / (max_value - min_value)

        # Apply min-max normalization using torch.nn.functional.normalize
        #normalized = F.normalize(tensor_data)

        # Convert the normalized tensor to a NumPy array
        normalized_array = normalized.numpy()

        # Add the normalized array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(normalized_array)

    return transformed_dataset

In [184]:
def transform_text(dataset):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()
    vocabs = {}

    # Apply a word embedding encoding to all columns of the tensor using torchtext
    for column in dataset.columns:
        # Convert the tensor to a list of strings
        list_data = dataset[column].tolist()

        # Convert the list of strings to a list of lists of strings
        list_data = [[str(item)] for item in list_data]

        # Create a torchtext field
        field = tt.data.Field(tokenize='spacy', lower=True, batch_first=True, include_lengths=True)

        # Create a torchtext dataset
        torchtext_dataset = tt.data.Dataset(list_data, [('text', field)])

        # Build the vocabulary
        field.build_vocab(torchtext_dataset)

        # Convert the text to a PyTorch tensor
        tensor_text = field.process(torchtext_dataset)

        # Convert the tensor to a NumPy array
        tensor_text = tensor_text[0].numpy()

        # Add the text array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(tensor_text)

        # Add the vocabulary to the vocabs dictionary
        vocabs[column] = field.vocab


    return transformed_dataset

In [185]:
def transform_date(dataset):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    # Apply a timestamp convertion to all columns of the tensor
    for column in dataset.columns:
        # Apply timestamp convertion using pandas.to_datetime
        timestamp = pd.to_datetime(dataset[column]).astype(np.int64) // 10**9

        # Add the timestamp to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(timestamp)

    return transformed_dataset

In [186]:
# Load the csv dataset from the csv file
dataset_path = 'datasets/fake_invoice_1000.csv'
# Creating a DataFrame from the CSV data (replace this with your actual CSV file path)
df = pd.read_csv(dataset_path)

# Process the dataset before creating the SupplierDataset
text_columns, date_columns, categorical_columns, numerical_columns = separate_columns_types(df)
print(f"Text columns: {text_columns.shape}\n"),                 #print(text_columns.head())
print(f"Date columns: {date_columns.shape}\n"),                 #print(date_columns.head())
print(f"Categorical columns: {categorical_columns.shape}\n"),   #print(categorical_columns.head())
print(f"Numerical columns: {numerical_columns.shape}\n"),       #print(numerical_columns.head())

Text columns: (1000, 16)

Date columns: (1000, 6)

Categorical columns: (1000, 4)

Numerical columns: (1000, 8)



(None,)

In [187]:
# Apply transformations to the categorical columns
categorical_colums_treated, classes = transform_categorical(categorical_columns)
print(f"Categorical columns after transformation: {categorical_colums_treated.shape}\n"),   print(categorical_colums_treated.head(), "\n\n")
numerical_columns_treated = transform_numerical(numerical_columns)
print(f"Numerical columns after transformation: {numerical_columns_treated.shape}\n"),       print(numerical_columns_treated.head(), "\n\n")
text_columns_treated = transform_text(text_columns)
print(f"Text columns after transformation: {text_columns_treated.shape}\n"),                 print(text_columns_treated.head(), "\n\n")
date_columns_treated = transform_date(date_columns)
print(f"Date columns after transformation: {date_columns_treated.shape}\n"),                 print(date_columns_treated.head(), "\n\n")

# Concatenate the transformed columns
df_treated = pd.concat([text_columns_treated, date_columns_treated, categorical_colums_treated, numerical_columns_treated], axis=1)
print(f"Final dataset: {df_treated.shape}\n"), print(df_treated.head(), "\n\n")
# Create an instance of the SupplierDataset
supplier_dataset = SupplierDataset(dataframe=df_treated)

Categorical columns after transformation: (1000, 4)

  payment_method        status  \
0      [0, 1, 0]  [0, 0, 0, 1]   
1      [1, 0, 0]  [0, 0, 1, 0]   
2      [1, 0, 0]  [0, 1, 0, 0]   
3      [1, 0, 0]  [1, 0, 0, 0]   
4      [0, 0, 1]  [0, 0, 1, 0]   

                                            currency payment_reference  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [0, 1, 0]  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [0, 1, 0]  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [1, 0, 0]  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...         [0, 1, 0]  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         [1, 0, 0]   


Numerical columns after transformation: (1000, 8)

   invoice_number  subtotal  tax_rate  tax_amount  discount_rate  \
0        0.248694  0.250331      0.15    0.038610           0.86   
1        0.771873  0.368514      0.05    0.018907           0.30   
2        0.330406  0.185498      0.20    0.038

AttributeError: module 'torchtext.data' has no attribute 'Field'

In [None]:

for i in range(5):
    sample = supplier_dataset[i]
    print(f"supplier Sample {i + 1}:", sample)

KeyError: "['id'] not in index"

# Split the dataset into train and test sets

In [None]:
train_percentage = 0.2
batch_size = 2

# Split the dataset into training and validation sets
train_size = int(0.8 * len(supplier_dataset))
val_size = len(supplier_dataset) - train_size
train_dataset, val_dataset = random_split(supplier_dataset, [train_size, val_size])

# Create DataLoader instances for training and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classes and functions needed

In [None]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()

        # Encoder layers
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2_mean = nn.Linear(512, latent_dim)
        self.fc2_logvar = nn.Linear(512, latent_dim)

        # Decoder layers
        self.fc3 = nn.Linear(latent_dim, 512)
        self.fc4 = nn.Linear(512, input_dim)

    def encode(self, x):
        # Encoder forward pass
        x = F.relu(self.fc1(x))
        mean = self.fc2_mean(x)
        logvar = self.fc2_logvar(x)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        # Reparameterization trick for sampling from a normal distribution
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

    def decode(self, z):
        # Decoder forward pass
        z = F.relu(self.fc3(z))
        recon_x = torch.sigmoid(self.fc4(z))  # Assuming input features are normalized between 0 and 1
        return recon_x

    def forward(self, x):
        # Full forward pass of the VAE
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        recon_x = self.decode(z)
        return recon_x, mean, logvar

In [None]:
def loss_function(reconstructed_output, input_tensor, mu, log_var):
    print("reconstructed_output", reconstructed_output)
    print("input_tensor", input_tensor)
    BCE = nn.functional.mse_loss(reconstructed_output, input_tensor, reduction='sum')
    KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return BCE + KLD

In [None]:
def train(model, optimizer, train_dataset, epochs, device, x_dim=-1):
    model.train()
    startTotal = time.time()
    for epoch in range(epochs):
        print("\tEpisode", epoch + 1, "/", epochs)
        overall_loss = 0
        start = time.time()
        for batch_idx, (input_tensor, _) in enumerate(train_dataset):
            print("\t\tBatch", batch_idx + 1, "/", len(train_dataset))
            input_tensor = input_tensor.view(batch_size, x_dim).to(device)

            optimizer.zero_grad()

            reconstructed_output, mean, log_var = model(input_tensor)
            loss = loss_function(reconstructed_output, input_tensor, mean, log_var)
            
            overall_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        end = time.time()
        print("\tEpisode Result", "\tAverage Loss: ", overall_loss/(batch_idx*batch_size), "\tStep Time: ", end - start, "s", "\tTotal Time: ", end - startTotal, "s","\n\n")
    return overall_loss

# Create model and train it

In [None]:
# create model and optimizer
input_dim = 2 # corresponds to the number of features in the dataset
latent_dim = 2 # corresponds to the number of latent variables
model = VAE(input_dim, latent_dim).to(device)
optimizer = Adam(model.parameters(), lr=1e-2)

In [None]:
# train model
train(model, optimizer, train_loader, epochs=500, device=device)

In [None]:
# Generate a new sample
def generate_sample(model, latent_dim):
    sample = torch.randn(1, latent_dim)
    return model.decode(sample).detach().numpy()

# Print or use the generated supplier data as needed
print("Generated Supplier Data:")
for i in range(5):
    print(generate_sample(model, latent_dim))