# Imports

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
import torchtext as tt
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import time

# Constants
SEPARATOR = '_'

# Data collection & visualization

In [None]:
# Define a custom dataset class for PyTorch
class SupplierDataset(Dataset):
    """Expose the rows of a numeric DataFrame as float32 tensors.

    Args:
        dataframe: pandas DataFrame; each row is one sample. Every value must
            be convertible to float32.
        transform: optional callable applied to the row tensor before it is
            returned.
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return row *idx* as a float32 tensor (after ``transform`` if set)."""
        # Select the row by position
        row = self.dataframe.iloc[idx]

        # Go through an explicit float32 NumPy array instead of handing the
        # label-indexed Series to torch: this avoids torch probing the Series
        # through its sequence protocol and gives a well-defined dtype.
        values = row.to_numpy(dtype="float32")
        tensor = torch.tensor(values, dtype=torch.float32)

        # Apply the transform if one is given
        if self.transform:
            return self.transform(tensor)
        return tensor

In [None]:
def separate_columns_types(dataset):
    """Split *dataset* into text, date, categorical and numerical sub-frames.

    The column names of each group are fixed below. Every column of *dataset*
    that belongs to no group is reported on stdout. Selecting a listed column
    that is absent from *dataset* is left to pandas (it raises ``KeyError``).

    Returns:
        A 4-tuple ``(text, date, categorical, numerical)`` of DataFrames.
    """
    columns_by_kind = {
        "text": ["invoice_code", "customer_name", "customer_email", "customer_address",
                 "customer_city", "customer_state", "customer_postal_code",
                 "customer_country", "notes", "created_by", "updated_by",
                 "shipping_address", "shipping_city", "shipping_state",
                 "shipping_postal_code", "shipping_country"],
        "date": ["invoice_date", "payment_due_date", "created_at", "updated_at",
                 "due_date", "paid_date"],
        "categorical": ["payment_method", "status", "currency", "payment_reference"],
        "numerical": ["invoice_number", "subtotal", "tax_rate", "tax_amount",
                      "discount_rate", "discount_amount", "total", "exchange_rate"],
    }

    # Report every dataset column that is not assigned to a group
    known_columns = {name for names in columns_by_kind.values() for name in names}
    for column in dataset.columns:
        if column not in known_columns:
            print("Column not in any list: " + column)

    # Dict order is insertion order, so the tuple is (text, date, categorical, numerical)
    return tuple(dataset[names] for names in columns_by_kind.values())

In [None]:
class Categorical_Transformer():
    """One-hot encode a single categorical column and decode it back.

    ``transform`` learns the sorted set of distinct values (``self.classes``)
    and returns one 0/1 column per value, named ``<header><SEPARATOR><value>``.
    ``inverse_transform`` maps rows of scores back to the value whose column
    holds the row maximum.
    """

    def __init__(self):
        self.classes = []
        # Not used by this class; kept so existing code reading it keeps working
        self.num_flattent_columns = 0

    # Transform the column from a list of strings to a one-hot encoded DataFrame
    def transform(self, column_data, header):
        """Return the one-hot encoding of *column_data* as a DataFrame.

        Args:
            column_data: 1-D sequence (list, array or Series) of category values.
            header: name of the source column, used as prefix of the new columns.
        """
        # Distinct values (sorted) and, for each row, the index of its value
        unique_classes, indices = np.unique(column_data, return_inverse=True)
        self.classes = unique_classes

        # F.one_hot requires int64 indices; be explicit instead of relying on
        # the platform's default NumPy integer width
        tensor_data = torch.as_tensor(indices, dtype=torch.long)
        one_hot_array = F.one_hot(tensor_data, len(self.classes)).numpy()

        # One column per class, in the format header_class
        columns = [f"{header}{SEPARATOR}{label}" for label in self.classes]

        # Keep the row labels of a Series input so that a later index-based
        # join lines the encoded rows up with the rows they came from
        index = column_data.index if isinstance(column_data, pd.Series) else None
        return pd.DataFrame(one_hot_array, columns=columns, index=index)

    # Inverse transform the column from a one-hot encoded array to an array of class values
    def inverse_transform(self, columns):
        """Return the class value of each row of *columns*.

        Args:
            columns: array-like holding ``len(self.classes)`` scores per row
                (flattened input is reshaped accordingly).

        Raises:
            ValueError: if ``transform`` has not been called yet.
        """
        if len(self.classes) == 0:
            raise ValueError("transform must be called before inverse_transform")

        # Rebuild the (rows, classes) layout from possibly flattened input
        scores = np.asarray(columns).reshape(-1, len(self.classes))

        # The winning column of each row selects the class
        indices = np.argmax(scores, axis=1)
        return self.classes[indices]

In [None]:
def transform_categorical(dataset):
    """One-hot encode every column of *dataset*.

    Returns:
        ``(encoded, categories)`` where ``encoded`` is a new DataFrame in which
        each source column is replaced by its one-hot columns, and
        ``categories`` maps each source column name to the fitted
        ``Categorical_Transformer``.
    """
    # Work on a copy so the caller's frame is left untouched
    transformed_dataset = dataset.copy()
    categories = {}

    for column in dataset.columns:
        # Fit a dedicated encoder on this column and keep it for the inverse step
        encoder = Categorical_Transformer()
        categories[column] = encoder
        encoded = encoder.transform(dataset[column], column)

        # Swap the raw column for its encoded columns (joined on the index)
        without_source = transformed_dataset.drop(columns=[column])
        transformed_dataset = without_source.join(pd.DataFrame(encoded))

    return transformed_dataset, categories

In [None]:
def test_categorical_column(column):
    """Print a Categorical_Transformer round trip for *column* and its verdict.

    Args:
        column: list of category values to encode and decode again.
    """
    # Create a test dataset
    test_dataset = pd.DataFrame({"column1": column})
    print("column to transform : ",column)

    # Create the transformer under test
    categorical_column = Categorical_Transformer()

    # Transform the column
    transformed_dataset = categorical_column.transform(test_dataset["column1"], "column1")
    print("tranformed : ", transformed_dataset)

    # Inverse transform the column
    column_data = categorical_column.inverse_transform(transformed_dataset)
    print("untransformed : ", column_data)

    # `column == column_data` compares a list with an ndarray and does not
    # yield one verdict; array_equal gives a single True/False for the round trip
    print("is correct : ", np.array_equal(column, column_data), "\n")

# Run the Categorical_Transformer round trip on representative inputs
categorical_test_columns = (
    ["a", "b", "c", "d"] * 3,            # a few categories, repeated
    ["a"] * 12,                          # only one value
    [f"a{i}" for i in range(100)],       # a lot of different values
)
for test_column in categorical_test_columns:
    test_categorical_column(test_column)

In [None]:
class Numerical_Transformer:
    """Min-max scale one numerical column to [0, 1] and restore it.

    ``transform`` records the column's minimum, maximum and dtype;
    ``inverse_transform`` uses them to map scaled values back and casts the
    result to the recorded dtype.
    """

    def __init__(self):
        self.min_value = 0
        self.max_value = 0
        self.type = np.int_

    def _span(self):
        """Return max - min, or 1 for a constant column to avoid dividing by zero."""
        span = self.max_value - self.min_value
        return span if span != 0 else 1

    # Transform the column by normalizing the data
    def transform(self, column_data):
        """Return *column_data* scaled to [0, 1] as a float32 NumPy array.

        A constant column (max == min) is mapped to all zeros instead of NaN.

        Args:
            column_data: 1-D numeric sequence (list, ndarray or Series).
        """
        values = np.asarray(column_data)

        # Remember the original dtype so the inverse can restore it
        self.type = values.dtype

        # Convert the column to a float32 PyTorch tensor
        tensor_data = torch.tensor(values.astype(np.float32))

        # Range of values in the column
        self.min_value = torch.min(tensor_data)
        self.max_value = torch.max(tensor_data)

        # Normalize the values in the column between 0 and 1
        normalized = (tensor_data - self.min_value) / self._span()

        return normalized.numpy()

    # Inverse transform the column by denormalizing the data
    def inverse_transform(self, column):
        """Map scaled values back to the original range and dtype.

        Args:
            column: 1-D array-like of values produced by ``transform`` (or
                generated in the same [0, 1] scale).
        """
        tensor_data = torch.tensor(np.asarray(column, dtype=np.float32))

        # Undo the min-max scaling
        denormalized = tensor_data * self._span() + self.min_value
        denormalized_array = denormalized.numpy()

        # float32 arithmetic can land just below the true integer (56.99999);
        # round before the cast so integer columns are not truncated
        if np.issubdtype(self.type, np.integer):
            denormalized_array = np.rint(denormalized_array)

        # Convert the array to the original type
        return denormalized_array.astype(self.type)

In [None]:
def transform_numerical(dataset):
    """Min-max normalize every column of *dataset*.

    Returns:
        ``(normalized, numericals)`` where ``normalized`` is a new DataFrame
        holding the scaled columns and ``numericals`` maps each column name to
        the fitted ``Numerical_Transformer``.
    """
    # Work on a copy so the caller's frame is left untouched
    transformed_dataset = dataset.copy()
    numericals = {}

    for column in dataset.columns:
        # Fit one scaler per column and keep it for the inverse step
        scaler = Numerical_Transformer()
        scaled_values = scaler.transform(dataset[column])
        numericals[column] = scaler

        # Replace the raw column with its normalized values
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(scaled_values)

    return transformed_dataset, numericals

In [None]:
def test_numerical_column():
    """Print a Numerical_Transformer round trip on 100 random integers.

    Every position where the restored value differs from the original is
    reported on stdout; nothing is asserted or returned.
    """
    scaler = Numerical_Transformer()

    # Random integer column in [0, 100)
    column_data = np.random.randint(0, 100, size=100)
    print("original", column_data)

    # Forward pass
    transformed_column = scaler.transform(column_data)
    print("transformed", transformed_column)

    # Backward pass
    inverse_transformed_column = scaler.inverse_transform(transformed_column)
    print("untransformed", inverse_transformed_column)

    # Report each mismatch between the original and the restored values
    pairs = zip(column_data, inverse_transformed_column)
    for i, (expected, restored) in enumerate(pairs):
        if expected != restored:
            print(f"error : {i} - {expected} != {restored}")

# Run the Numerical_Transformer round-trip check; mismatches are printed, nothing is asserted
test_numerical_column()

In [None]:
class Date_Transformer():
    """Split a date column into normalized components and rebuild it.

    ``transform`` emits one float32 column per date/time component
    (year, month, day, hour, minute, second), each divided by its reference
    value. A component whose values are all zero is omitted. The produced
    column names are recorded in ``self.column_names`` (component -> name) and
    the source column name in ``self.column_name``. ``inverse_transform``
    multiplies the components back, rounds them to integers and assembles a
    datetime column.
    """

    # Components handled by this transformer, from coarsest to finest
    _COMPONENTS = ("year", "month", "day", "hour", "minute", "second")

    def __init__(self, reference_year = 3000, reference_month = 12, reference_day = 31, reference_hour = 24, reference_minute = 60, reference_second = 60):
        self.reference_year = reference_year
        self.reference_month = reference_month
        self.reference_day = reference_day
        self.reference_hour = reference_hour
        self.reference_minute = reference_minute
        self.reference_second = reference_second
        self.column_names = {}
        # Name of the source column, recorded by transform()
        self.column_name = None

    def _reference(self, component):
        """Return the normalization divisor configured for *component*."""
        return getattr(self, f"reference_{component}")

    # Transform the column by separating the date into its components and normalizing the data
    def transform(self, column_data, column_name):
        """Return a DataFrame of normalized date components.

        Args:
            column_data: sequence or Series of values accepted by ``pd.to_datetime``.
            column_name: name of the source column; prefix of the new columns.
        """
        # Decompose the date into its components (a Series gives access to .dt
        # and keeps the row labels of a Series input)
        date = pd.to_datetime(pd.Series(column_data))

        # Start from a clean record so a reused instance carries no stale names
        self.column_name = column_name
        self.column_names = {}

        transformed_dataset = pd.DataFrame(index=date.index)

        for component in self._COMPONENTS:
            values = getattr(date.dt, component)

            # Skip a component only when it carries no information at all.
            # (The previous `not values.all() == 0` test dropped the whole
            # component as soon as a single value was 0, e.g. an hour of 00.)
            if values.empty or not (values != 0).any():
                continue

            normalized = (values / self._reference(component)).astype("float32")
            component_column = f'{column_name}{SEPARATOR}{component}'
            self.column_names[component] = component_column
            transformed_dataset[component_column] = normalized

        return transformed_dataset

    # Inverse transform the column by denormalizing the data and combining the components into a date
    def inverse_transform(self, columns):
        """Rebuild the datetime column from normalized component columns.

        Args:
            columns: mapping or DataFrame indexable by the component column
                names recorded in ``self.column_names``.

        Returns:
            A one-column DataFrame named after the source column. Component
            combinations that do not form a valid date become ``NaT``.
        """
        date_components = {}

        for component in self._COMPONENTS:
            if component not in self.column_names:
                continue

            normalized = pd.Series(columns[self.column_names[component]]).astype("float64")
            restored = normalized * self._reference(component)

            # Round before the integer cast: float32 normalization turns
            # 2021 into 2020.9999..., which plain truncation maps to 2020
            date_components[component] = restored.round().astype(np.int_)

        # Prefer the name recorded by transform(); splitting the generated
        # name on the first separator is wrong for names such as "invoice_date"
        column_name = getattr(self, "column_name", None)
        if column_name is None:
            # Raises KeyError when no year component was recorded, as before
            column_name = self.column_names["year"].rsplit(SEPARATOR, 1)[0]

        # Assemble the date from whichever components are present
        date = pd.to_datetime(date_components, errors="coerce")

        return pd.DataFrame({column_name: date})


In [None]:
def transform_date(dataset):
    """Replace every date column of *dataset* by its normalized components.

    Returns:
        ``(transformed, dates)`` where ``transformed`` is a new DataFrame in
        which each source column is replaced by the component columns produced
        by ``Date_Transformer.transform``, and ``dates`` maps each source
        column name to its fitted ``Date_Transformer``.
    """
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()
    dates = {}

    for column in dataset.columns:

        date_column = Date_Transformer()
        transformed_date = date_column.transform(dataset[column], column)

        # Add the date column to the dates dictionary
        dates[column] = date_column

        # Drop the original date column
        transformed_dataset = transformed_dataset.drop(columns=[column])

        # Add the component columns. They were previously computed and then
        # discarded, so the result contained no date information at all.
        if not transformed_date.columns.empty:
            # Rows are positionally aligned with the source; share its labels
            # so the index-based join cannot misalign them
            transformed_date.index = dataset.index
            transformed_dataset = transformed_dataset.join(transformed_date)

    return transformed_dataset, dates

In [None]:
def test_date_columns(column):
    """Print a Date_Transformer round trip for *column* and report mismatches.

    Args:
        column: list of date strings to transform and rebuild.
    """
    dataset = pd.DataFrame()
    dataset["date"] = column

    print("Original column:", column)
    date_column = Date_Transformer()
    transformed_dataset = date_column.transform(dataset["date"], "date")
    print("Transformed dataset:\n", transformed_dataset)

    # Collect the component columns that were produced. The membership tests
    # previously lacked the f-prefix, so the literal "date{SEPARATOR}year"
    # never matched and nothing was handed to inverse_transform.
    transformed_datas = {}
    for component in ("year", "month", "day", "hour", "minute", "second"):
        component_column = f"date{SEPARATOR}{component}"
        if component_column in transformed_dataset:
            transformed_datas[component_column] = transformed_dataset[component_column]

    untransformed_dataset = date_column.inverse_transform(transformed_datas)
    print("Untransformed dataset:\n", untransformed_dataset)

    # Compare as timestamps: the inputs are strings and the rebuilt values
    # are Timestamps, so comparing them directly never matches
    expected_dates = pd.to_datetime(dataset["date"])
    rows = zip(dataset["date"], expected_dates, untransformed_dataset["date"])
    for original, expected, restored in rows:
        if not expected == restored:
            print("Wrong date: " + str(original) + " != " + str(restored))

# Run the Date_Transformer round trip on representative inputs
date_test_columns = (
    # a few dates with format YYYY-MM-DD
    ["2021-01-01", "2021-02-02", "2021-03-03", "2021-04-04", "2021-05-05"],
    # a few dates with format YYYY-MM-DD HH:MM:SS
    ["2021-01-01 10:48:22", "2021-02-02 04:21:00", "2021-03-03 12:00:00", "2021-04-04 23:59:59", "2021-05-05 00:00:00"],
)
for test_column in date_test_columns:
    test_date_columns(test_column)

In [None]:
def transform_text(dataset):
    """Placeholder for the text encoding step.

    No encoding is applied yet: the function returns a copy of *dataset*
    together with ``0`` in place of a transformer. The disabled draft of a
    GloVe-based encoding is kept below for reference.
    """
    # --- Disabled draft: word-embedding encoding with torchtext -------------
    # embedding_dim = 100
    # glove = GloVe(name='6B', dim=embedding_dim)
    # tokenizer = get_tokenizer('basic_english')
    #
    # for column in dataset.columns:
    #     # Convert the column to a string list, then to lists of tokens
    #     texts = dataset[column].astype(str).tolist()
    #     tokens = [tokenizer(t) for t in texts]
    #
    #     # Map each token to its GloVe index (0 when unknown)
    #     encoded_data = []
    #     for token in tokens:
    #         token_encoded = []
    #         for word in token:
    #             if word in glove.stoi:
    #                 token_encoded.append(glove.stoi[word])
    #             else:
    #                 token_encoded.append(0)
    #         encoded_data.append(token_encoded)
    #
    #     if len(encoded_data) <= 0:
    #         continue
    #
    #     # Pad the non-empty sequences to the same length
    #     non_empty_sequences = [torch.tensor(seq) for seq in encoded_data if len(seq) > 0]
    #     padded_sequences = pad_sequence(non_empty_sequences)
    #
    #     # Add the encoded array to the transformed dataset
    #     transformed_dataset = transformed_dataset.drop(columns=[column])
    #     transformed_dataset[column] = padded_sequences.tolist() # TODO: fix size mismatch error
    # -------------------------------------------------------------------------

    # Hand back a copy so the caller's frame is never modified
    return dataset.copy(), 0

In [None]:
# Load the dataset from the CSV file (replace the path with your own CSV file)
dataset_path = 'datasets/fake_invoice_1000.csv'
df = pd.read_csv(dataset_path)

# Split the raw columns by type before creating the SupplierDataset
text_columns, date_columns, categorical_columns, numerical_columns = separate_columns_types(df)

# One print per statement: the former `print(...), print(...)` pairs built a
# throwaway tuple, which a notebook echoes as `(None, None)` after the cell
print(f"Text columns: {text_columns.shape}\n")
print(text_columns.head())
print(f"Date columns: {date_columns.shape}\n")
print(date_columns.head())
print(f"Categorical columns: {categorical_columns.shape}\n")
print(categorical_columns.head())
print(f"Numerical columns: {numerical_columns.shape}\n")
print(numerical_columns.head())

In [None]:
# Fitted transformers per column type, kept for the inverse (post-processing) step
transformers = {}

# Apply the transformations to each group of columns.
# One print per statement: the former `print(...), print(...)` pairs built a
# throwaway tuple, which a notebook echoes as `(None, None)` after the cell
categorical_columns_treated, transformers["categorical"] = transform_categorical(categorical_columns)
print(f"Categorical columns after transformation: {categorical_columns_treated.shape}\n")
print(categorical_columns_treated.head(), "\n\n")

numerical_columns_treated, transformers["numerical"] = transform_numerical(numerical_columns)
print(f"Numerical columns after transformation: {numerical_columns_treated.shape}\n")
print(numerical_columns_treated.head(), "\n\n")

text_columns_treated, transformers["text"] = transform_text(text_columns)
#print(f"Text columns after transformation: {text_columns_treated.shape}\n"),                 print(text_columns_treated.head(), "\n\n")

date_columns_treated, transformers["date"] = transform_date(date_columns)
print(f"Date columns after transformation: {date_columns_treated.shape}\n")
print(date_columns_treated.head(), "\n\n")

In [None]:
# Concatenate the transformed columns side by side
# (text_columns_treated is left out for now)
#text_columns_treated
df_treated = pd.concat([ numerical_columns_treated, categorical_columns_treated,date_columns_treated], axis=1)

# Separate statements instead of a `print(...), print(...)` tuple, which a
# notebook echoes as `(None, None)` after the cell
print(f"Final dataset: {df_treated.shape}\n")
print(df_treated.head(), "\n\n")

In [None]:
# Report whether the treated dataset contains missing values or empty strings
has_null = df_treated.isnull().to_numpy().any()
has_nan = df_treated.isna().to_numpy().any()
has_empty_string = df_treated.isin(['']).to_numpy().any()

print("Is there null values => ", has_null)
print("Is there nan values => ", has_nan)
print("Is there empty strings => ", has_empty_string)

In [None]:
# Wrap the treated frame in a SupplierDataset and describe its first sample
supplier_dataset = SupplierDataset(dataframe=df_treated)
first_sample = supplier_dataset[0]
print(f"Supplier dataset: {len(supplier_dataset)}\n shape: {first_sample.shape}\n type: {type(first_sample)}\n")

# Split the dataset into train and validation sets

In [None]:
# NOTE(review): train_percentage is not used below; the split uses a hard-coded 0.8 training share
train_percentage = 0.2
batch_size = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Random 80/20 split into training and validation subsets
total_samples = len(supplier_dataset)
train_size = int(0.8 * total_samples)
val_size = total_samples - train_size
train_dataset, val_dataset = random_split(supplier_dataset, [train_size, val_size])

# Batched loaders; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Summary of the split
for summary_label, summary_value in (
    ("Training samples", len(train_dataset)),
    ("Validation samples", len(val_dataset)),
    ("Batch size", batch_size),
):
    print(f"{summary_label}: {summary_value}")

# Show the first batch drawn from the training loader
print("\nTraining samples:\n")
e = iter(train_loader)
element = next(e)
print(f"shape of element: {len(element)} and type: {type(element)}")
print(f"first element :\n{element!s}\n end of first element")

# Classes and functions needed

In [None]:
class Encoder(nn.Module):
    """Two-layer MLP mapping an input vector to latent mean and log-variance.

    Args:
        input_size: number of input features.
        latent_size: dimension of the latent space.
    """

    def __init__(self, input_size, latent_size):
        super().__init__()

        # Hidden widths; other architectures can be tried here
        first_width, second_width = 256, 128
        self.fc1 = nn.Linear(input_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)

        # Heads producing the latent mean and log-variance
        self.fc_mean = nn.Linear(second_width, latent_size)
        self.fc_logvar = nn.Linear(second_width, latent_size)

    def forward(self, x):
        """Return ``(mean, logvar)`` of the latent distribution for *x*."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        return self.fc_mean(hidden), self.fc_logvar(hidden)

In [None]:
class Decoder(nn.Module):
    """Two-layer MLP mapping a latent vector back to the data space.

    Args:
        latent_size: dimension of the latent space.
        output_size: number of reconstructed features.
    """

    def __init__(self, latent_size, output_size):
        super().__init__()

        # Hidden widths mirror the encoder in reverse
        first_width, second_width = 128, 256
        self.fc1 = nn.Linear(latent_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)

        # Reconstruction head
        self.fc_out = nn.Linear(second_width, output_size)

    def forward(self, z):
        """Return the reconstruction of *z*, squashed into [0, 1] by a sigmoid."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(z))))

        # Sigmoid output: assumes the data is normalized to [0, 1]
        return torch.sigmoid(self.fc_out(hidden))

In [None]:
class VAE(nn.Module):
    """Variational autoencoder built from ``Encoder`` and ``Decoder``.

    Args:
        input_size: number of features of one sample.
        latent_size: dimension of the latent space.
    """

    def __init__(self, input_size, latent_size):
        super().__init__()
        self.input_size = input_size
        self.latent_size = latent_size

        # Create instances of the Encoder and Decoder
        self.encoder = Encoder(input_size, latent_size)
        self.decoder = Decoder(latent_size, input_size)

    def reparameterize(self, mean, logvar):
        """Sample ``mean + eps * std`` with ``eps ~ N(0, I)`` (reparameterization trick)."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

    def forward(self, x):
        """Return ``(x_recon, mean, logvar)`` for the batch *x*."""
        # Call the sub-modules rather than their .forward so that module hooks
        # run. The per-call debug print of the full mean/logvar tensors was
        # removed: it flooded the output on every batch.
        mean, logvar = self.encoder(x)

        # Sample from the latent space using the reparameterization trick
        z = self.reparameterize(mean, logvar)

        # Decode the latent sample
        x_recon = self.decoder(z)

        return x_recon, mean, logvar

In [None]:
def loss_function(recon_x, x, mu, logvar, beta=1.0):
    """Return the beta-weighted VAE loss and its two parts.

    Args:
        recon_x: reconstruction produced by the decoder.
        x: original input batch.
        mu: latent means.
        logvar: latent log-variances.
        beta: weight of the KL term.

    Returns:
        ``(total_loss, reconstruction_loss, kl_divergence)``; all three are
        sums over the whole batch.
    """
    # Reconstruction term: summed squared error (swap in another loss if appropriate)
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')

    # KL divergence between N(mu, exp(logvar)) and the standard normal
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.sum(kl_terms)

    return reconstruction_loss + beta * kl_divergence, reconstruction_loss, kl_divergence

In [None]:
def _first_tensor_problem(name, tensor, expected_rows, unit_range=False):
    """Return a message for the first failed sanity check of *tensor*, or None."""
    if tensor is None:
        return f"{name} is None"
    if tensor.shape[0] != expected_rows:
        return f"{name} shape is not batch_size"
    if torch.isnan(tensor).any():
        return f"{name} has nan values"
    if torch.isinf(tensor).any():
        return f"{name} has inf values"
    if unit_range:
        if (tensor < 0).any():
            return f"{name} has negative values"
        if (tensor > 1).any():
            return f"{name} has values greater than 1"
    return None


def checker(recon_data, mu, logvar):
    """Sanity-check the outputs of one VAE forward pass.

    Each tensor must be present, free of NaN/inf values and share the same
    batch dimension; ``recon_data`` must additionally lie within [0, 1]. The
    first problem found is printed.

    The batch dimension is checked for consistency between the three tensors
    instead of against the global ``batch_size``, so a smaller final batch is
    accepted. The former ``tensor.isnull()`` checks were removed: tensors have
    no such method (it raised AttributeError) and the NaN check covers them.

    Returns:
        True when every check passes, False otherwise.
    """
    checks = (
        ("recon_data", recon_data, True),
        ("mu", mu, False),
        ("logvar", logvar, False),
    )

    # Reference batch size: taken from the first tensor that is present
    expected_rows = next(
        (tensor.shape[0] for _, tensor, _ in checks if tensor is not None), None
    )

    for name, tensor, unit_range in checks:
        problem = _first_tensor_problem(name, tensor, expected_rows, unit_range)
        if problem is not None:
            print(problem)
            return False

    return True
      

In [None]:
def train(vae, train_loader, num_epochs=10, learning_rate=1e-3, beta=1.0, device='cuda', with_checker=False):
    """Train *vae* with Adam on the batches of *train_loader*.

    Args:
        vae: model returning ``(recon, mu, logvar)`` when called on a batch.
        train_loader: iterable of input batches (tensors).
        num_epochs: number of passes over *train_loader*.
        learning_rate: Adam learning rate.
        beta: weight of the KL term passed to ``loss_function``.
        device: device the model and the batches are moved to.
        with_checker: when True, run ``checker`` on every forward pass and
            stop training (returning None) at the first failed check.

    Logged averages are per batch; each batch loss is itself a sum over the
    samples of that batch.
    """
    # Move the model to the specified device (cuda or cpu)
    vae.to(device)
    # Make sure the model is in training mode
    vae.train()

    # Define the optimizer
    optimizer = optim.Adam(vae.parameters(), lr=learning_rate)

    # Print logs every N batches (you can adjust this value)
    log_interval = 100

    for epoch in range(num_epochs):
        # Whole-epoch totals, kept apart from the logging window so the epoch
        # summary no longer reports whatever was left after the last reset
        epoch_loss = 0.0
        epoch_batches = 0

        # Running totals since the last periodic log line
        window_loss = 0.0
        window_recon = 0.0
        window_kl = 0.0
        window_batches = 0

        for batch_idx, data in enumerate(train_loader):
            # Move the batch to the specified device
            data = data.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass through the VAE
            recon_data, mu, logvar = vae(data)

            # Check the model return values
            if with_checker and not checker(recon_data, mu, logvar):
                return

            # Calculate the VAE loss
            loss, recon_loss_batch, kl_loss_batch = loss_function(recon_data, data, mu, logvar, beta=beta)

            # Backward pass and optimization step
            loss.backward()
            optimizer.step()

            # Update the running totals
            batch_loss = loss.item()
            epoch_loss += batch_loss
            epoch_batches += 1
            window_loss += batch_loss
            window_recon += recon_loss_batch.item()
            window_kl += kl_loss_batch.item()
            window_batches += 1

            if batch_idx % log_interval == 0 and batch_idx > 0:
                # Divide by the number of batches actually accumulated (the
                # first window holds log_interval + 1 of them)
                avg_loss = window_loss / window_batches
                avg_recon_loss = window_recon / window_batches
                avg_kl_loss = window_kl / window_batches

                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], '
                      f'Avg. Loss: {avg_loss:.4f}, Avg. Recon Loss: {avg_recon_loss:.4f}, Avg. KL Loss: {avg_kl_loss:.4f}')

                window_loss = 0.0
                window_recon = 0.0
                window_kl = 0.0
                window_batches = 0

        # Print epoch-level logs (average loss per batch over the whole epoch)
        avg_epoch_loss = epoch_loss / epoch_batches if epoch_batches else 0.0
        print(f'Epoch [{epoch+1}/{num_epochs}], Avg. Total Loss: {avg_epoch_loss:.4f}')

    print('Training complete.')

# Create model and train it

In [None]:
# Create the model (the optimizer is created inside train())
# Input dimension: length of one sample tensor of the training dataset
input_dim =  len(train_dataset[1])
print(f"input_dim : {input_dim}")
# Latent dimension: half of the input dimension (integer division)
latent_dim = input_dim // 2
model = VAE(input_dim, latent_dim).to(device)

In [None]:
# Show the chosen dimensions and the module structure of the model
print(f"input_dim: {input_dim} and latent_dim: {latent_dim}")
print(f"model: {model}")

In [None]:
# Train the model for a single epoch on the training loader, without the per-batch output checker
train(model, train_loader, num_epochs=1, device=device, with_checker=False)

# Generate new data to test

In [None]:
def add_headers(headers, generated_data):
    """Return *generated_data* as a DataFrame whose columns are *headers*."""
    labelled = pd.DataFrame(data=generated_data, columns=headers)
    return labelled

In [None]:
# Generate a new sample
def generate_sample(model, headers):
    """Decode one random latent vector into a one-row DataFrame.

    Args:
        model: trained model exposing ``latent_size`` and ``decoder``.
        headers: column labels for the generated row.
    """
    # Draw the latent vector on the device the model lives on; a CPU sample
    # fed to a model on CUDA raises a device-mismatch error
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else torch.device("cpu")

    # No gradients are needed for generation
    with torch.no_grad():
        sample = torch.randn(1, model.latent_size, device=device)
        # .numpy() only works on CPU tensors, so move the result back first
        generated_data = model.decoder(sample).cpu().numpy()

    return add_headers(headers, generated_data)

In [None]:
# Generate one sample labelled with the columns of the treated dataset
new_sample = generate_sample(model, df_treated.columns)

# Compare the shape of the generated sample with the shape of the treated dataset
print(new_sample.shape)
print(df_treated.shape)

In [None]:
# Print five freshly generated samples
print("Generated Supplier Data:")
for i in range(5):
    print(generate_sample(model, df_treated.columns))

# Save the model weights to a timestamped file in the working directory
timestamp = time.strftime("%Y%m%d-%H%M%S")
torch.save(model.state_dict(), f'model_{timestamp}.pt')

# Generate a new sample; later cells read it as new_sample
new_sample = generate_sample(model, df_treated.columns)

# Post-process the data

In [None]:
# NOTE: despite their names, these hold the full .shape of each frame, not a column count
num_original_columns = df.shape
num_treated_columns = df_treated.shape
num_generated_columns = new_sample.shape

print(f"Original columns: {num_original_columns}")
print(f"Treated columns: {num_treated_columns}")
print(f"Generated columns: {num_generated_columns}")

# Show the fitted transformers and the generated sample that will be post-processed
print(transformers)
print(new_sample.head())

In [None]:
def _find_categorical_key(column_name, transformers):
    """Return the ``transformers["categorical"]`` key that ``column_name`` derives from, or None."""
    # A generated column is either the original name itself or the original
    # name followed by "_" and a suffix. Splitting on the first "_" would
    # break for original names that contain "_" (e.g. "payment_method").
    matches = [
        key
        for key in transformers["categorical"]
        if column_name == key or column_name.startswith(f"{key}_")
    ]
    # Prefer the longest key so "a_b" wins over "a" for the column "a_b_x".
    return max(matches, key=len, default=None)


def is__in_categorical_tranfomer(column_name, transformers):
    """Tell whether a categorical transformer exists for ``column_name``.

    Args:
        column_name: name of a (possibly suffixed) generated column.
        transformers: mapping with a ``"categorical"`` entry keyed by
            original column name.

    Returns:
        True when ``column_name`` equals a categorical key or starts with
        one followed by ``"_"``; False otherwise.
    """
    return _find_categorical_key(column_name, transformers) is not None


def get_categorical_inverse(column, column_name, transformers):
    """Inverse-transform one generated categorical column.

    Args:
        column: values of the generated column.
        column_name: name of the generated column.
        transformers: mapping with a ``"categorical"`` entry keyed by
            original column name.

    Returns:
        A one-column DataFrame named ``column_name`` holding whatever the
        matching transformer's ``inverse_transform`` returned.

    Raises:
        KeyError: if no categorical transformer matches ``column_name``.
    """
    parsed_column = _find_categorical_key(column_name, transformers)
    if parsed_column is None:
        raise KeyError(column_name)

    # Get the transformer for the column
    transformer = transformers["categorical"][parsed_column]

    # Inverse transform the column
    inversed_column = transformer.inverse_transform(column)

    return pd.DataFrame(inversed_column, columns=[column_name])

In [None]:
def is__in_numerical_tranfomer(column_name, transformers):
    """Tell whether ``column_name`` has an entry in ``transformers["numerical"]``."""
    numerical_transformers = transformers["numerical"]
    return column_name in numerical_transformers

def get_numerical_inverse(column, column_name, transformers):
    """Inverse-transform one numerical column.

    Looks up the transformer stored under ``column_name`` in
    ``transformers["numerical"]``, calls its ``inverse_transform`` on
    ``column`` and returns the result as a one-column DataFrame named
    ``column_name``.
    """
    restored = transformers["numerical"][column_name].inverse_transform(column)
    return pd.DataFrame(restored, columns=[column_name])

In [None]:
def _find_date_key(column_name, transformers):
    """Return the ``transformers["date"]`` key that ``column_name`` derives from, or None."""
    # Date columns such as "invoice_date" contain "_" themselves, so taking
    # the text before the first "_" can never match them. Match on the whole
    # key as a prefix instead.
    matches = [
        key
        for key in transformers["date"]
        if column_name == key or column_name.startswith(f"{key}_")
    ]
    # Prefer the longest key so "paid_date" wins over "paid" for "paid_date_year".
    return max(matches, key=len, default=None)


def is__in_date_tranfomer(column_name, transformers):
    """Tell whether a date transformer exists for ``column_name``.

    Args:
        column_name: name of a (possibly suffixed) generated column.
        transformers: mapping with a ``"date"`` entry keyed by original
            column name.

    Returns:
        True when ``column_name`` equals a date key or starts with one
        followed by ``"_"``; False otherwise.
    """
    return _find_date_key(column_name, transformers) is not None


def get_date_inverse(column, column_name, transformers):
    """Inverse-transform one generated date column.

    Args:
        column: values of the generated column.
        column_name: name of the generated column.
        transformers: mapping with a ``"date"`` entry keyed by original
            column name.

    Returns:
        A one-column DataFrame named ``column_name`` holding whatever the
        matching transformer's ``inverse_transform`` returned.

    Raises:
        KeyError: if no date transformer matches ``column_name``.
    """
    parsed_column = _find_date_key(column_name, transformers)
    if parsed_column is None:
        raise KeyError(column_name)

    # Get the transformer for the column
    transformer = transformers["date"][parsed_column]

    # Inverse transform the column
    inversed_column = transformer.inverse_transform(column)

    return pd.DataFrame(inversed_column, columns=[column_name])

In [None]:
# Group the generated columns by the original column they were derived from
columns_groups = {column: [] for column in df.columns}

print("Before : ", columns_groups)

# Original column names known to any transformer, longest first, so that a
# name containing the separator (e.g. "payment_method") is matched as a whole
# instead of being cut at its first separator.
known_columns = sorted(
    {
        key
        for kind in ("categorical", "numerical", "date")
        for key in transformers.get(kind, {})
    },
    key=len,
    reverse=True,
)

for column in new_sample.columns:
    # A generated column is either the original name or "<original><SEPARATOR><suffix>"
    parsed_column = next(
        (
            key
            for key in known_columns
            if column == key or column.startswith(key + SEPARATOR)
        ),
        None,
    )

    print(f"column : {column} -> {parsed_column}")

    # Columns without a matching transformer are left ungrouped; setdefault
    # avoids a KeyError when a transformer key is not a column of df.
    if parsed_column is not None:
        columns_groups.setdefault(parsed_column, []).append(column)

print("After : ", columns_groups)

In [None]:
# Parse the new sample column names to apply correct transformer
new_sample_columns = new_sample.columns

# (predicate, inverse function) pairs, tried in this order for every column
inverse_handlers = (
    (is__in_categorical_tranfomer, get_categorical_inverse),
    (is__in_numerical_tranfomer, get_numerical_inverse),
    (is__in_date_tranfomer, get_date_inverse),
)

# Apply the inverse transformations to the generated sample.
# DataFrame.join returns a new frame instead of modifying in place, so the
# inverted columns are collected here and concatenated once at the end.
inversed_parts = []
for column_name in new_sample_columns:
    print(f'Column : {column_name}')

    for is_handled, get_inverse in inverse_handlers:
        if is_handled(column_name, transformers):
            inversed_parts.append(
                get_inverse(new_sample[column_name], column_name, transformers)
            )
            break
    # NOTE: text columns are not inverse-transformed yet (no "text" handler).

if inversed_parts:
    new_sample_transformed = pd.concat(inversed_parts, axis=1)
else:
    new_sample_transformed = pd.DataFrame()

In [None]:
# Print the transformed generated sample in the format 'column: value'
# (only the entry at index label 0 of each column is shown)
for column_name in new_sample_transformed.columns:
    print(f"{column_name}: {new_sample_transformed[column_name][0]}")

In [None]:
def reverse_transform_categorical(transformer, categorical_colums):
    """Turn one-hot style cells back into category labels.

    Args:
        transformer: mapping from column name to an object whose
            ``inverse_transform`` maps class indices to labels.
        categorical_colums: DataFrame whose cells each hold a score vector
            (one value per class) — assumed layout, TODO confirm against the
            forward transform.

    Returns:
        A copy of ``categorical_colums`` where every processed column holds
        the labels returned by its transformer. The input is not modified.

    Raises:
        ValueError: if a column's cells do not stack into a 2-D array.
        KeyError: if ``transformer`` has no entry for a column.
    """
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = categorical_colums.copy()

    for column in categorical_colums.columns:
        # Stack the column's cells into a (rows, classes) array. The data is
        # read from the column itself, not from its name.
        scores = np.asarray(categorical_colums[column].tolist(), dtype=float)

        # Nothing to decode for an empty column
        if scores.size == 0:
            continue

        if scores.ndim != 2:
            raise ValueError(
                f"Column {column!r} must hold one score vector per row, "
                f"got an array of shape {scores.shape}"
            )

        # The highest score in each row selects the class index
        indices = np.argmax(scores, axis=1)

        # Map the indices back to labels and replace the column in place,
        # which keeps the original column order
        transformed_dataset[column] = transformer[column].inverse_transform(indices)

    return transformed_dataset

In [None]:
def reverse_transform_numerical(tranformer, numerical_columns):
    """Map scaled numerical columns back to their original range.

    Args:
        tranformer: mapping from column name to the object that scaled the
            column; its ``inverse_transform`` is called here.
        numerical_columns: DataFrame of scaled numerical values.

    Returns:
        A copy of ``numerical_columns`` with every column replaced by its
        inverse-transformed values. The input is not modified.

    Raises:
        KeyError: if ``tranformer`` has no entry for a column.
    """
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = numerical_columns.copy()

    for column in numerical_columns.columns:
        # Shape the values as a single-feature column vector.
        # assumes a scikit-learn style inverse_transform that accepts an
        # (n_samples, 1) array — TODO confirm against the fitted transformer
        scaled = numerical_columns[column].to_numpy(dtype=float).reshape(-1, 1)

        # Nothing to invert for an empty column
        if scaled.size == 0:
            continue

        # Undo the scaling with the column's own transformer rather than
        # normalising the already-normalised values a second time
        restored = np.asarray(tranformer[column].inverse_transform(scaled))

        # Flatten back to one value per row and replace the column in place
        transformed_dataset[column] = restored.reshape(-1)

    return transformed_dataset

In [None]:
def reverse_transform_text(text_columns):
    """Encode every text column as padded sequences of GloVe vocabulary indices.

    NOTE(review): despite its name this function encodes text (tokenise,
    look up indices, pad); it does not turn indices back into text.

    Args:
        text_columns: DataFrame whose columns hold text values.

    Returns:
        A copy of ``text_columns`` where each cell is a list of vocabulary
        indices, padded with 0 to the longest sequence of its column. The
        input is not modified.
    """
    # Prepare embeddings model
    embedding_dim = 100
    glove = GloVe(name='6B', dim=embedding_dim)
    tokenizer = get_tokenizer('basic_english')

    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = text_columns.copy()

    for column in text_columns.columns:
        # Convert the column to a string list
        texts = text_columns[column].astype(str).tolist()

        # Nothing to encode for an empty column
        if not texts:
            continue

        # One index tensor per row; words missing from the vocabulary map to 0.
        # Rows without tokens become empty tensors instead of being dropped,
        # so the number of sequences always equals the number of rows.
        encoded_data = [
            torch.tensor(
                [glove.stoi.get(word, 0) for word in tokenizer(text)],
                dtype=torch.long,
            )
            for text in texts
        ]

        # Pad the sequences to the same length. batch_first=True yields a
        # (rows, max_len) tensor, i.e. one padded sequence per row.
        # Note that the padding value 0 is also the index used for unknown words.
        padded_sequences = pad_sequence(encoded_data, batch_first=True)

        # Replace the column in place (keeps the column order); the Series
        # is built on the frame's own index so every row gets its sequence
        transformed_dataset[column] = pd.Series(
            padded_sequences.tolist(), index=text_columns.index
        )

    return transformed_dataset

In [None]:
def reverse_transform_date(date_columns):
    """Split every date column into scaled numeric component columns.

    NOTE(review): despite its name this function decomposes dates; it does
    not rebuild dates from components.

    For each column, the values are parsed with ``pd.to_datetime`` and the
    columns ``<column>_year``, ``_month``, ``_day``, ``_hour``, ``_minute``
    and ``_second`` are added as float32. The year is kept as is; the other
    components are divided by 12, 31, 24, 60 and 60 respectively. A
    component is skipped when it is 0 on every row (or the column is empty).
    The original date column is dropped.

    Args:
        date_columns: DataFrame whose columns hold date-like values.

    Returns:
        A new DataFrame with the component columns; the input is not modified.
    """
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = date_columns.copy()

    # (datetime accessor attribute, divisor used for scaling)
    components = (
        ("year", 1),
        ("month", 12),
        ("day", 31),
        ("hour", 24),
        ("minute", 60),
        ("second", 60),
    )

    for column in date_columns.columns:
        # Decompose the date into its components
        date = pd.to_datetime(date_columns[column])

        for name, divisor in components:
            values = getattr(date.dt, name)

            # Keep the component only if at least one row carries it. A
            # single zero (e.g. one timestamp at midnight) must not discard
            # the whole component. An empty column has no non-zero value and
            # is skipped as well.
            if not (values != 0).any():
                continue

            # Store a plain float32 Series rather than a torch tensor
            transformed_dataset[f"{column}_{name}"] = (values / divisor).astype("float32")

        # Drop the original date column
        transformed_dataset = transformed_dataset.drop(columns=[column])

    return transformed_dataset

In [None]:
def split_by_type(sample, categorical_columns, numerical_columns, text_columns, date_columns):
    """Label a generated sample and split it into one frame per column type.

    Args:
        sample: 2-D data accepted by ``pd.DataFrame``. Its columns are
            relabelled in the order categorical, numerical, text, date.
        categorical_columns: DataFrame, or collection of column names, of
            the categorical columns.
        numerical_columns: same, for the numerical columns.
        text_columns: same, for the text columns.
        date_columns: same, for the date columns.

    Returns:
        A tuple ``(categorical, numerical, text, date)`` of DataFrames, in
        the same order as the parameters.
    """

    def _names(columns):
        # Accept a DataFrame (use its columns) or a collection of names.
        # The call site in this file passes `.columns` objects, on which
        # `.columns` does not exist.
        return list(getattr(columns, "columns", columns))

    categorical_names = _names(categorical_columns)
    numerical_names = _names(numerical_columns)
    text_names = _names(text_columns)
    date_names = _names(date_columns)

    # Retrieve the columns from the sample and assign the right column name
    sample = pd.DataFrame(sample)
    sample.columns = categorical_names + numerical_names + text_names + date_names

    # Return in parameter order, which is also the order in which the call
    # site in this file unpacks the result
    return (
        sample[categorical_names],
        sample[numerical_names],
        sample[text_names],
        sample[date_names],
    )

In [None]:
# Split the generated sample into one frame per column type, using the column
# names of the treated per-type dataframes
categorical_colums_result, numerical_columns_result, text_columns_result, date_columns_result = split_by_type(new_sample, categorical_columns_treated.columns, numerical_columns_treated.columns, text_columns_treated.columns, date_columns_treated.columns)


# Reverse the transformations applied to the categorical columns
categorical_columns_reversed = reverse_transform_categorical(transformers["categorical"], categorical_colums_result)

# Reverse the transformations applied to the numerical columns
numerical_columns_reversed = reverse_transform_numerical(transformers["numerical"], numerical_columns_result)

# Reverse the transformations applied to the text columns
# NOTE(review): disabled; reverse_transform_text is defined with a single
# parameter, so this two-argument call would need adapting before re-enabling.
#text_columns_reversed = reverse_transform_text(transformers["text"], text_columns_result)

# Reverse the transformations applied to the date columns
# NOTE(review): disabled; reverse_transform_date is defined with a single
# parameter, so this two-argument call would need adapting before re-enabling.
#date_columns_reversed = reverse_transform_date(transformers["date"], date_columns_result)

# Concatenate the reversed columns back into a single dataframe
# (only categorical and numerical columns for now; text and date are disabled above)
#text_columns_reversed, date_columns_reversed
df_reversed = pd.concat([categorical_columns_reversed, numerical_columns_reversed], axis=1)

# Print the reversed dataframe
print(df_reversed)
