# Imports

In [154]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
import torchtext as tt
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import time

# Data Collecting & vizualisation

In [155]:
# Define a custom dataset class for PyTorch
class SupplierDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Get the element at the given index
        element = self.dataframe.iloc[idx]
        
        # Flatten the element into a 1D array by looping through the columns and adding them to the array
        flatten_element = []
        for column in element:
            # list
            if isinstance(column, list):
                for item in column:
                    flatten_element.append(item)
            # numpy array
            elif isinstance(column, np.ndarray):
                for item in column:
                    flatten_element.append(item)
            # sub tensor
            elif isinstance(column, torch.Tensor):
                for item in column:
                    flatten_element.append(item)
            else:
                flatten_element.append(column)
        
        # Convert the element to a PyTorch tensor
        tensor = torch.tensor(flatten_element, dtype=torch.float32)
        
        #print(f"shape of tensor: {tensor.shape} and type: {type(tensor)}")
        
        # Apply the transform if one is given
        if self.transform:
            return self.transform(tensor)
        else:
            return tensor

In [156]:
def separate_columns_types(dataset):
    # Separate the text columns from the other columns
    text_columns = ["invoice_code", "customer_name","customer_email","customer_address","customer_city","customer_state","customer_postal_code",
                    "customer_country","notes","created_by","updated_by","shipping_address","shipping_city","shipping_state",
                    "shipping_postal_code","shipping_country"]
    
    date_columns = ["invoice_date","payment_due_date","created_at","updated_at","due_date","paid_date"]

    categorical_columns = ["payment_method","status","currency","payment_reference"]

    numerical_columns = ["invoice_number","subtotal","tax_rate","tax_amount","discount_rate","discount_amount","total","exchange_rate"]  

    # Check if there is a column not in one of the above lists
    for column in dataset.columns:
        if column not in text_columns and column not in date_columns and column not in categorical_columns and column not in numerical_columns:
            print("Column not in any list: " + column) 
            
    return dataset[text_columns], dataset[date_columns], dataset[categorical_columns], dataset[numerical_columns]

In [157]:
def transform_categorical(dataset):

    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()
    classes = {}
    
    # Apply a one-hot encoding to all columns of the tensor
    for column in dataset.columns:

        # Convert the strings to unique numerical indices
        unique_classes, indices = np.unique(dataset[column], return_inverse=True)
        classes[column] = unique_classes
        #print("Column: " + column + " - Classes: " + str(unique_classes))

        # Convert the indices to a PyTorch tensor
        tensor_data = torch.tensor(indices)

        # Apply one-hot encoding using torch.nn.functional.one_hot
        one_hot_encoded = F.one_hot(tensor_data)
        
        # Convert the one-hot encoding tensor to a NumPy array
        one_hot_array = one_hot_encoded.numpy()

        # Add the one-hot encoded array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(one_hot_array)

    return transformed_dataset, classes

In [158]:
def transform_numerical(dataset):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    # Apply a min-max normalization to all columns of the tensor
    for column in dataset.columns:
        # Convert the column to a PyTorch tensor
        tensor_data = torch.tensor(dataset[column], dtype=torch.float32)

        # Range of values in the column
        min_value = torch.min(tensor_data)
        max_value = torch.max(tensor_data)

        # Normalize the values in the column between 0 and 1en
        normalized = (tensor_data - min_value) / (max_value - min_value)

        # Apply min-max normalization using torch.nn.functional.normalize
        #normalized = F.normalize(tensor_data)

        # Convert the normalized tensor to a NumPy array
        normalized_array = normalized.numpy()

        # Add the normalized array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(normalized_array)

    return transformed_dataset

In [159]:
def transform_text(dataset):
    # Prepare embeddings model
    # embedding_dim = 100
    # glove = GloVe(name='6B', dim=embedding_dim)
    # tokenizer = get_tokenizer('basic_english')

    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    # # Apply a word embedding encoding to all columns of the tensor using torchtext
    # for column in dataset.columns:
    #     # Convert the column to a string list
    #     texts = dataset[column].astype(str).tolist()

    #     # Convert the text to a list of tokens
    #     tokens = [tokenizer(t) for t in texts]

    #     # Convert the tokens to a list of indices
    #     encoded_data = []
    #     for token in tokens:
    #         token_encoded = []
    #         for word in token:
    #             if word in glove.stoi:
    #                 token_encoded.append(glove.stoi[word])
    #             else:
    #                 token_encoded.append(0)
    #         encoded_data.append(token_encoded)

    #     # Convert the indices to a PyTorch tensor
    #     if len(encoded_data) <= 0:
    #         continue

    #     non_empty_sequences = [torch.tensor(seq) for seq in encoded_data if len(seq) > 0]

    #     # Pad the sequences to the same length
    #     padded_sequences = pad_sequence(non_empty_sequences)

    #     # Add the encoded array to the transformed dataset
    #     transformed_dataset = transformed_dataset.drop(columns=[column])
    #     transformed_dataset[column] = padded_sequences.tolist() # TODO: fix size mismatch error
        
    return transformed_dataset

In [160]:
def transform_date(dataset, reference_year = 3000):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = dataset.copy()

    for column in dataset.columns:
        # Decompose the date into its components
        date = pd.to_datetime(dataset[column])

        year = date.dt.year
        # Check if the inital column contain a year
        if not year.empty and not year.all() == 0:
            year = torch.tensor(year/reference_year, dtype=torch.float32)
            transformed_dataset[column + "_year"] = year

        month = date.dt.month
        if not month.empty and not month.all() == 0:
            month = torch.tensor(month / 12, dtype=torch.float32)
            transformed_dataset[column + "_month"] = month

        day = date.dt.day
        if not day.empty and not day.all() == 0:
            day = torch.tensor(day / 31, dtype=torch.float32)
            transformed_dataset[column + "_day"] = day

        hour = date.dt.hour
        if not hour.empty and not hour.all() == 0:
            hour = torch.tensor(hour / 24, dtype=torch.float32)
            transformed_dataset[column + "_hour"] = hour

        minute = date.dt.minute
        if not minute.empty and not minute.all() == 0:
            minute = torch.tensor(minute / 60, dtype=torch.float32)
            transformed_dataset[column + "_minute"] = minute

        second = date.dt.second
        if not second.empty and not second.all() == 0:
            second = torch.tensor(second / 60, dtype=torch.float32)
            transformed_dataset[column + "_second"] = second
        

        # Drop the original date column
        transformed_dataset = transformed_dataset.drop(columns=[column])

    return transformed_dataset

In [161]:
# Load the csv dataset from the csv file
dataset_path = 'datasets/fake_invoice_1000.csv'
# Creating a DataFrame from the CSV data (replace this with your actual CSV file path)
df = pd.read_csv(dataset_path)

# Process the dataset before creating the SupplierDataset
text_columns, date_columns, categorical_columns, numerical_columns = separate_columns_types(df)
print(f"Text columns: {text_columns.shape}\n"),                 #print(text_columns.head())
print(f"Date columns: {date_columns.shape}\n"),                 #print(date_columns.head())
print(f"Categorical columns: {categorical_columns.shape}\n"),   #print(categorical_columns.head())
print(f"Numerical columns: {numerical_columns.shape}\n"),       #print(numerical_columns.head())

Text columns: (1000, 16)

Date columns: (1000, 6)

Categorical columns: (1000, 4)

Numerical columns: (1000, 8)



(None,)

In [162]:
# Apply transformations to the categorical columns
categorical_columns_treated, classes = transform_categorical(categorical_columns)
#print(f"Categorical columns after transformation: {categorical_columns_treated.shape}\n"),   print(categorical_columns_treated.head(), "\n\n")
numerical_columns_treated = transform_numerical(numerical_columns)
#print(f"Numerical columns after transformation: {numerical_columns_treated.shape}\n"),       print(numerical_columns_treated.head(), "\n\n")
text_columns_treated = transform_text(text_columns)
#print(f"Text columns after transformation: {text_columns_treated.shape}\n"),                 print(text_columns_treated.head(), "\n\n")
date_columns_treated = transform_date(date_columns)
#print(f"Date columns after transformation: {date_columns_treated.shape}\n"),                 print(date_columns_treated.head(), "\n\n")

# Concatenate the transformed columns
#text_columns_treated
df_treated = pd.concat([ numerical_columns_treated, categorical_columns_treated,date_columns_treated], axis=1)
print(f"Final dataset: {df_treated.shape}\n"), print(df_treated.head(), "\n\n")

# Check if there is nan values or empty strings
print("Is there null values => ", df_treated.isnull().values.any())
print("Is there nan values => ", df_treated.isna().values.any())
print("Is there empty strings => ", df_treated.isin(['']).values.any())

# Create an instance of the SupplierDataset
supplier_dataset = SupplierDataset(dataframe=df_treated)
print(f"Supplier dataset: {len(supplier_dataset)}\n shape: {supplier_dataset[0].shape}\n type: {type(supplier_dataset[0])}\n")

Final dataset: (1000, 30)

   invoice_number  subtotal  tax_rate  tax_amount  discount_rate  \
0        0.248694  0.250331      0.15    0.038610           0.86   
1        0.771873  0.368514      0.05    0.018907           0.30   
2        0.330406  0.185498      0.20    0.038234           0.38   
3        0.507838  0.653593      0.60    0.401614           0.90   
4        0.311284  0.588111      0.85    0.512093           0.76   

   discount_amount     total  exchange_rate payment_method        status  ...  \
0         0.226812  0.133221       0.977807      [0, 1, 0]  [0, 0, 0, 1]  ...   
1         0.116232  0.281886       0.115933      [1, 0, 0]  [0, 0, 1, 0]  ...   
2         0.074432  0.140233       0.882845      [1, 0, 0]  [0, 1, 0, 0]  ...   
3         0.617249  0.389216       0.711891      [1, 0, 0]  [1, 0, 0, 0]  ...   
4         0.469142  0.413138       0.172187      [0, 0, 1]  [0, 0, 1, 0]  ...   

  created_at_day updated_at_year  updated_at_month  updated_at_day  \
0      

# Split the dataset into train and test sets

In [163]:
train_percentage = 0.2
batch_size = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split the dataset into training and validation sets
train_size = int(0.8 * len(supplier_dataset))
val_size = len(supplier_dataset) - train_size
train_dataset, val_dataset = random_split(supplier_dataset, [train_size, val_size])

# Create DataLoader instances for training and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Print some statistics about the dataset
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Batch size: {batch_size}")

# Print 5 samples from the training dataset
print("\nTraining samples:" + "\n")
e = iter(train_loader)
element = next(e)
print(f"shape of element: {len(element)} and type: {type(element)}")
print(f"first element :\n"+ str(element) + "\n end of first element")

Training samples: 800
Validation samples: 200
Batch size: 50

Training samples:

shape of element: 50 and type: <class 'torch.Tensor'>
first element :
tensor([[0.9350, 0.2189, 0.8500,  ..., 0.6740, 0.1667, 0.2903],
        [0.7331, 0.8542, 0.2500,  ..., 0.6740, 0.5000, 0.4194],
        [0.6526, 0.5327, 0.7000,  ..., 0.6740, 0.9167, 0.4516],
        ...,
        [0.8568, 0.1799, 0.0500,  ..., 0.6740, 0.4167, 0.7742],
        [0.8262, 0.1671, 0.2000,  ..., 0.6740, 0.3333, 0.9677],
        [0.5321, 0.5473, 1.0000,  ..., 0.6740, 0.9167, 0.5806]])
 end of first element


# Classes and functions needed

In [164]:
class Encoder(nn.Module):
    def __init__(self, input_size, latent_size):
        super(Encoder, self).__init__()
        
        # Define dynamic layers for the encoder
        # You can experiment with different architectures
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        
        # Parameters for the mean and log-variance of the latent space
        self.fc_mean = nn.Linear(128, latent_size)
        self.fc_logvar = nn.Linear(128, latent_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        
        # Calculate mean and log-variance for the latent space
        mean = self.fc_mean(x)
        logvar = self.fc_logvar(x)
        
        return mean, logvar

In [165]:
class Decoder(nn.Module):
    def __init__(self, latent_size, output_size):
        super(Decoder, self).__init__()
        
        # Define dynamic layers for the decoder
        self.fc1 = nn.Linear(latent_size, 128)
        self.fc2 = nn.Linear(128, 256)
        
        # Output layer for reconstruction
        self.fc_out = nn.Linear(256, output_size)
        
    def forward(self, z):
        z = self.fc1(z)
        z = F.relu(z)
        z = self.fc2(z)
        z = F.relu(z)
        
        # Output layer for reconstruction
        x_recon = torch.sigmoid(self.fc_out(z))  # Assuming the data is normalized to [0, 1]
        
        return x_recon

In [166]:
class VAE(nn.Module):
    def __init__(self, input_size, latent_size):
        super(VAE, self).__init__()
        self.input_size = input_size
        self.latent_size = latent_size
        
        # Create instances of the Encoder and Decoder
        self.encoder = Encoder(input_size, latent_size)
        self.decoder = Decoder(latent_size, input_size)
        
    def reparameterize(self, mean, logvar):
        # Reparameterization trick for sampling from a normal distribution
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std
    
    def forward(self, x):
        # Forward pass through the encoder
        mean, logvar = self.encoder.forward(x)
        print(f"forward mean : {mean} and logvar : {logvar}")
        
        # Sample from the latent space using the reparameterization trick
        z = self.reparameterize(mean, logvar)
        
        # Forward pass through the decoder
        x_recon = self.decoder.forward(z)
        
        return x_recon, mean, logvar

In [167]:
def loss_function(recon_x, x, mu, logvar, beta=1.0):

    # Reconstruction Loss (e.g., Mean Squared Error or Binary Cross Entropy)
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')  # Replace with appropriate loss function

    # KL Divergence Loss
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    # Combine the Reconstruction Loss and KL Divergence Loss with the weighting factor (beta)
    total_loss = reconstruction_loss + beta * kl_divergence
        
    return total_loss, reconstruction_loss, kl_divergence

In [168]:
def checker(recon_data, mu, logvar):
    # Recon data
    if(recon_data is None):
        print("recon_data is None")
        return False
    if(recon_data.shape[0] != batch_size):
        print("recon_data shape is not batch_size")
        return False
       # check for nan values
    if(torch.isnan(recon_data).any()):
        print("recon_data has nan values")
        return False
    # check for inf values
    if(torch.isinf(recon_data).any()):
        print("recon_data has inf values")
        return False
    # check for negative values
    if((recon_data < 0).any()):
        print("recon_data has negative values")
        return False
    # check for values greater than 1
    if((recon_data > 1).any()):
        print("recon_data has values greater than 1")
        return False
    # check for null values
    if(recon_data.isnull().values.any()):
        print("recon_data has null values")
        return False
    
    # mu
    if(mu is None):
        print("mu is None")
        return False
    if(mu.shape[0] != batch_size):
        print("mu shape is not batch_size")
        return False
    # check for nan values
    if(torch.isnan(mu).any()):
        print("mu has nan values")
        return False
    # check for inf values
    if(torch.isinf(mu).any()):
        print("mu has inf values")
        return False
    # check for null values
    if(mu.isnull().values.any()):
        print("mu has null values")
        return False
    
    # logvar
    if(logvar is None):
        print("logvar is None")
        return False
    if(logvar.shape[0] != batch_size):
        print("logvar shape is not batch_size")
        return False
    # check for nan values
    if(torch.isnan(logvar).any()):
        print("logvar has nan values")
        return False
    # check for inf values
    if(torch.isinf(logvar).any()):
        print("logvar has inf values")
        return False
    # check for null values
    if(logvar.isnull().values.any()):
        print("logvar has null values")
        return False
    
    return True   
      

In [169]:
def train(vae, train_loader, num_epochs=10, learning_rate=1e-3, beta=1.0, device='cuda', with_checker=False):
    # Move the model to the specified device (cuda or cpu)
    vae.to(device)
    
    # Define the optimizer
    optimizer = optim.Adam(vae.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0.0
        recon_loss = 0.0
        kl_loss = 0.0
        
        for batch_idx, data in enumerate(train_loader):
            # Move the batch to the specified device
            data = data.to(device)
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass through the VAE
            recon_data, mu, logvar = vae.forward(data)
            
            # Check the model return values
            if(with_checker):
                if(not checker(recon_data, mu, logvar)):
                    return
            
            # Calculate the VAE loss
            loss, recon_loss_batch, kl_loss_batch = loss_function(recon_data, data, mu, logvar, beta=beta)
            
            # Backward pass and optimization step
            loss.backward()
            optimizer.step()
            
            # Update the running total of losses
            total_loss += loss.item()
            recon_loss += recon_loss_batch.item()
            kl_loss += kl_loss_batch.item()
            
            # Print logs every N batches (you can adjust this value)
            log_interval = 100
            if batch_idx % log_interval == 0 and batch_idx > 0:
                avg_loss = total_loss / log_interval
                avg_recon_loss = recon_loss / log_interval
                avg_kl_loss = kl_loss / log_interval
                
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], '
                      f'Avg. Loss: {avg_loss:.4f}, Avg. Recon Loss: {avg_recon_loss:.4f}, Avg. KL Loss: {avg_kl_loss:.4f}')
                
                total_loss = 0.0
                recon_loss = 0.0
                kl_loss = 0.0
                
        # Print epoch-level logs
        print(f'Epoch [{epoch+1}/{num_epochs}], Avg. Total Loss: {total_loss:.4f}')
        
    print('Training complete.')

# Create model and train it

In [170]:
# create model and optimizer
# Get the input dimension from the training dataset
input_dim =  len(train_dataset[1])
print(f"input_dim : {input_dim}")
# Get the latent dimension from the input dimension
latent_dim = input_dim // 2
model = VAE(input_dim, latent_dim).to(device)

input_dim : 139


In [171]:
print(f"input_dim: {input_dim} and latent_dim: {latent_dim}")
print(f"model: {model}")

input_dim: 139 and latent_dim: 69
model: VAE(
  (encoder): Encoder(
    (fc1): Linear(in_features=139, out_features=256, bias=True)
    (fc2): Linear(in_features=256, out_features=128, bias=True)
    (fc_mean): Linear(in_features=128, out_features=69, bias=True)
    (fc_logvar): Linear(in_features=128, out_features=69, bias=True)
  )
  (decoder): Decoder(
    (fc1): Linear(in_features=69, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=256, bias=True)
    (fc_out): Linear(in_features=256, out_features=139, bias=True)
  )
)


In [172]:
# train model
train(model, train_loader, num_epochs=1, device=device, with_checker=False)

forward mean : tensor([[ 0.0193,  0.0156, -0.0095,  ...,  0.0531, -0.0202,  0.0627],
        [ 0.0225,  0.0098, -0.0196,  ...,  0.0667, -0.0080,  0.0576],
        [ 0.0133,  0.0223, -0.0038,  ...,  0.0908, -0.0010,  0.0624],
        ...,
        [ 0.0257,  0.0089, -0.0065,  ...,  0.0471,  0.0028,  0.0639],
        [-0.0072,  0.0446, -0.0192,  ...,  0.0764, -0.0215,  0.0494],
        [ 0.0184,  0.0247, -0.0380,  ...,  0.0545,  0.0099,  0.0720]],
       grad_fn=<AddmmBackward0>) and logvar : tensor([[-0.0083,  0.0380,  0.0481,  ...,  0.0751, -0.0223,  0.0024],
        [ 0.0349,  0.0403,  0.0624,  ...,  0.1108,  0.0102,  0.0086],
        [ 0.0388,  0.0545,  0.0289,  ...,  0.0705,  0.0121, -0.0189],
        ...,
        [ 0.0298,  0.0333,  0.0354,  ...,  0.0969,  0.0169,  0.0165],
        [ 0.0099,  0.0324,  0.0314,  ...,  0.0870,  0.0182,  0.0032],
        [ 0.0402,  0.0545,  0.0195,  ...,  0.0903,  0.0179,  0.0121]],
       grad_fn=<AddmmBackward0>)
forward mean : tensor([[ 0.0140,  0.01

# Generate a new data to test

In [173]:
def add_headers(headers, generated_data):
    # Add the headers to the generated data
    return pd.DataFrame(generated_data, columns=headers)

In [174]:
# Generate a new sample
def generate_sample(model, headers):
    sample = torch.randn(1, model.latent_size)
    generated_data = model.decoder.forward(sample).detach().numpy()
    return generated_data #add_headers(headers, generated_data)

In [175]:
new_sample = generate_sample(model, df_treated.columns)

print(new_sample.shape)
print(df_treated.shape)

(1, 139)
(1000, 30)


In [176]:

# Print or use the generated supplier data as needed
print("Generated Supplier Data:")
for i in range(5):
    print(generate_sample(model, df_treated.columns))

# Save the model
timestamp = time.strftime("%Y%m%d-%H%M%S")
torch.save(model.state_dict(), f'model_{timestamp}.pt')

# Generate a new sample
new_sample = generate_sample(model, df_treated.columns)

Generated Supplier Data:
[[0.5342029  0.5519393  0.5497189  0.20515414 0.48245504 0.22607756
  0.30071917 0.543737   0.3065052  0.26441836 0.29737964 0.28223765
  0.22280303 0.20591065 0.23592944 0.16978751 0.17597856 0.14744556
  0.16255693 0.15990284 0.15269719 0.20595558 0.16248208 0.22454111
  0.16787572 0.15482032 0.19754596 0.17155553 0.1989674  0.1466004
  0.18269566 0.14064354 0.1833571  0.20478648 0.14259851 0.22759137
  0.25455472 0.18321115 0.207167   0.20940945 0.18008578 0.19872123
  0.1546141  0.11690183 0.14975587 0.16391425 0.11213035 0.21916316
  0.25276402 0.17762621 0.15544    0.17156601 0.17446774 0.21797039
  0.17553568 0.18889941 0.1929721  0.16285451 0.1537066  0.1361907
  0.13263907 0.1694097  0.19437777 0.18175863 0.15365958 0.1731075
  0.14597806 0.18833655 0.12817836 0.1696679  0.13386744 0.17309639
  0.1268606  0.15723407 0.21049902 0.14503185 0.1425296  0.25317353
  0.17632751 0.12248074 0.17057529 0.19498128 0.14029211 0.22199364
  0.1538278  0.1781798  0.

# PostProcess the data

In [177]:
def reverse_transform_categorical(categorical_colums, classes):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = categorical_colums.copy()

    # Apply a one-hot encoding to all columns of the tensor
    for column in categorical_colums.columns:

        # Convert the indices to a PyTorch tensor
        tensor_data = torch.tensor(categorical_colums[column])

        # Apply one-hot encoding using torch.nn.functional.one_hot
        one_hot_encoded = F.one_hot(tensor_data)

        # Convert the one-hot encoding tensor to a NumPy array
        one_hot_array = one_hot_encoded.numpy()

        # Convert the one-hot encoding array to a list of indices
        indices = []
        for row in one_hot_array:
            indices.append(np.argmax(row))

        # Convert the indices to a list of strings
        strings = []
        for index in indices:
            strings.append(classes[column][index])

        # Add the one-hot encoded array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = strings

    return transformed_dataset

In [178]:
def reverse_transform_numerical(numerical_columns):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = numerical_columns.copy()

    # Apply a min-max normalization to all columns of the tensor
    for column in numerical_columns.columns:
        # Convert the column to a PyTorch tensor
        tensor_data = torch.tensor(numerical_columns[column], dtype=torch.float32)

        # Range of values in the column
        min_value = torch.min(tensor_data)
        max_value = torch.max(tensor_data)

        # Normalize the values in the column between 0 and 1en
        normalized = (tensor_data - min_value) / (max_value - min_value)

        # Apply min-max normalization using torch.nn.functional.normalize
        #normalized = F.normalize(tensor_data)

        # Convert the normalized tensor to a NumPy array
        normalized_array = normalized.numpy()

        # Add the normalized array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = list(normalized_array)

    return transformed_dataset

In [179]:
def reverse_transform_text(text_columns):
    # Prepare embeddings model
    embedding_dim = 100
    glove = GloVe(name='6B', dim=embedding_dim)
    tokenizer = get_tokenizer('basic_english')

    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = text_columns.copy()

    # Apply a word embedding encoding to all columns of the tensor using torchtext
    for column in text_columns.columns:
        # Convert the column to a string list
        texts = text_columns[column].astype(str).tolist()

        # Convert the text to a list of tokens
        tokens = [tokenizer(t) for t in texts]

        # Convert the tokens to a list of indices
        encoded_data = []
        for token in tokens:
            token_encoded = []
            for word in token:
                if word in glove.stoi:
                    token_encoded.append(glove.stoi[word])
                else:
                    token_encoded.append(0)
            encoded_data.append(token_encoded)

        # Convert the indices to a PyTorch tensor
        if len(encoded_data) <= 0:
            continue

        non_empty_sequences = [torch.tensor(seq) for seq in encoded_data if len(seq) > 0]

        # Pad the sequences to the same length
        padded_sequences = pad_sequence(non_empty_sequences)

        # Add the encoded array to the transformed dataset
        transformed_dataset = transformed_dataset.drop(columns=[column])
        transformed_dataset[column] = padded_sequences.tolist() # TODO: fix size mismatch error
        
    return transformed_dataset

In [180]:
def reverse_transform_date(date_columns):
    # Create a copy of the dataset to avoid modifying the original
    transformed_dataset = date_columns.copy()

    for column in date_columns.columns:
        # Decompose the date into its components
        date = pd.to_datetime(date_columns[column])

        year = date.dt.year
        # Check if the inital column contain a year
        if not year.empty and not year.all() == 0:
            year = torch.tensor(year, dtype=torch.float32)
            transformed_dataset[column + "_year"] = year

        month = date.dt.month
        if not month.empty and not month.all() == 0:
            month = torch.tensor(month / 12, dtype=torch.float32)
            transformed_dataset[column + "_month"] = month

        day = date.dt.day
        if not day.empty and not day.all() == 0:
            day = torch.tensor(day / 31, dtype=torch.float32)
            transformed_dataset[column + "_day"] = day

        hour = date.dt.hour
        if not hour.empty and not hour.all() == 0:
            hour = torch.tensor(hour / 24, dtype=torch.float32)
            transformed_dataset[column + "_hour"] = hour

        minute = date.dt.minute
        if not minute.empty and not minute.all() == 0:
            minute = torch.tensor(minute / 60, dtype=torch.float32)
            transformed_dataset[column + "_minute"] = minute

        second = date.dt.second
        if not second.empty and not second.all() == 0:
            second = torch.tensor(second / 60, dtype=torch.float32)
            transformed_dataset[column + "_second"] = second
        

        # Drop the original date column
        transformed_dataset = transformed_dataset.drop(columns=[column])

    return transformed_dataset

In [181]:
def split_by_type(sample):
    # Separate the text columns from the other columns
    text_columns = ["invoice_code", "customer_name","customer_email","customer_address","customer_city","customer_state","customer_postal_code",
                    "customer_country","notes","created_by","updated_by","shipping_address","shipping_city","shipping_state",
                    "shipping_postal_code","shipping_country"]
    
    date_columns = {}
    # # Create sublist for all columns with the same prefix
    # for column in sample.columns:
    #     if not date_columns.__contains__(column.split("_")[0]):
    #         date_columns[column.split("_")[0]] = []
    #     date_columns[column.split("_")[0]].append(column)

    categorical_columns = ["payment_method","status","currency","payment_reference"]

    numerical_columns = ["invoice_number","subtotal","tax_rate","tax_amount","discount_rate","discount_amount","total","exchange_rate"]  

    # Check if there is a column not in one of the above lists
    # for column in sample.columns:
    #     if column not in text_columns and column not in date_columns and column not in categorical_columns and column not in numerical_columns:
    #         print("Column not in any list: " + column) 
            
    return sample[text_columns], sample[date_columns], sample[categorical_columns], sample[numerical_columns]

In [182]:
categorical_colums_result, numerical_columns_result, text_columns_result, date_columns_result = split_by_type(new_sample, categorical_columns_treated.columns, numerical_columns_treated.columns, text_columns_treated.columns, date_columns_treated.columns)


# Reverse the transformations applied to the categorical columns
categorical_columns_reversed = reverse_transform_categorical(categorical_colums_result, classes)

# Reverse the transformations applied to the numerical columns
numerical_columns_reversed = reverse_transform_numerical(numerical_columns_result)

# Reverse the transformations applied to the text columns
#text_columns_reversed = reverse_transform_text(text_columns_result)

# Reverse the transformations applied to the date columns
date_columns_reversed = reverse_transform_date(date_columns_result)

# Concatenate the reversed columns back into a single dataframe
#text_columns_reversed
df_reversed = pd.concat([categorical_columns_reversed, numerical_columns_reversed, date_columns_reversed], axis=1)

# Print the reversed dataframe
print(df_reversed)


TypeError: split_by_type() takes 1 positional argument but 5 were given