In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from aux_functions import split_data
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim
import torch.nn as nn
import torch
import tqdm as tqdm
from sklearn.decomposition import PCA

In [None]:
# Load the preprocessed BPI Challenge 2012 event log (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/preprocessed/BPI_Challenge_2012.csv')

In [None]:
# Integer-encode the activity names of the current and the next event.
encoder = LabelEncoder()

# Fit ONCE on the union of both columns. Re-fitting the encoder separately on
# each column builds two independent vocabularies, so the same activity could
# receive a different integer in 'int:concept:name' and in
# 'int:next_concept:name', and `encoder.classes_` would only describe the
# column fitted last. A single shared fit keeps the codes consistent and lets
# `encoder.inverse_transform(codes - 1)` decode either column.
encoder.fit(pd.concat([df['concept:name'], df['next_concept:name']], ignore_index=True))

# Shift the labels to start from 1 instead of 0: the padding cells further
# down fill with zeros, so 0 must not collide with a real activity code.
encoded_concept_names = encoder.transform(df['concept:name']) + 1
encoded_next_concept_names = encoder.transform(df['next_concept:name']) + 1

# Assign the encoded values back to the DataFrame
df['int:concept:name'] = encoded_concept_names
df['int:next_concept:name'] = encoded_next_concept_names

In [None]:
# Preview the first rows, now including the two integer-encoded columns.
df.head(n=5)

In [None]:
# Restrict the frame to the columns used downstream: the encoded current
# activity, the case identifier, the 'position' column and the encoded next activity.
columns_to_keep = [
    'int:concept:name',
    'case:concept:name',
    'position',
    'int:next_concept:name',
]
df = df.loc[:, columns_to_keep]
df.head()

In [None]:
# Split the event log into a train and a test frame using the project helper.
# NOTE(review): 0.2 is presumably the test fraction, and the split is presumably
# done per case rather than per row — confirm against aux_functions.split_data.
train, test = split_data(df, 0.2)

In [None]:
# Preview the first rows of the training split.
train.head(n=5)

In [None]:
# Generate every (prefix, suffix) pair of each case in the train set.
# For a case with n events and every cut point i in 1..n-1:
#   prefix = the first i rows of the case (all columns),
#   suffix = the 'int:concept:name' values of the remaining n - i rows.
#
# NOTE(review): the prefix rows keep every column, including
# 'case:concept:name' (an identifier) and 'int:next_concept:name'. The latter
# presumably holds the activity of the following event, in which case the last
# prefix row already contains the first suffix element (label leakage) —
# confirm and consider dropping these columns from the features.
prefixes = []
suffixes = []
# groupby(sort=False) yields the cases in order of first appearance (the same
# order as iterating over .unique()) but partitions the frame in one pass
# instead of re-scanning it with a boolean mask for every case.
for case, case_df in train.groupby('case:concept:name', sort=False):
    # Materialise the per-case arrays once rather than on every cut point.
    case_rows = case_df.values
    case_activities = case_df['int:concept:name'].values
    for i in range(1, len(case_df)):
        prefixes.append(case_rows[:i])
        suffixes.append(list(case_activities[i:]))

In [None]:
# Generate every (prefix, suffix) pair of each case in the test set.
# For a case with n events and every cut point i in 1..n-1:
#   prefix = the first i rows of the case (all columns),
#   suffix = the 'int:concept:name' values of the remaining n - i rows.
#
# NOTE(review): the prefix rows keep every column, including
# 'case:concept:name' (an identifier) and 'int:next_concept:name'. The latter
# presumably holds the activity of the following event, in which case the last
# prefix row already contains the first suffix element (label leakage) —
# confirm and consider dropping these columns from the features.
prefixes_test = []
suffixes_test = []
# groupby(sort=False) yields the cases in order of first appearance (the same
# order as iterating over .unique()) but partitions the frame in one pass
# instead of re-scanning it with a boolean mask for every case.
for case, case_df in test.groupby('case:concept:name', sort=False):
    # Materialise the per-case arrays once rather than on every cut point.
    case_rows = case_df.values
    case_activities = case_df['int:concept:name'].values
    for i in range(1, len(case_df)):
        prefixes_test.append(case_rows[:i])
        suffixes_test.append(list(case_activities[i:]))

In [None]:
# Pad the TRAIN prefixes and suffixes so that every sequence has the same
# length: the event count of the longest case in the whole log.
# Prefixes are padded at the front (zero rows before the sequence), suffixes
# at the back (zeros after the sequence).
max_length = df['case:concept:name'].value_counts().max()
padding_dim = len(prefixes[0][0])
for idx, (prefix_rows, suffix_labels) in enumerate(zip(prefixes, suffixes)):
    front_pad = np.zeros((max_length - len(prefix_rows), padding_dim))
    back_pad = [0] * (max_length - len(suffix_labels))
    # Overwrite the list entries in place with their padded versions.
    prefixes[idx] = np.concatenate((front_pad, prefix_rows), axis=0)
    suffixes[idx] = np.concatenate((suffix_labels, back_pad), axis=0)

In [None]:
# Pad the TEST prefixes and suffixes so that every sequence has the same
# length: the event count of the longest case in the whole log.
# Prefixes are padded at the front (zero rows before the sequence), suffixes
# at the back (zeros after the sequence).
max_length = df['case:concept:name'].value_counts().max()
padding_dim = len(prefixes_test[0][0])
for idx in range(len(prefixes_test)):
    prefix_rows = prefixes_test[idx]
    suffix_labels = suffixes_test[idx]
    missing_prefix_rows = max_length - len(prefix_rows)
    missing_suffix_items = max_length - len(suffix_labels)
    prefixes_test[idx] = np.concatenate(
        (np.zeros((missing_prefix_rows, padding_dim)), prefix_rows), axis=0
    )
    suffixes_test[idx] = np.concatenate(
        (suffix_labels, [0] * missing_suffix_items), axis=0
    )

In [None]:
class LSDM(nn.Module):
    """Single-layer LSTM followed by a linear read-out of its final hidden state.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Size of the vector produced for each input sequence.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # batch_first=True: the LSTM consumes (batch, seq, feature) input.
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return one `output_dim`-sized vector per input sequence."""
        _, (final_hidden, _) = self.lstm(x)
        # The LSTM has a single layer and direction, so the leading axis of the
        # final hidden state has size 1; index it away before the linear layer.
        return self.fc(final_hidden[0])

In [None]:
# Model dimensions, derived from the first padded training example.
input_dim = np.shape(prefixes[0])[1]   # number of features per event row
hidden_dim = 64                        # size of the LSTM hidden state
output_dim = np.shape(suffixes[0])[0]  # length of a padded suffix

In [None]:
# Initialize the model
# Seed PyTorch's global RNG immediately before construction so the randomly
# initialised LSTM / linear weights are identical after every
# Restart & Run All. The training loop in this notebook is full-batch with no
# shuffling or dropout, so without a seed the weight initialisation makes the
# reported losses differ between runs.
SEED = 42
torch.manual_seed(SEED)
model = LSDM(input_dim, hidden_dim, output_dim)

In [None]:
# Stack the list of padded prefixes into a single float32 ndarray.
prefixes_array = np.asarray(prefixes, dtype=np.float32)

In [None]:
# Convert data to PyTorch tensors.
# Go through a single contiguous ndarray: calling torch.tensor() on a Python
# list of ndarrays converts element by element, which PyTorch itself warns is
# extremely slow. `prefixes_array` (built in the previous cell) is already the
# stacked float32 array, so it is reused instead of converting `prefixes` again.
# torch.from_numpy shares memory with the source array (no extra copy), so the
# arrays must not be modified in place afterwards.
prefixes_tensor = torch.from_numpy(prefixes_array)
suffixes_tensor = torch.from_numpy(np.array(suffixes, dtype=np.float32))

In [None]:
# Loss function and optimizer.
# NOTE(review): the targets are integer-encoded activity labels, so mean
# squared error treats the codes as ordered magnitudes — confirm a regression
# loss is intended rather than a per-step classification loss.
criterion = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train the model with full-batch gradient descent: every epoch performs one
# forward/backward pass over the complete training tensor.
num_epochs = 10
model.train()  # training mode never changes inside the loop, so set it once
for epoch in range(num_epochs):
    # Clear the gradients accumulated by the previous step.
    optimizer.zero_grad()

    outputs = model(prefixes_tensor)
    loss = criterion(outputs, suffixes_tensor)

    loss.backward()
    optimizer.step()

    # Report the loss of this epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')