In [None]:
# Import necessary libraries
import os
import time
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from jiwer import wer, cer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pickle
import re

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value so runs are repeatable.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
_device_name = 'cpu'
if torch.cuda.is_available():
    _device_name = 'cuda'
DEVICE = torch.device(_device_name)
print(f'Using device: {DEVICE}')

In [None]:
# Create output directory
# exist_ok=True makes the cell safe to re-run and removes the gap between the
# existence check and the creation that the previous check-then-create form had.
OUTPUT_DIR = './output_1/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# File locations: the input spreadsheet plus every artifact written under OUTPUT_DIR
def _artifact_path(file_name):
    """Return the path of `file_name` inside OUTPUT_DIR."""
    return os.path.join(OUTPUT_DIR, file_name)


DATASET_PATH = './exportStatements.xlsx'

# Cached vocabulary and encoded splits
VOCAB_PATH = _artifact_path('word_vocab.pkl')
PREPROCESSED_DATA_PATH = _artifact_path('preprocessed_data_word.pkl')

# Checkpoints: best validation loss and best validation CER
BEST_MODEL_PATH = _artifact_path('best_lstm_model_word.pt')
BEST_CER_MODEL_PATH = _artifact_path('best_lstm_model_cer.pt')

# Figures
# NOTE(review): the plot files carry a `_char` suffix while the vocabulary and
# cache files are named `word` - confirm the suffix is intended.
LOSS_PLOT_PATH = _artifact_path('lstm_loss_plot_char.png')
WER_PLOT_PATH = _artifact_path('wer_plot_char.png')
CER_PLOT_PATH = _artifact_path('cer_plot_char.png')

In [None]:
# Read the spreadsheet at DATASET_PATH into a DataFrame; later cells use its
# 'inFormalForm' and 'FormalForm' columns.
df = pd.read_excel(io=DATASET_PATH)

In [None]:
# Report how many values are missing in each text column
_text_columns = ['inFormalForm', 'FormalForm']
_missing_per_column = df[_text_columns].isnull().sum()
print("Missing values in 'inFormalForm':", _missing_per_column['inFormalForm'])
print("Missing values in 'FormalForm':", _missing_per_column['FormalForm'])

# Keep only rows where both text columns are present, renumbering the index
initial_length = len(df)
df = df.dropna(subset=_text_columns).reset_index(drop=True)
final_length = len(df)

# Force both columns to str so the tokenizer always receives text
for _column in _text_columns:
    df[_column] = df[_column].astype(str)

print(f"Dropped {initial_length - final_length} rows due to missing values.")

In [None]:
# Split data into training, validation, and test sets (80%, 10%, 10%)
# Step 1 holds out 20% of the rows; step 2 cuts that hold-out in half to get the
# validation and test sets. Both calls pass random_state=SEED, so the same rows
# land in the same split on every run.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

In [None]:
# Build word vocabulary from training data

# Simple tokenizer function.
# Defined unconditionally: later cells call tokenize() even when the vocabulary
# is loaded from disk, so it must not live inside the "build" branch only.
def tokenize(text):
    """Split `text` into tokens.

    A token is either a run of word characters (`\\w+`) or a run of characters
    that are neither whitespace nor word characters (e.g. punctuation).
    Whitespace is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Tokens in the order they appear in `text`.
    """
    return re.findall(r'\w+|[^\s\w]+', text)


if not os.path.exists(VOCAB_PATH):
    print('Building word vocabulary...')
    from collections import Counter

    # Collect all words from the training data (both source and target columns)
    all_words = []
    for text in train_df['inFormalForm'].tolist() + train_df['FormalForm'].tolist():
        all_words.extend(tokenize(text))

    # Build vocabulary; sorting makes the word -> id assignment deterministic
    word_counts = Counter(all_words)
    words = sorted(word_counts.keys())

    # Special tokens take ids 0..3; regular words are numbered after them
    special_tokens = ['<pad>', '<unk>', '<s>', '</s>']
    word2idx = {word: idx + len(special_tokens) for idx, word in enumerate(words)}
    for idx, token in enumerate(special_tokens):
        word2idx[token] = idx
    idx2word = {idx: word for word, idx in word2idx.items()}

    # Save vocabulary
    with open(VOCAB_PATH, 'wb') as f:
        pickle.dump({'word2idx': word2idx, 'idx2word': idx2word}, f)
    print('Word vocabulary built and saved.')
else:
    print('Loading existing word vocabulary...')
    # NOTE: pickle.load can execute arbitrary code; only load vocabulary files
    # that were produced by this notebook.
    with open(VOCAB_PATH, 'rb') as f:
        vocab = pickle.load(f)
        word2idx = vocab['word2idx']
        idx2word = vocab['idx2word']

In [None]:
# Look up the ids of the four special tokens in the vocabulary
PAD_IDX, UNK_IDX, BOS_IDX, EOS_IDX = (
    word2idx[token] for token in ('<pad>', '<unk>', '<s>', '</s>')
)

# Show the ids as the cell output
(PAD_IDX, UNK_IDX, BOS_IDX, EOS_IDX)

In [None]:
# Longest encoded sentence across the given splits
def get_max_len(df_list):
    """Return the largest encoded sequence length found in `df_list`.

    The length of a sentence is its token count (via `tokenize`) plus 2 for the
    BOS and EOS markers. Both the 'inFormalForm' and 'FormalForm' columns of
    every frame are inspected.

    Args:
        df_list: Iterable of DataFrames with 'inFormalForm' and 'FormalForm'
            string columns.

    Returns:
        The maximum length seen, or 0 when `df_list` is empty.
    """
    def _encoded_len(text):
        return len(tokenize(text)) + 2  # +2 for BOS and EOS

    longest = 0
    for frame in df_list:
        src_longest = frame['inFormalForm'].apply(_encoded_len).max()
        trg_longest = frame['FormalForm'].apply(_encoded_len).max()
        longest = max(longest, src_longest, trg_longest)
    return longest

# Every split is included so that no sentence is truncated when it is padded to MAX_LEN
MAX_LEN = get_max_len(df_list=[train_df, val_df, test_df])
MAX_LEN

In [None]:
# Reuse the cached id sequences when they exist; otherwise encode every split
# and write the cache.
if os.path.exists(PREPROCESSED_DATA_PATH):
    print('Loading preprocessed data...')
    # The cache also stores the MAX_LEN it was built with; that value replaces
    # the one computed above.
    with open(PREPROCESSED_DATA_PATH, 'rb') as f:
        data = pickle.load(f)
    train_src, train_trg = data['train_src'], data['train_trg']
    val_src, val_trg = data['val_src'], data['val_trg']
    test_src, test_trg = data['test_src'], data['test_trg']
    MAX_LEN = data['MAX_LEN']
else:
    print('Preprocessing data...')

    def preprocess_data(df, word2idx, max_len=MAX_LEN):
        """Encode both text columns of `df` as fixed-length id sequences.

        Each sentence becomes BOS + word ids + EOS (unknown words map to
        UNK_IDX), is cut to `max_len` ids and then right-padded with PAD_IDX
        up to `max_len`.

        Args:
            df: DataFrame with 'inFormalForm' and 'FormalForm' columns.
            word2idx: Mapping from token to id.
            max_len: Length of every returned sequence.

        Returns:
            (src_sequences, trg_sequences): two lists of id lists, in row order.
        """
        def _encode(text):
            ids = [BOS_IDX]
            ids.extend(word2idx.get(token, UNK_IDX) for token in tokenize(text))
            ids.append(EOS_IDX)
            ids = ids[:max_len]
            return ids + [PAD_IDX] * max(0, max_len - len(ids))

        src_sequences = [_encode(text) for text in df['inFormalForm'].tolist()]
        trg_sequences = [_encode(text) for text in df['FormalForm'].tolist()]
        return src_sequences, trg_sequences

    # Encode the three splits
    train_src, train_trg = preprocess_data(train_df, word2idx)
    val_src, val_trg = preprocess_data(val_df, word2idx)
    test_src, test_trg = preprocess_data(test_df, word2idx)

    # Write the cache, including the MAX_LEN the sequences were padded to
    _cache_payload = {
        'train_src': train_src,
        'train_trg': train_trg,
        'val_src': val_src,
        'val_trg': val_trg,
        'test_src': test_src,
        'test_trg': test_trg,
        'MAX_LEN': MAX_LEN
    }
    with open(PREPROCESSED_DATA_PATH, 'wb') as f:
        pickle.dump(_cache_payload, f)
    print('Preprocessed data saved.')

In [None]:
# Dataset wrapper around the encoded sentence pairs
class TranslationDataset(Dataset):
    """Pairs of encoded source/target sequences exposed as long tensors."""

    def __init__(self, src_sequences, trg_sequences):
        """Store the two parallel lists of id sequences (no copy is made)."""
        self.src_sequences = src_sequences
        self.trg_sequences = trg_sequences

    def __len__(self):
        """Number of sentence pairs, taken from the source list."""
        return len(self.src_sequences)

    def __getitem__(self, idx):
        """Return `(src_ids, trg_ids)` for pair `idx` as `torch.long` tensors."""
        return tuple(
            torch.tensor(sequences[idx], dtype=torch.long)
            for sequences in (self.src_sequences, self.trg_sequences)
        )

In [None]:
# Wrap each split in a Dataset and a DataLoader; only the training loader shuffles
batch_size = 32  # Adjust as needed

train_dataset, val_dataset, test_dataset = (
    TranslationDataset(src, trg)
    for src, trg in ((train_src, train_trg), (val_src, val_trg), (test_src, test_trg))
)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

In [None]:
# Define the LSTM-based Seq2Seq model (same as before)
class Seq2Seq(nn.Module):
    """LSTM encoder-decoder with one embedding table shared by source and target.

    The encoder's final (hidden, cell) state initialises the decoder; a linear
    layer maps decoder outputs to vocabulary logits.

    NOTE(review): the encoder is run over the full input including any padding
    positions (no packing), so its final state is taken after the padding.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, dropout, pad_idx):
        """Build the layers.

        Args:
            vocab_size: Number of entries in the shared vocabulary.
            emb_dim: Embedding size.
            hid_dim: LSTM hidden size (encoder and decoder).
            n_layers: Number of stacked LSTM layers (encoder and decoder).
            dropout: Dropout probability for embeddings and between LSTM layers.
            pad_idx: Id of the padding token (its embedding is kept at zero).
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.encoder = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.decoder = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Compute next-token logits for `trg` given `src`.

        The decoder always receives the ground-truth previous token;
        `teacher_forcing_ratio` is accepted but not used.

        Args:
            src: Long tensor of source ids, batch first.
            trg: Long tensor of target ids, batch first, starting with BOS.
            teacher_forcing_ratio: Ignored.

        Returns:
            Logits of shape [batch, trg_len - 1, vocab_size]; position t
            predicts `trg[:, t + 1]`.
        """
        # Embed source and target sequences
        embedded_src = self.dropout(self.embedding(src))
        embedded_trg = self.dropout(self.embedding(trg[:, :-1]))  # Remove last token for decoder input

        # Encode source sequence
        _, (hidden, cell) = self.encoder(embedded_src)

        # Decode target sequence
        outputs, _ = self.decoder(embedded_trg, (hidden, cell))
        predictions = self.fc_out(outputs)

        return predictions

    def predict(self, src, max_len=MAX_LEN):
        """Greedily decode a batch of source sequences.

        Decoding starts from BOS_IDX and stops once every sequence has emitted
        EOS_IDX or `max_len` steps have been taken. After a sequence has
        emitted EOS_IDX, its remaining positions are filled with the padding
        id instead of further predictions.

        This method does not switch the module to eval mode; call `eval()`
        first if dropout must be disabled.

        Args:
            src: Long tensor of source ids, batch first.
            max_len: Maximum number of decoding steps.

        Returns:
            Long tensor [batch, steps] of predicted ids (BOS not included),
            with steps <= max_len; [batch, 0] when `max_len` <= 0.
        """
        # Embed source sequence
        embedded_src = self.dropout(self.embedding(src))

        # Encode source sequence
        _, (hidden, cell) = self.encoder(embedded_src)

        batch_size = src.size(0)
        # Initialize target sequence with <s>
        inputs = torch.tensor([BOS_IDX] * batch_size, dtype=torch.long).unsqueeze(1).to(src.device)
        if max_len <= 0:
            # Nothing to decode; torch.cat on an empty list would raise
            return inputs[:, :0]

        outputs = []
        # Keep track of finished sequences
        finished = torch.zeros(batch_size, dtype=torch.bool).to(src.device)
        for _ in range(max_len):
            embedded = self.dropout(self.embedding(inputs))
            output, (hidden, cell) = self.decoder(embedded, (hidden, cell))
            prediction = self.fc_out(output.squeeze(1))
            top1 = prediction.argmax(dim=1)  # Shape: [batch_size]
            # Sequences that already produced </s> only emit padding from now
            # on, so callers never see tokens generated after the end marker.
            top1 = top1.masked_fill(finished, self.pad_idx)
            outputs.append(top1.unsqueeze(1))
            inputs = top1.unsqueeze(1)

            # Update finished sequences
            finished = finished | (top1 == EOS_IDX)
            if finished.all():
                break
        return torch.cat(outputs, dim=1)  # Shape: [batch_size, seq_len]

In [None]:
# Model hyper-parameters
VOCAB_SIZE = len(word2idx)  # one output class per vocabulary entry
EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

# Build the network and move it to the selected device
model = Seq2Seq(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
model = model.to(DEVICE)

In [None]:
# Loss: cross-entropy over the vocabulary; padded target positions are ignored
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# Optimizer: Adam with a small learning rate and light L2 weight decay
optimizer = optim.Adam(params=model.parameters(), lr=1e-4, weight_decay=1e-5)

In [None]:
def evaluate_wer(model, dataloader, idx2word, max_batches=None):
    """Compute the average CER and WER of greedy predictions over a dataloader.

    Each prediction and each reference is cut at its first EOS_IDX, stripped of
    PAD/EOS/UNK ids, turned into a space-joined sentence and scored with
    `cer` / `wer`. Cutting the prediction at EOS matters because `predict`
    keeps decoding until every sequence in the batch is finished, so anything
    after a sequence's own end marker is not part of its output.

    The model is switched to eval mode and left in that mode.

    Args:
        model: Object exposing `eval()` and `predict(src, max_len=...)`.
        dataloader: Iterable of `(src, trg)` long-tensor batches; `trg` starts
            with BOS.
        idx2word: Mapping from id to token; unknown ids map to ''.
        max_batches: If truthy, stop after this many batches.

    Returns:
        (avg_cer, avg_wer): sentence-level means (NaN if no sentence was scored).
    """
    def _to_sentence(ids):
        # Keep only what precedes the first end-of-sequence marker
        if EOS_IDX in ids:
            ids = ids[:ids.index(EOS_IDX)]
        # Remove PAD and special tokens
        kept = [idx for idx in ids if idx not in (PAD_IDX, EOS_IDX, UNK_IDX)]
        return ' '.join(idx2word.get(idx, '') for idx in kept)

    model.eval()
    cer_scores = []
    wer_scores = []
    batches_processed = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src = src.to(DEVICE)
            trg = trg.to(DEVICE)

            outputs = model.predict(src, max_len=MAX_LEN).cpu().tolist()
            references = trg.cpu().tolist()

            for pred_ids, trg_ids in zip(outputs, references):
                pred_sentence = _to_sentence(pred_ids)
                trg_sentence = _to_sentence(trg_ids[1:])  # Remove <s>

                cer_scores.append(cer(trg_sentence, pred_sentence))
                wer_scores.append(wer(trg_sentence, pred_sentence))

            batches_processed += 1
            if max_batches and batches_processed >= max_batches:
                break

    avg_cer = np.mean(cer_scores)
    avg_wer = np.mean(wer_scores)
    return avg_cer, avg_wer

In [None]:
# Training loop with WER calculation (same as before)
N_EPOCHS = 100
# NOTE(review): CLIP is currently unused because the clipping call inside the
# loop is commented out - confirm whether gradient clipping should be active.
CLIP = 1  # Enable gradient clipping
best_valid_loss = float('inf')
best_valid_cer = float('inf')
patience = 5  # epochs without validation-loss improvement before stopping
counter = 0   # consecutive epochs without validation-loss improvement

train_losses = []
valid_losses = []

train_wers = []
valid_wers = []
train_cers = []
valid_cers = []

# Shuffled loader used to estimate train WER/CER on a few batches per epoch.
# It is built once: the shuffle order is drawn each time the loader is iterated.
train_subset_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(1, N_EPOCHS + 1):
    start_time = time.time()

    # Training
    model.train()
    epoch_train_loss = 0
    for src, trg in tqdm(train_loader, desc=f'Training Epoch {epoch}/{N_EPOCHS}'):
        src = src.to(DEVICE)
        trg = trg.to(DEVICE)

        optimizer.zero_grad()
        output = model(src, trg)

        # output: [batch_size, trg_len - 1, vocab_size]
        # trg: [batch_size, trg_len]

        # Reshape for loss computation
        output = output.reshape(-1, VOCAB_SIZE)
        trg = trg[:, 1:].reshape(-1)  # Remove first token (<s>) for target

        loss = criterion(output, trg)
        loss.backward()

        # Gradient clipping
        # torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)

        optimizer.step()

        epoch_train_loss += loss.item()

    epoch_train_loss /= len(train_loader)
    train_losses.append(epoch_train_loss)

    # Validation (teacher-forced loss, same computation as in training)
    model.eval()
    epoch_valid_loss = 0
    with torch.no_grad():
        for src, trg in tqdm(val_loader, desc=f'Validation Epoch {epoch}/{N_EPOCHS}'):
            src = src.to(DEVICE)
            trg = trg.to(DEVICE)

            output = model(src, trg, teacher_forcing_ratio=0)

            output = output.reshape(-1, VOCAB_SIZE)
            trg = trg[:, 1:].reshape(-1)  # Remove first token (<s>) for target

            loss = criterion(output, trg)
            epoch_valid_loss += loss.item()

    epoch_valid_loss /= len(val_loader)
    valid_losses.append(epoch_valid_loss)

    # Greedy-decoding metrics: full validation set, 5 random training batches
    valid_cer, valid_wer = evaluate_wer(model, val_loader, idx2word)
    valid_wers.append(valid_wer)
    valid_cers.append(valid_cer)

    train_cer, train_wer = evaluate_wer(model, train_subset_loader, idx2word, max_batches=5)
    train_wers.append(train_wer)
    train_cers.append(train_cer)

    print(f'\tTrain Loss: {epoch_train_loss:.3f}')
    print(f'\tValid Loss: {epoch_valid_loss:.3f}')

    print(f'\tTrain WER: {train_wer:.4f}')
    print(f'\tValid WER: {valid_wer:.4f}')

    print(f'\tTrain CER: {train_cer:.4f}')
    print(f'\tValid CER: {valid_cer:.4f}')

    # Checkpoint on best validation loss and update the early-stopping counter.
    # The stop itself is deferred to the end of the epoch so that the CER
    # checkpoint and the timing line below still run on the final epoch.
    stop_early = False
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f'Validation loss improved. Model saved to {BEST_MODEL_PATH}.')
        counter = 0
    else:
        counter += 1
        stop_early = counter >= patience

    # Checkpoint on best validation CER (tracked independently of the loss)
    if valid_cer < best_valid_cer:
        best_valid_cer = valid_cer
        torch.save(model.state_dict(), BEST_CER_MODEL_PATH)
        print(f'Validation CER improved. Model saved to {BEST_CER_MODEL_PATH}.')

    end_time = time.time()
    epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)

    print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')

    # Early stopping check
    if stop_early:
        print('Early stopping triggered.')
        break

In [None]:
# Loss curves: one point per completed epoch, epochs numbered from 1
plt.figure(figsize=(10, 5))
for _series, _label in ((train_losses, 'Train Loss'), (valid_losses, 'Validation Loss')):
    plt.plot(range(1, len(_series) + 1), _series, label=_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
# Save before show() so the written file contains the rendered figure
plt.savefig(LOSS_PLOT_PATH)
plt.show()
print(f'Loss plot saved to {LOSS_PLOT_PATH}.')

In [None]:
# WER curves: one point per completed epoch, epochs numbered from 1
plt.figure(figsize=(10, 5))
for _series, _label in ((train_wers, 'Train WER'), (valid_wers, 'Validation WER')):
    plt.plot(range(1, len(_series) + 1), _series, label=_label)
plt.xlabel('Epoch')
plt.ylabel('WER')
plt.legend()
plt.title('Training and Validation WER Over Epochs')
# Save before show() so the written file contains the rendered figure
plt.savefig(WER_PLOT_PATH)
plt.show()
print(f'WER plot saved to {WER_PLOT_PATH}.')

In [None]:
# CER curves: one point per completed epoch, epochs numbered from 1
plt.figure(figsize=(10, 5))
for _series, _label in ((train_cers, 'Train CER'), (valid_cers, 'Validation CER')):
    plt.plot(range(1, len(_series) + 1), _series, label=_label)
plt.xlabel('Epoch')
plt.ylabel('CER')
plt.legend()
plt.title('Training and Validation CER Over Epochs')
# Save before show() so the written file contains the rendered figure
plt.savefig(CER_PLOT_PATH)
plt.show()
print(f'CER plot saved to {CER_PLOT_PATH}.')

In [None]:
# Translate one raw sentence with greedy decoding
def translate_sentence(sentence, model, word2idx, idx2word, device, max_len=MAX_LEN):
    """Return the model's translation of `sentence` as a space-joined string.

    The sentence is tokenized, wrapped in BOS/EOS, mapped to ids (unknown words
    become UNK_IDX) and cut to `max_len` ids. The prediction is cut at its
    first EOS_IDX, and PAD/BOS/EOS/UNK ids are left out of the returned text.
    The model is switched to eval mode.

    Args:
        sentence: Source text.
        model: Object exposing `eval()` and `predict(src, max_len)`.
        word2idx: Mapping from token to id.
        idx2word: Mapping from id to token; unknown ids map to ''.
        device: Device the source tensor is moved to.
        max_len: Limit for both the source length and the decoding steps.

    Returns:
        str: The predicted sentence.
    """
    model.eval()

    src_ids = [BOS_IDX]
    src_ids.extend(word2idx.get(token, UNK_IDX) for token in tokenize(sentence))
    src_ids.append(EOS_IDX)
    src_tensor = torch.tensor(src_ids[:max_len], dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        decoded = model.predict(src_tensor, max_len)
    decoded = decoded.squeeze(0).tolist()

    # Stop at EOS token
    if EOS_IDX in decoded:
        decoded = decoded[:decoded.index(EOS_IDX)]

    hidden_ids = [PAD_IDX, BOS_IDX, EOS_IDX, UNK_IDX]
    return ' '.join([idx2word.get(idx, '') for idx in decoded if idx not in hidden_ids])

In [None]:
# Average sentence-level CER and WER over parallel lists of strings
def calculate_metrics(references, hypotheses):
    """Return `(avg_cer, avg_wer)` for paired reference/hypothesis strings.

    Pairs are formed with `zip`, so extra items in the longer input are
    ignored. Each pair is scored with `cer` and `wer`, and the scores are
    averaged with `np.mean`.

    Args:
        references: Iterable of reference strings.
        hypotheses: Iterable of predicted strings.

    Returns:
        (avg_cer, avg_wer): the two means.
    """
    pairs = list(zip(references, hypotheses))
    cer_scores = [cer(reference, hypothesis) for reference, hypothesis in pairs]
    wer_scores = [wer(reference, hypothesis) for reference, hypothesis in pairs]
    return np.mean(cer_scores), np.mean(wer_scores)

In [None]:
# Evaluate and save results
def evaluate_and_save(model, df, src_sequences, trg_sequences, word2idx, idx2word, file_name):
    """Translate every encoded source, score it, and write a CSV report.

    The stored source ids (padding removed) are fed straight to
    `model.predict`. They are deliberately not converted back to text and
    re-tokenized: the literal string '<unk>' would be split into several
    tokens by the tokenizer, so the model would see a different input from
    the one that was encoded.

    The prediction is cut at its first EOS_IDX and PAD/BOS/EOS/UNK ids are left
    out of the predicted sentence. The reference sentence is rebuilt from
    `trg_sequences` without BOS/EOS/PAD.
    NOTE(review): unknown target words therefore appear as '<unk>' in the
    reference used for scoring - confirm this is intended.

    Rows of `df` are assumed to be in the same order as the sequences.

    Args:
        model: Object exposing `eval()` and `predict(src, max_len)`.
        df: DataFrame providing the 'inFormalForm' / 'FormalForm' text columns
            for the report.
        src_sequences: Encoded, padded source id lists.
        trg_sequences: Encoded, padded target id lists.
        word2idx: Unused; kept so existing calls keep working.
        idx2word: Mapping from id to token; unknown ids map to ''.
        file_name: Name of the CSV file created inside OUTPUT_DIR.

    Returns:
        pandas.DataFrame: Source, Target, Prediction, CER and WER per sentence,
        sorted by CER then WER (ascending).
    """
    model.eval()
    reference_skip = (BOS_IDX, EOS_IDX, PAD_IDX)
    prediction_skip = (PAD_IDX, BOS_IDX, EOS_IDX, UNK_IDX)
    predictions = []
    cer_scores = []
    wer_scores = []

    with torch.no_grad():
        for src_ids, trg_ids in tqdm(zip(src_sequences, trg_sequences), total=len(src_sequences), desc=f'Evaluating {file_name}'):
            trg_sentence = ' '.join([idx2word.get(idx, '') for idx in trg_ids if idx not in reference_skip])

            # Batch of one: the encoded source without its padding
            src_tensor = torch.tensor(
                [idx for idx in src_ids if idx != PAD_IDX], dtype=torch.long
            ).unsqueeze(0).to(DEVICE)
            pred_ids = model.predict(src_tensor, MAX_LEN).squeeze(0).tolist()

            # Stop at EOS token
            if EOS_IDX in pred_ids:
                pred_ids = pred_ids[:pred_ids.index(EOS_IDX)]
            pred_sentence = ' '.join([idx2word.get(idx, '') for idx in pred_ids if idx not in prediction_skip])

            predictions.append(pred_sentence)
            cer_scores.append(cer(trg_sentence, pred_sentence))
            wer_scores.append(wer(trg_sentence, pred_sentence))

    results_df = pd.DataFrame({
        'Source': df['inFormalForm'],
        'Target': df['FormalForm'],
        'Prediction': predictions,
        'CER': cer_scores,
        'WER': wer_scores
    })

    # Best translations first
    results_df = results_df.sort_values(by=['CER', 'WER'], ascending=[True, True])

    results_path = os.path.join(OUTPUT_DIR, file_name)
    results_df.to_csv(results_path, index=False)
    avg_cer = np.mean(cer_scores)
    avg_wer = np.mean(wer_scores)
    print(f'Results saved to {results_path}')
    print(f'Average CER: {avg_cer:.4f}')
    print(f'Average WER: {avg_wer:.4f}')
    return results_df

In [None]:
# Load the best model (lowest validation loss).
# map_location=DEVICE lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=DEVICE))
print('Best model loaded.')

# Evaluate on training data
print('Evaluating on training data...')
train_results = evaluate_and_save(model, train_df, train_src, train_trg, word2idx, idx2word, 'train_results_word.csv')

# Evaluate on validation data
print('Evaluating on validation data...')
val_results = evaluate_and_save(model, val_df, val_src, val_trg, word2idx, idx2word, 'val_results_word.csv')

# Evaluate on test data
print('Evaluating on test data...')
test_results = evaluate_and_save(model, test_df, test_src, test_trg, word2idx, idx2word, 'test_results_word.csv')

In [None]:
# Load the best CER model (lowest validation CER).
# map_location=DEVICE lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# Note: train_results / val_results / test_results from the previous cell are overwritten here.
model.load_state_dict(torch.load(BEST_CER_MODEL_PATH, map_location=DEVICE))
print('Best CER model loaded.')

# Evaluate on training data
print('Evaluating on training data using best CER model...')
train_results = evaluate_and_save(model, train_df, train_src, train_trg, word2idx, idx2word, 'train_results_best_cer.csv')

# Evaluate on validation data
print('Evaluating on validation data using best CER model...')
val_results = evaluate_and_save(model, val_df, val_src, val_trg, word2idx, idx2word, 'val_results_best_cer.csv')

# Evaluate on test data using the best CER model
print('Evaluating on test data using best CER model...')
test_results = evaluate_and_save(model, test_df, test_src, test_trg, word2idx, idx2word, 'test_results_best_cer.csv')