# Semantic Parsing Final Project
Link to the paper: https://aclanthology.org/P16-1004.pdf

Read through the paper fully before starting the assignment!

In [2]:
import torch
import torch.nn as nn

from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Folder the dataset archive is extracted into and later read from. It must end
# with '/' because file names are appended to it by plain string concatenation.
FILEPATH = "/content/drive/MyDrive/CSCI 1460, NLP/final project/"

Mounted at /content/drive


# Data Downloading
This cell obtains the pre-processed Jobs dataset (see the paper) that you will be using to train and evaluate your model. (Pre-processed meaning that argument identification, section 3.6, has already been done for you). You should only need to run this cell ***once***. Feel free to delete it after running. Create a folder in your Google Drive in which the code below will store the pre-processed data needed for this project. Modify `FILEPATH` above to direct to said folder. It should start with `drive/MyDrive/...`, feel free to take a look at previous assignments that use mounting Google Drive if you can't remember what it should look like. *Make sure the data path ends with a slash character ('/').* The below code will access the zip file containing the pre-processed Jobs dataset from the paper and extract the files into your folder! Feel free to take a look at the `train.txt` and `test.txt` files to see what the data looks like. :)

In [3]:
import requests
import io
import zipfile

# https://stackoverflow.com/questions/31126596/saving-response-from-requests-to-file
# An explicit timeout keeps this cell from hanging forever when the server does
# not answer; requests waits indefinitely unless a timeout is passed.
response = requests.get('http://dong.li/lang2logic/seq2seq_jobqueries.zip', timeout=60)
if response.status_code == 200:
  # https://stackoverflow.com/questions/3451111/unzipping-files-in-python
  # The archive is unpacked straight from memory into FILEPATH.
  with zipfile.ZipFile(io.BytesIO(response.content), "r") as zip_ref:
    zip_ref.extractall(FILEPATH)
  print("Extraction completed.")
else:
  print("Failed to download the zip file.")

Extraction completed.


# Data Pre-processing
The following code is defined for you! It extracts the queries (inputs to your Seq2Seq model) and logical forms (expected outputs) from the training and testing files. It also does important pre-processing such as padding the queries and logical forms and turns the words into vocab indices. **Look over and understand this code before you start the assignment!**

In [4]:
def extract_file(filename):
  """
  Extracts queries and corresponding logical forms from either
  train.txt or test.txt. (Feel free to take a look at the files themselves
  in your Drive!)

  Each non-blank line is expected to hold a query and its logical form
  separated by a single tab. Blank lines are skipped.

  Parameters
  ----------
  filename : str
      name of the file to extract from, relative to FILEPATH

  Returns
  ----------
  tuple[list[list[str]], list[list[str]]]
      a tuple of a list of queries and their corresponding logical forms
      each in the form of a list of string tokens; query tokens are in
      reversed order and logical forms are wrapped in "<s>" ... "</s>"

  Raises
  ----------
  ValueError
      if a non-blank line does not split into exactly two tab-separated fields
  """
  queries, logical_forms = [], []
  with open(FILEPATH + filename) as f:
    for line in f:
      line = line.strip() # remove new line character
      if not line:
        # A blank line (e.g. a trailing empty line) carries no example and
        # would otherwise break the two-field unpacking below.
        continue
      query, logical_form = line.split('\t')

      query = query.split(' ')[::-1] # reversed inputs are used in the paper (section 4.2)
      logical_form = ["<s>"] + logical_form.split(' ') + ["</s>"]

      queries.append(query)
      logical_forms.append(logical_form)
  return queries, logical_forms

# Read the (query, logical form) token lists for the training and test splits.
query_train, lf_train = extract_file('train.txt') # 500 instances
query_test, lf_test = extract_file('test.txt') # 140 instances

In [5]:
from collections import Counter

# Count every token of the training queries (first-seen order is preserved).
query_counts = Counter(token for query in query_train for token in query)

# Only tokens seen at least twice get their own index; the two special tokens
# are appended after them, in this order.
frequent_query_words = [word for word, count in query_counts.items() if count >= 2]
query_word2idx = {word: idx for idx, word in enumerate(frequent_query_words)}
for special in ('<UNK>', '<PAD>'):
  query_word2idx[special] = len(query_word2idx)
query_idx2word = {idx: word for word, idx in query_word2idx.items()}

query_vocab = list(query_word2idx)

# Logical-form vocabulary: every training token is kept, no frequency cut-off.
lf_vocab = Counter(token for form in lf_train for token in form)
for special in ('<UNK>', '<PAD>'):
  lf_vocab[special] = 0

lf_word2idx = {word: idx for idx, word in enumerate(lf_vocab)}
lf_idx2word = {idx: word for word, idx in lf_word2idx.items()}

In [6]:
def _encode_tokens(sequences, word2idx):
  """Maps each token to its vocab index, falling back to the '<UNK>' index."""
  unk_idx = word2idx['<UNK>']
  return [[word2idx.get(token, unk_idx) for token in seq] for seq in sequences]

query_train_tokens = _encode_tokens(query_train, query_word2idx)
query_test_tokens = _encode_tokens(query_test, query_word2idx)

lf_train_tokens = _encode_tokens(lf_train, lf_word2idx)
lf_test_tokens = _encode_tokens(lf_test, lf_word2idx)

def pad(seq, max_len, pad_token_idx):
  """
  Brings a sequence to exactly max_len entries: longer sequences are cut off
  at max_len, shorter ones are filled up at the end with the padding index.

  Parameters
  ----------
  seq : list[int]
      sequence in the form of a list of vocab indices
  max_len : int
      length of the returned sequence
  pad_token_idx : int
      vocabulary index of the padding token

  Returns
  ----------
  list[int]
      a new list holding the truncated / padded sequence
  """
  clipped = seq[:max_len]
  missing = max_len - len(clipped)
  return clipped + [pad_token_idx] * missing

# Queries are padded to the length of the longest training query; pad() cuts
# off anything longer (which can only happen for test queries).
query_max_target_len = max(len(seq) for seq in query_train_tokens)
query_pad_idx = query_word2idx['<PAD>']
query_train_tokens = [pad(seq, query_max_target_len, query_pad_idx) for seq in query_train_tokens]
query_test_tokens = [pad(seq, query_max_target_len, query_pad_idx) for seq in query_test_tokens]

# Logical forms get 1.5x the longest training length as their padded size.
lf_max_target_len = int(max(len(seq) for seq in lf_train_tokens) * 1.5)
lf_pad_idx = lf_word2idx['<PAD>']
lf_train_tokens = [pad(seq, lf_max_target_len, lf_pad_idx) for seq in lf_train_tokens]
lf_test_tokens = [pad(seq, lf_max_target_len, lf_pad_idx) for seq in lf_test_tokens]

# Data Loading
The following code creates a JobsDataset and DataLoaders to use with your implemented model. Take a look at the main function at the end of this stencil to see how they are used in context.

In [7]:
from torch.utils.data import Dataset, DataLoader, default_collate

class JobsDataset(Dataset):
  """Defines a Dataset object for the Jobs dataset to be used with Dataloader"""
  def __init__(self, queries, logical_forms):
    """
    Initializes a JobsDataset

    Parameters
    ----------
    queries : list[list[int]]
        a list of queries, which have been tokenized and padded, in the form
        of a list of vocab indices
    logical_forms : list[list[int]]
        a list of corresponding logical forms, which have been tokenized and
        padded, in the form of a list of vocab indices

    Raises
    ----------
    ValueError
        if the two lists do not have the same number of entries
    """
    # Items are paired by position, so unequal lengths would either hide
    # examples or fail later with an IndexError inside a DataLoader worker.
    if len(queries) != len(logical_forms):
      raise ValueError(
          f"queries and logical_forms must have the same length, "
          f"got {len(queries)} and {len(logical_forms)}"
      )
    self.queries = queries
    self.logical_forms = logical_forms

  def __len__(self) -> int:
    """
    Returns the amount of paired queries and logical forms in the dataset

    Returns
    ----------
    int
        length of the dataset
    """
    return len(self.queries)

  def __getitem__(self, idx: int) -> tuple[list[int], list[int]]:
    """
    Returns a paired query and logical form at the specified index

    Parameters
    ----------
    idx : int
        specified index of the dataset

    Returns
    ----------
    tuple[list[int], list[int]]
        paired query and logical form at the specified index, in the form of
        a list of vocab indices
    """
    return self.queries[idx], self.logical_forms[idx]

def build_datasets() -> tuple[JobsDataset, JobsDataset]:
  """
  Wraps the module-level padded token lists of the train and test splits
  into JobsDataset objects.

  Returns
  ----------
  tuple[JobsDataset, JobsDataset]
      the training dataset followed by the testing dataset
  """
  train_split = JobsDataset(query_train_tokens, lf_train_tokens)
  test_split = JobsDataset(query_test_tokens, lf_test_tokens)
  return train_split, test_split

# def collate(batch : list[tuple[list[int], list[int]]]) -> tuple[torch.Tensor, torch.Tensor]:
#   """
#   Used as collate_fn when creating the Dataloaders from the dataset

#   Parameters
#   ----------
#   batch : list[tuple[list[int], list[int]]]
#       a list of outputs of __getitem__

#   Returns
#   ----------
#   tuple[torch.Tensor, torch.Tensor]
#       a batched set of input sequences and a batched set of target sequences
#   """
#   src, tgt = default_collate(batch)
#   return torch.stack(src), torch.stack(tgt)

def collate(batch: list[tuple[list[int], list[int]]]) -> tuple[torch.Tensor, torch.Tensor]:
    """
    collate_fn for the DataLoaders: turns a list of (query, logical form)
    pairs into one tensor of queries and one tensor of logical forms.

    Parameters
    ----------
    batch : list[tuple[list[int], list[int]]]
        a list of outputs of __getitem__

    Returns
    ----------
    tuple[torch.Tensor, torch.Tensor]
        a batched set of input sequences and a batched set of target
        sequences, both of dtype torch.long with the batch as first dimension
    """
    # zip(*batch) transposes [(src, tgt), ...] into (all srcs, all tgts);
    # each column then becomes one long tensor.
    src_batch, tgt_batch = (
        torch.tensor(column, dtype=torch.long) for column in zip(*batch)
    )
    return src_batch, tgt_batch


def build_dataloaders(dataset_train: JobsDataset, dataset_test: JobsDataset,
                      train_batch_size: int) -> tuple[DataLoader, DataLoader]:
  """
  Wraps the train and test datasets in DataLoaders that use collate as their
  collate_fn. Training data is shuffled and batched with the given batch
  size; testing data keeps its order and is served one example per batch.

  Parameters
  ----------
  dataset_train : JobsDataset
      training dataset
  dataset_test : JobsDataset
      testing dataset
  train_batch_size : int
      batch size to be used during training

  Returns
  ----------
  tuple[DataLoader, DataLoader]
      a training and testing DataLoader
  """
  loaders = (
      DataLoader(dataset_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate),
      DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=collate),
  )
  return loaders

# TODO: Define your model here!

In [9]:
# QUERY_VOCAB_LEN = len(query_vocab)
# LF_VOCAB_LEN = len(lf_vocab)

# def create_model():
#   """
#   Returns your model!

#   Returns
#   ----------
#   ???
#       your model!
#   """
#   pass

#-------------------------------------------------------------------------------
# the above is the stencil code and below is my implementation
#-------------------------------------------------------------------------------

import torch
import torch.nn as nn
import torch.optim as optim
import math

# Model hyperparameters (fixed constants, not randomly drawn):
EMBED_SIZE = 128 # Dimensionality of word embeddings
HIDDEN_SIZE = 256   # Number of units in the LSTM hidden layers
# NOTE(review): NUM_LAYERS is not read anywhere in the code visible here;
# create_model builds the model with 3 layers rather than this value --
# confirm which depth is intended.
NUM_LAYERS = 2 # Number of LSTM layers, or number of encoders/decoders

# Vocabulary sizes, taken from the vocabularies built in the pre-processing code
QUERY_VOCAB_LEN = len(query_vocab)
LF_VOCAB_LEN = len(lf_vocab)

# Special indices, looked up in the logical-form vocabulary
LF_SOS_INDEX = lf_word2idx['<s>'] # Start-of-sequence token index for logical form
LF_EOS_INDEX = lf_word2idx['</s>'] # End-of-sequence token index for logical form
LF_PAD_INDEX = lf_word2idx['<PAD>']  # Padding token index for logical form


class Encoder(nn.Module):
    """
    Encoder of the sequence-to-sequence model: an embedding layer followed by
    a stacked, unidirectional, batch-first LSTM.
    """
    def __init__(self, input_size, embed_size, hidden_size, pad_idx, num_layers=3):
        """
        input_size: Number of entries in the source vocabulary.
        embed_size: Dimensionality of the token embeddings.
        hidden_size: Number of hidden units per LSTM layer.
        pad_idx: Index of the padding token; its embedding row is kept at zero.
        num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=embed_size,
                                      padding_idx=pad_idx)
        self.lstm = nn.LSTM(input_size=embed_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=False)

    def forward(self, src):
        """
        Forward pass for the encoder.
        src: Tensor of shape (batch, seq_len) representing input sequences.
        return: (outputs, (h, c)) where outputs holds the top-layer hidden
            state of every timestep, shape (batch, seq_len, hidden_size), and
            h / c hold the final hidden / cell state of every layer, shape
            (num_layers, batch, hidden_size).
        """
        # The LSTM is run over the full padded length (no sequence packing),
        # so the final states are taken after the last position of src.
        outputs, final_state = self.lstm(self.embedding(src))
        return outputs, final_state


class Attention(nn.Module):
    """
    Parameter-free dot-product attention over the encoder outputs.
    """
    def __init__(self):
        super().__init__()

    def forward(self, encoder_outputs, decoder_hidden_top):
        """
        Compute the attention context vector and weights.
        encoder_outputs: Tensor of shape (batch, src_len, hidden_size), encoder outputs for all timesteps.
        decoder_hidden_top: Tensor of shape (batch, hidden_size), decoder's top-layer hidden state.
        return: Context vector of shape (batch, hidden_size) and attention
            weights of shape (batch, src_len).
        """
        # Dot product of the decoder state with every encoder output:
        # (batch, src_len, hidden) x (batch, hidden, 1) -> (batch, src_len)
        query = decoder_hidden_top.unsqueeze(2)
        scores = torch.bmm(encoder_outputs, query).squeeze(-1)

        # Normalize the scores over the source positions
        weights = torch.softmax(scores, dim=1)

        # Weighted sum of encoder outputs:
        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, hidden)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context, weights


class Decoder(nn.Module):
    """
    Single-step decoder: a stacked LSTM whose top-layer hidden state is
    combined with an attention context over the encoder outputs to predict
    the next output token.
    """
    def __init__(self, output_size, embed_size, hidden_size, pad_idx, num_layers=3):
        """
        output_size: Number of entries in the target vocabulary.
        embed_size: Dimensionality of the token embeddings.
        hidden_size: Number of hidden units per LSTM layer.
        pad_idx: Index of the padding token in the target vocabulary.
        num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.attention = Attention()  # scores encoder outputs against the decoder state
        self.W1 = nn.Linear(hidden_size, hidden_size)  # transforms the decoder hidden state
        self.W2 = nn.Linear(hidden_size, hidden_size)  # transforms the attention context vector
        self.Wo = nn.Linear(hidden_size, output_size)  # projects onto the output vocabulary
        self.tanh = nn.Tanh()
        self.num_layers = num_layers

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """
        Forward pass for the decoder (one timestep).
        input_token: Tensor of shape (batch), input token indices at the current timestep.
        hidden: Tensor of shape (num_layers, batch, hidden_size), decoder hidden states from the previous step.
        cell: Tensor of shape (num_layers, batch, hidden_size), decoder cell states from the previous step.
        encoder_outputs: Tensor of shape (batch, src_len, hidden_size), encoder outputs for the entire input sequence.
        return: Log probabilities of the next token, updated hidden and cell states, and attention weights.
        """
        # (batch) -> (batch, 1, embed_size): a sequence of length one
        step_input = self.embedding(input_token).unsqueeze(1)

        # Advance the LSTM by one step; only the new states are needed
        _, (hidden, cell) = self.lstm(step_input, (hidden, cell))

        # The top layer's hidden state, (batch, hidden_size), queries the attention
        top_hidden = hidden[-1]
        context, attn_weights = self.attention(encoder_outputs, top_hidden)

        # h_t^{att} = tanh(W1 h_t^{L} + W2 c^t)
        h_att = self.tanh(self.W1(top_hidden) + self.W2(context))

        # Log probabilities over the output vocabulary (the form nn.NLLLoss expects)
        log_probs = torch.log_softmax(self.Wo(h_att), dim=-1)

        return log_probs, hidden, cell, attn_weights


class Seq2Seq(nn.Module):
    """
    Sequence-to-sequence model that couples an Encoder with an attention
    Decoder and unrolls the decoder over the length of the target sequence.
    """
    def __init__(self,
                 input_size, output_size,
                 embed_size, hidden_size,
                 src_pad_idx, trg_pad_idx,
                 num_layers=3):
        """
        input_size / output_size: Source / target vocabulary sizes.
        embed_size: Dimensionality of the token embeddings (both sides).
        hidden_size: Number of hidden units per LSTM layer (both sides).
        src_pad_idx / trg_pad_idx: Padding indices of the two vocabularies.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        """
        super().__init__()
        self.encoder = Encoder(input_size, embed_size, hidden_size, src_pad_idx, num_layers=num_layers)
        self.decoder = Decoder(output_size, embed_size, hidden_size, trg_pad_idx, num_layers=num_layers)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, src, trg=None, teacher_forcing=True):
        """
        Forward pass for the sequence-to-sequence model.
        src: Tensor of shape (batch, src_len), input sequences.
        trg: Tensor of shape (batch, trg_len), target sequences. Required:
            its first column seeds the decoder and its length fixes the
            number of decoding steps.
        teacher_forcing: Boolean; if True the gold token is fed as the next
            decoder input, otherwise the decoder's own argmax prediction.
        return: Log probabilities of shape (batch, trg_len - 1, output_size).
        raises: ValueError if trg is None or src is not 2-dimensional.
        """
        _batch_size, _src_len = src.shape  # src has to be (batch, src_len)
        if trg is None:
            raise ValueError("No target provided for forward pass in training mode.")
        trg_len = trg.shape[1]

        # The encoder's final states initialize the decoder's states
        encoder_outputs, (dec_hidden, dec_cell) = self.encoder(src)

        # Decoding starts from the first target column (<s>)
        input_token = trg[:, 0]

        step_log_probs = []
        for t in range(1, trg_len):
            log_probs, dec_hidden, dec_cell, _ = self.decoder(
                input_token, dec_hidden, dec_cell, encoder_outputs
            )
            step_log_probs.append(log_probs)

            # Choose what the decoder sees next: gold token or its own best guess
            input_token = trg[:, t] if teacher_forcing else log_probs.argmax(1)

        # Stack the per-step (batch, output_size) tensors along a new time axis
        return torch.stack(step_log_probs, dim=1)


def create_model(num_layers: int = 3):
    """
    Create and return an instance of the Seq2Seq model.

    Vocabulary sizes, embedding size, hidden size and padding indices are
    taken from the module-level constants and vocabularies.

    num_layers: Number of stacked LSTM layers used by both the encoder and
        the decoder. Defaults to 3, the depth that used to be hard-coded here.
    return: A Seq2Seq model ready for training and testing.
    """
    return Seq2Seq(
        input_size=QUERY_VOCAB_LEN,
        output_size=LF_VOCAB_LEN,
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        src_pad_idx=query_word2idx['<PAD>'],
        trg_pad_idx=LF_PAD_INDEX,
        num_layers=num_layers,
    )


# TODO: Training and testing loops

In [10]:
# Indices of the special logical-form tokens. LF_PAD_INDEX is read by train()
# and evaluate(); the other two are not referenced in the code visible here.
# The same three lookups are also performed in the model cell.
LF_SOS_INDEX = lf_word2idx['<s>']
LF_EOS_INDEX = lf_word2idx['</s>']
LF_PAD_INDEX = lf_word2idx['<PAD>']

In [11]:
# def train(model: nn.Module, train_dataloader: DataLoader, num_epochs: int=5,
#           device: str="cuda") -> nn.Module:
#   """
#   Trains your model!

#   Parameters
#   ----------
#   model : nn.Module
#       your model!
#   train_dataloader : DataLoader
#       a dataloader of the training data from build_dataloaders
#   num_epochs : int
#       number of epochs to train for
#   device : str
#       device that the model is running on

#   Returns
#   ----------
#   ???
#       your trained model
#   """
#   pass

#-------------------------------------------------------------------------------
# the above is the stencil code and below is my implementation
#-------------------------------------------------------------------------------



def train(model: nn.Module, train_dataloader: DataLoader, num_epochs: int=5,
          device: str="cuda") -> nn.Module:
    """
    Trains the model with teacher forcing, Adam (lr=0.001) and a negative
    log-likelihood loss that ignores padded target positions. Prints the mean
    batch loss after every epoch.

    Parameters
    ----------
    model : nn.Module
        model called as model(src, trg, teacher_forcing=True) and returning
        log probabilities of shape (batch, trg_len - 1, vocab_size)
    train_dataloader : DataLoader
        a dataloader of the training data from build_dataloaders
    num_epochs : int
        number of epochs to train for
    device : str
        device that the model is running on

    Returns
    ----------
    nn.Module
        the trained model (moved to `device`, left in training mode)
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.NLLLoss(ignore_index=LF_PAD_INDEX, reduction='mean')

    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0  # sum of the batch losses of this epoch

        for src, trg in train_dataloader:
            src = src.to(device)
            trg = trg.to(device)
            optimizer.zero_grad()  # gradients must not accumulate across batches

            # Teacher forcing: the gold tokens are fed to the decoder.
            log_probs_seq = model(src, trg, teacher_forcing=True)

            # Step t predicts target token t+1, so the gold sequence is the
            # target without its leading <s>. The vocabulary size is read off
            # the model output instead of a global so the two cannot disagree.
            gold = trg[:, 1:]
            loss = criterion(
                log_probs_seq.reshape(-1, log_probs_seq.size(-1)),
                gold.reshape(-1)
            )

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError for an empty dataloader.
        avg_loss = total_loss / max(len(train_dataloader), 1)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    return model


In [12]:
# def evaluate(model: nn.Module, dataloader: DataLoader, device: str="cuda") -> tuple[int, int]:
#   """
#   Evaluates your model!

#   Parameters
#   ----------
#   model : nn.Module
#       your model!
#   dataloader : DataLoader
#       a dataloader of the testing data from build_dataloaders
#   device : str
#       device that the model is running on

#   Returns
#   ----------
#   tuple[int, int]
#       per-token accuracy and exact_match accuracy
#   """
#   pass

#-------------------------------------------------------------------------------
# the above is the stencil code and below is my implementation
#-------------------------------------------------------------------------------


def evaluate(model: nn.Module, dataloader: DataLoader, device: str="cuda") -> tuple[float, float]:
    """
    Evaluates the model with greedy decoding, one example at a time.

    For every example the decoder is started from the first target token
    (<s>) and run for as many steps as the padded target has remaining
    positions; decoding does not stop early at </s>.

    Parameters
    ----------
    model : nn.Module
        model exposing `encoder` and `decoder` sub-modules
    dataloader : DataLoader
        a dataloader of the testing data from build_dataloaders (batch size 1)
    device : str
        device that the model is running on

    Returns
    ----------
    tuple[float, float]
        per-token accuracy (over non-padding gold tokens) and exact-match
        accuracy (prediction equals the gold sequence up to its unpadded
        length); both are 0.0 when there is nothing to evaluate
    """
    # Keep the model on the same device as the batches, as train() does.
    model = model.to(device)
    model.eval()

    total_tokens = 0    # gold tokens that are not padding
    correct_tokens = 0  # of those, how many were predicted correctly
    exact_matches = 0   # examples whose whole sequence was predicted correctly
    num_examples = 0    # examples seen (one per batch)

    with torch.no_grad():
        for src, trg in dataloader:
            src = src.to(device)
            trg = trg.to(device)
            batch_size, trg_len = trg.shape
            assert batch_size == 1, "Test batch size should be 1"
            num_examples += 1

            # Encoder final states initialize the decoder states.
            encoder_outputs, (dec_hidden, dec_cell) = model.encoder(src)

            # Greedy decoding, starting from <s>.
            input_token = trg[:, 0]
            pred_seq = []
            for _ in range(1, trg_len):
                log_probs, dec_hidden, dec_cell, _ = model.decoder(
                    input_token, dec_hidden, dec_cell, encoder_outputs
                )
                input_token = log_probs.argmax(1)
                pred_seq.append(input_token.item())

            gold_seq = trg[0, 1:].tolist()  # gold tokens without the leading <s>

            # Per-token accuracy over the non-padding gold positions.
            for p, g in zip(pred_seq, gold_seq):
                if g != LF_PAD_INDEX:
                    total_tokens += 1
                    if p == g:
                        correct_tokens += 1

            # Exact match: compare against the gold sequence without padding;
            # whatever is predicted beyond that length is ignored.
            gold_no_pad = [x for x in gold_seq if x != LF_PAD_INDEX]
            if pred_seq[:len(gold_no_pad)] == gold_no_pad:
                exact_matches += 1

    per_token_accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    # Guarded so an empty dataloader yields 0.0 instead of a ZeroDivisionError.
    exact_match_accuracy = exact_matches / num_examples if num_examples > 0 else 0.0
    return per_token_accuracy, exact_match_accuracy





# Run this!

In [13]:
def main(train_batch_size: int = 20, num_epochs: int = 20, seed: int = 0):
    """
    Builds the data pipeline and the model, trains it and prints the test
    per-token and exact-match accuracies.

    train_batch_size: Batch size of the training DataLoader.
    num_epochs: Number of training epochs.
    seed: Seed for PyTorch's random number generator, set before the model is
        created so that weight initialization and batch shuffling start from
        the same state on every run.
    """
    torch.manual_seed(seed)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    jobs_train, jobs_test = build_datasets()
    dataloader_train, dataloader_test = build_dataloaders(
        jobs_train, jobs_test, train_batch_size=train_batch_size
    )
    model = create_model()
    model = train(model, dataloader_train, num_epochs=num_epochs, device=device)
    test_per_token_accuracy, test_exact_match_accuracy = evaluate(model, dataloader_test, device=device)
    print(f'Test Per-token Accuracy: {test_per_token_accuracy}')
    print(f'Test Exact-match Accuracy: {test_exact_match_accuracy}')

# Run the whole pipeline: build data, train, evaluate, print accuracies.
main()

Epoch 1/20, Loss: 2.7042
Epoch 2/20, Loss: 2.0531
Epoch 3/20, Loss: 1.1222
Epoch 4/20, Loss: 0.8024
Epoch 5/20, Loss: 0.6956
Epoch 6/20, Loss: 0.6201
Epoch 7/20, Loss: 0.5484
Epoch 8/20, Loss: 0.4807
Epoch 9/20, Loss: 0.4186
Epoch 10/20, Loss: 0.3559
Epoch 11/20, Loss: 0.3066
Epoch 12/20, Loss: 0.2609
Epoch 13/20, Loss: 0.2121
Epoch 14/20, Loss: 0.1859
Epoch 15/20, Loss: 0.1666
Epoch 16/20, Loss: 0.1392
Epoch 17/20, Loss: 0.1118
Epoch 18/20, Loss: 0.0982
Epoch 19/20, Loss: 0.0852
Epoch 20/20, Loss: 0.0770
Test Per-token Accuracy: 0.8757281553398059
Test Exact-match Accuracy: 0.7428571428571429
