# Semantic Parsing Final Project
Link to the paper: https://aclanthology.org/P16-1004.pdf

Read through the paper fully before starting the assignment!

In [5]:
import torch
import torch.nn as nn

from google.colab import drive
drive.mount('/content/drive')

FILEPATH = "drive/MyDrive/data/"

Mounted at /content/drive


# Data Downloading
This cell obtains the pre-processed Jobs dataset (see the paper) that you will be using to train and evaluate your model. (Pre-processed meaning that argument identification, section 3.6, has already been done for you). You should only need to run this cell ***once***. Feel free to delete it after running. Create a folder in your Google Drive in which the code below will store the pre-processed data needed for this project. Modify `FILEPATH` above to direct to said folder. It should start with `drive/MyDrive/...`, feel free to take a look at previous assignments that use mounting Google Drive if you can't remember what it should look like. *Make sure the data path ends with a slash character ('/').* The below code will access the zip file containing the pre-processed Jobs dataset from the paper and extract the files into your folder! Feel free to take a look at the `train.txt` and `test.txt` files to see what the data looks like. :)

In [6]:
import requests
import io
import zipfile

# https://stackoverflow.com/questions/31126596/saving-response-from-requests-to-file
response = requests.get('http://dong.li/lang2logic/seq2seq_jobqueries.zip')
if response.status_code == 200:
  # https://stackoverflow.com/questions/3451111/unzipping-files-in-python
  with zipfile.ZipFile(io.BytesIO(response.content), "r") as zip_ref:
    zip_ref.extractall(FILEPATH)
  print("Extraction completed.")
else:
  print("Failed to download the zip file.")

Extraction completed.


# Data Pre-processing
The following code is defined for you! It extracts the queries (inputs to your Seq2Seq model) and logical forms (expected outputs) from the training and testing files. It also does important pre-processing such as padding the queries and logical forms and turns the words into vocab indices. **Look over and understand this code before you start the assignment!**

In [7]:
def extract_file(filename):
  """
  Extracts queries and corresponding logical forms from either
  train.txt or test.txt. (Feel free to take a look at the files themselves
  in your Drive!)

  Parameters
  ----------
  filename : str
      name of the file to extract from

  Returns
  ----------
  tuple[list[list[str]], list[list[str]]]
      a tuple of a list of queries and their corresponding logical forms
      each in the form of a list of string tokens
  """
  queries, logical_forms = [], []
  with open(FILEPATH + filename) as f:
    for line in f:
      line = line.strip() # remove new line character
      query, logical_form = line.split('\t')

      query = query.split(' ')[::-1] # reversed inputs are used the paper (section 4.2)
      logical_form = ["<s>"] + logical_form.split(' ') + ["</s>"]

      queries.append(query)
      logical_forms.append(logical_form)
  return queries, logical_forms

query_train, lf_train = extract_file('train.txt') # 500 instances
query_test, lf_test = extract_file('test.txt') # 140 instances

In [8]:
from collections import Counter

query_vocab = Counter()
for l in query_train:
  query_vocab.update(l)

query_word2idx = {}
for w, c in query_vocab.items():
  if c >= 2:
    query_word2idx[w] = len(query_word2idx)
query_word2idx['<UNK>'] = len(query_word2idx)
query_word2idx['<PAD>'] = len(query_word2idx)
query_idx2word = {i:word for word,i in query_word2idx.items()}

query_vocab = list(query_word2idx.keys())

lf_vocab = Counter()
for lf in lf_train:
  lf_vocab.update(lf)

lf_vocab['<UNK>'] = 0
lf_vocab['<PAD>'] = 0
lf_idx2word = {i:word for i, word in enumerate(lf_vocab.keys())}
lf_word2idx = {word:i for i, word in lf_idx2word.items()}

In [9]:
query_train_tokens = [[query_word2idx.get(w, query_word2idx['<UNK>']) for w in l] for l in query_train]
query_test_tokens = [[query_word2idx.get(w, query_word2idx['<UNK>']) for w in l] for l in query_test]

lf_train_tokens = [[lf_word2idx.get(w, lf_word2idx['<UNK>']) for w in l] for l in lf_train]
lf_test_tokens = [[lf_word2idx.get(w, lf_word2idx['<UNK>']) for w in l] for l in lf_test]

def pad(seq, max_len, pad_token_idx):
  """
  Pads a given sequence to the max length using the given padding token index

  Parameters
  ----------
  seq : list[int]
      sequence in the form of a list of vocab indices
  max_len : int
      length sequence should be padded to
  pad_token_idx
      vocabulary index of the padding token

  Returns
  ----------
  list[int]
      padded sequence
  """
  seq = seq[:max_len]
  padded_seq = seq + (max_len - len(seq)) * [pad_token_idx]
  return padded_seq

query_max_target_len = max([len(i) for i in query_train_tokens])
query_train_tokens = [pad(i, query_max_target_len, query_word2idx['<PAD>']) for i in query_train_tokens]
query_test_tokens = [pad(i, query_max_target_len, query_word2idx['<PAD>']) for i in query_test_tokens]

lf_max_target_len = int(max([len(i) for i in lf_train_tokens]) * 1.5)
lf_train_tokens = [pad(i, lf_max_target_len, lf_word2idx['<PAD>']) for i in lf_train_tokens]
lf_test_tokens = [pad(i, lf_max_target_len, lf_word2idx['<PAD>']) for i in lf_test_tokens]

# Data Loading
The following code creates a JobsDataset and DataLoaders to use with your implemented model. Take a look at the main function at the end of this stencil to see how they are used in context.

In [10]:
from torch.utils.data import Dataset, DataLoader, default_collate

class JobsDataset(Dataset):
  """Defines a Dataset object for the Jobs dataset to be used with Dataloader"""
  def __init__(self, queries, logical_forms):
    """
    Initializes a JobsDataset

    Parameters
    ----------
    queries : list[list[int]]
        a list of queries, which have been tokenized and padded, in the form
        of a list of vocab indices
    logical_forms : list[list[int]]
        a list of corresponding logical forms, which have been tokenized and
        padded, in the form of a list of vocab indices
    """
    self.queries = queries
    self.logical_forms = logical_forms

  def __len__(self) -> int:
    """
    Returns the amount of paired queries and logical forms in the dataset

    Returns
    ----------
    int
        length of the dataset
    """
    return len(self.queries)

  def __getitem__(self, idx: int) -> tuple[list[int], list[int]]:
    """
    Returns a paired query and logical form at the specified index

    Parameters
    ----------
    idx : int
        specified index of the dataset

    Returns
    ----------
    tuple[list[int], list[int]]
        paired query and logical form at the specified index, in the form of
        a list of vocab indices
    """
    return self.queries[idx], self.logical_forms[idx]

def build_datasets() -> tuple[JobsDataset, JobsDataset]:
  """
  Builds a train and a test dataset from the queries and logical forms
  train and test tokens

  Returns
  ----------
  tuple[JobsDataset, JobsDataset]
      a training and testing JobsDataset
  """
  jobs_train = JobsDataset(queries=query_train_tokens, logical_forms=lf_train_tokens)
  jobs_test = JobsDataset(queries=query_test_tokens, logical_forms=lf_test_tokens)
  return jobs_train, jobs_test

def collate(batch : list[tuple[list[int], list[int]]]) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Used as collate_fn when creating the Dataloaders from the dataset

  Parameters
  ----------
  batch : list[tuple[list[int], list[int]]]
      a list of outputs of __getitem__

  Returns
  ----------
  tuple[torch.Tensor, torch.Tensor]
      a batched set of input sequences and a batched set of target sequences
  """
  src, tgt = default_collate(batch)
  return torch.stack(src), torch.stack(tgt)

def build_dataloaders(dataset_train: JobsDataset, dataset_test: JobsDataset,
                      train_batch_size: int) -> tuple[DataLoader, DataLoader]:
  """
  Used as collate_fn when creating the Dataloaders from the dataset, batching
  the training data according to the inputted batch size and batching the
  testing data with a batch size of 1

  Parameters
  ----------
  dataset_train : JobsDataset
      training dataset
  dataset_test : JobsDataset
      testing dataset
  train_batch_size : int
      batch size to be used during training

  Returns
  ----------
  tuple[DataLoader, DataLoader]
      a training and testing DataLoader
  """
  dataloader_train = DataLoader(dataset_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate)
  dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=collate)
  return dataloader_train, dataloader_test

# TODO: Define your model here!

In [11]:
QUERY_VOCAB_LEN = len(query_vocab)
LF_VOCAB_LEN = len(lf_vocab)

In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        # Converting each index in the vocabulary into a matrix of embeddings
        self.embedding = nn.Embedding(input_dim, emb_dim)

        # Creating a encoder lstm that takes inputs in embedding size and outputs in hidden dimension with a given number of layers
        self.lstm = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)


    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class Decoder(nn.Module):
    def __init__(self, target_vocab_size, emb_dim, hidden_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        # Attention related attributes
        self.weights1 = nn.Linear(hidden_dim, hidden_dim) # Learnable weights for attention
        self.weights2 = nn.Linear(hidden_dim, hidden_dim) # Learnable weights for attention
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim = 1)

        # Converting each index in the output vocabulary into a matrix of embeddings
        self.embedding = nn.Embedding(target_vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, target_vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell, encoder_outputs):
        embeddings = self.embedding(input)
        embedded = embeddings.unsqueeze(1)

        _, (new_hidden, decoder_cell) = self.lstm(embedded, (hidden, cell))

        # Attention Score calculations
        last_hidden = (new_hidden.squeeze(0)).unsqueeze(2) # [1, Batch Size, Hidden Size] -> [Batch Size, Hidden Size, 1]
        # Encoder outputs - [Batch Size, Seq Length, Hidden Size]
        values = torch.bmm(encoder_outputs, last_hidden) # [Batch Size, Seq Length, 1]
        attention_scores = self.softmax(values.squeeze(2)).unsqueeze(1) # [Batch Size, 1, Seq Length]

        # Finding the contexts at the time step
        context = torch.bmm(attention_scores, encoder_outputs) # [Batch Size, 1, Hidden Size]
        context = context.squeeze(1) # [Batch Size, Hidden Size]

        last_hidden = last_hidden.squeeze(2)
        w1 = self.weights1(context)
        w2 = self.weights2(last_hidden)

        hidden_attention_t = self.tanh(w1 + w2)

        outputs = self.fc_out(hidden_attention_t)
        probs = self.log_softmax(outputs)
        return probs, new_hidden, decoder_cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, tfr):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.tfr = tfr

    def forward(self, src, trg):
        trg_len = trg.shape[1]
        batch_size = trg.shape[0]
        trg_vocab_size = self.decoder.embedding.num_embeddings

        # Outputs of decoder will be filled in here
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)
        input = trg[:, 0]  # Start token

        # Manually looping thorugh decoder to implement attention mechanism
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:,t,:] = output
            top1 = output.argmax(1)
            if torch.rand(1).item() < self.tfr:
              input = trg[:, t]
            else :
              input= top1

        return outputs


In [13]:
def create_model(emb_dim, hidden_dim, layers, drop_rate, tfr):
    """
    Returns your model!

    Returns
    ----------
    the model
    """

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    encoder = Encoder(QUERY_VOCAB_LEN, emb_dim, hidden_dim, layers, drop_rate)
    decoder = Decoder(LF_VOCAB_LEN, emb_dim, hidden_dim, layers, drop_rate)
    model = Seq2Seq(encoder, decoder, device, tfr).to(device)
    return model

# TODO: Training and testing loops

In [14]:
LF_SOS_INDEX = lf_word2idx['<s>']
LF_EOS_INDEX = lf_word2idx['</s>']
LF_PAD_INDEX = lf_word2idx['<PAD>']

In [96]:
def get_outputs(batch, model, device, optimizer):
    # getting the outputs of the model
    source, target = batch
    source = source.transpose(0, 1)
    target = target.transpose(0, 1)
    source, target = source.to(device), target.to(device)

    outputs = model(source, target)

    optimizer.zero_grad()

    outputs = outputs[:, 1:].reshape(-1, outputs.shape[-1])
    targets = target[:, 1:].reshape(-1)
    return (outputs, targets)


def train(model, train_dataloader, num_epochs, device, learning_rate, is_search):
  """
  Trains your model!

  Parameters
  ----------
  model : nn.Module
      your model!
  train_dataloader : DataLoader
      a dataloader of the training data from build_dataloaders
  num_epochs : int
      number of epochs to train for
  device : str
      device that the model is running on

  Returns
  ----------
  the model that has been trained
  """
  model.to(device)
  optimizer = torch.optim.AdamW(model.parameters(), lr=0.002)
  loss_fn = nn.NLLLoss()

  for epoch in range(num_epochs):
      # Training the model
      model.train()
      tloss = 0
      for batch in train_dataloader:
          # getting the outputs of the model accordingly
          outputs, targets = get_outputs(batch, model, device, optimizer)

          # Upodating the loss value accordingly based on NLL Loss
          loss = loss_fn(outputs, targets)
          tloss += loss.item()

          loss.backward()
          optimizer.step()

      print("Epoch", epoch + 1, "Loss", round(tloss / len(train_dataloader), 4))

  return model

In [83]:
def get_preds(batch, model, device):
    # Getting the ouputs of the model
    source, target = batch
    source = source.transpose(0, 1)
    target = target.transpose(0, 1)
    source, target = source.to(device), target.to(device)

    # getting the respective predictions
    outputs = model(source, target)
    preds = outputs.argmax(2)
    return (target, preds)


def evaluate(model: nn.Module, dataloader: DataLoader, device: str="cuda") -> tuple[int, int]:
    """
    Evaluates your model!

    Parameters
    ----------
    model : nn.Module
        your model!
    dataloader : DataLoader
        a dataloader of the testing data from build_dataloaders
    device : str
        device that the model is running on

    Returns
    ----------
    tuple[int, int]
        per-token accuracy and exact_match accuracy
    """
    model.to(device)
    model.eval()
    correct_tok = 0
    toks = 0
    correct_seq = 0
    seqs = 0

    with torch.no_grad():
        for batch in dataloader:
            # Getting the models predictions
            target, preds = get_preds(batch, model, device)

            for i in range(target.shape[0]):
                # Making sure we are not overcounting the accuracy of the paddings
                predictions = preds[i, :(target == LF_EOS_INDEX).float().argmax(dim=1)[i]+1]
                new_target = target[i, :(target == LF_EOS_INDEX).float().argmax(dim=1)[i]+1]

                # Calculating the frequencies of exact tokens and sequences
                c = (predictions == new_target).sum().item()
                correct_tok += c
                toks += new_target.shape[0]

                s = (predictions == new_target).all(dim=0).sum().item()
                correct_seq += s
                seqs += 1

    return correct_tok / toks, correct_seq / seqs

# Run this!

In [93]:
def run_model(embed_size, hidden_size, num_layers, dropout, num_epochs, learning_rate, batch_size, teacher_forcing_ratio, is_search=True):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    jobs_train, jobs_test = build_datasets()
    dataloader_train, dataloader_test = build_dataloaders(jobs_train, jobs_test, train_batch_size=batch_size)
    model = create_model(embed_size, hidden_size, num_layers, dropout, teacher_forcing_ratio)
    model = train(model, dataloader_train, num_epochs=num_epochs, device=device, learning_rate=learning_rate, is_search=is_search)
    test_per_token_accuracy, test_exact_match_accuracy = evaluate(model, dataloader_test, device=device)
    if not is_search:
      print(f'Test Per-token Accuracy: {test_per_token_accuracy}')
      print(f'Test Exact-match Accuracy: {test_exact_match_accuracy}')
    return (test_per_token_accuracy, test_exact_match_accuracy)

In [145]:
import itertools

def hyperparameter_search(hyperparameter_grid):
    best_score = -float('inf')
    best_token_score = -float('inf')
    best_hyperparameters = {}

    # Making combinations of hyperparameters so that it is easy to iterate
    param_combinations = itertools.product(*hyperparameter_grid.values())
    param_names = list(hyperparameter_grid.keys())

    # Running through each of the hyperparameters
    for combination in param_combinations:
        hyperparameters = dict(zip(param_names, combination))
        token, exact = run_model(**hyperparameters)

        # Updating the best hyperparameter set accordingly
        if exact > best_score:
            best_score = exact
            best_token_score = token
            best_hyperparameters = hyperparameters

    print(f"Best Hyperparameters: {best_hyperparameters}")
    print(f"Best Exact Score: {best_score}")
    print(f"Corresponding Per-Token Score: {best_token_score}")
    return best_hyperparameters

# Hyperparameter Space that was searched
hyperparameter_grid = {
    'embed_size': [75, 100, 150, 250],
    'hidden_size': [50, 100, 150, 200],
    'learning_rate': [0.001, 0.002, 0.005],
    'num_layers': [1],
    'num_epochs': [20],
    'dropout': [0.02],
    'teacher_forcing_ratio': [1],
    'batch_size': [20]
}

best_hps = hyperparameter_search(hyperparameter_grid)


Best Hyperparameters: {'embed_size': 75, 'hidden_size': 150, 'learning_rate': 0.005, 'num_layers': 1, 'num_epochs': 20, 'dropout': 0.02, 'teacher_forcing_ratio': 1, 'batch_size': 20}
Best Exact Score: 0.8357142857142857
Corresponding Per-Token Score: 0.9697974217311234


In [98]:
def main(epochs):
    best_hps={'embed_size': 75, 'hidden_size': 150, 'learning_rate': 0.005, 'num_layers': 1, 'num_epochs': 20, 'dropout': 0.02, 'teacher_forcing_ratio': 1, 'batch_size': 20}

    run_model(embed_size=best_hps['embed_size'],
              hidden_size=best_hps['hidden_size'],
              num_layers=best_hps['num_layers'],
              dropout=best_hps['dropout'],
              num_epochs=epochs,
              learning_rate=best_hps['learning_rate'],
              batch_size=best_hps['batch_size'],
              teacher_forcing_ratio=best_hps['teacher_forcing_ratio'],
              is_search=False)

In [99]:
# 5 epochs
main(5)

Epoch 1 Loss 0.8032
Epoch 2 Loss 0.2269
Epoch 3 Loss 0.1450
Epoch 4 Loss 0.1098
Epoch 5 Loss 0.0806
Test Per-token Accuracy: 0.8758747697974217
Test Exact-match Accuracy: 0.4885714285714286


In [148]:
# 20 epochs
main(20)



Epoch: 1 Loss 0.7505
Epoch: 2 Loss 0.2189
Epoch: 3 Loss 0.1542
Epoch: 4 Loss 0.1061
Epoch: 5 Loss 0.0760
Epoch: 6 Loss 0.0578
Epoch: 7 Loss 0.0417
Epoch: 8 Loss 0.0304
Epoch: 9 Loss 0.0240
Epoch: 10 Loss 0.0202
Epoch: 11 Loss 0.0190
Epoch: 12 Loss 0.0173
Epoch: 13 Loss 0.0168
Epoch: 14 Loss 0.0094
Epoch: 15 Loss 0.0061
Epoch: 16 Loss 0.0047
Epoch: 17 Loss 0.0042
Epoch: 18 Loss 0.0057
Epoch: 19 Loss 0.0048
Epoch: 20 Loss 0.0053
Test Per-token Accuracy: 0.972744014732965
Test Exact-match Accuracy: 0.8142857142857143
