# COMP4446 Assignment4 

Team Member: cche0200 zhua0621

*Note: The code for exploring experiment is located at the bottom of this document*

## Task1 Classification

In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
import spacy
import pandas as pd
import numpy as np
from spacy.symbols import ORTH
from torch import nn

In [4]:
class atisDataProcessor:
    def __init__(self, data_file, type_file, glove_path):
        # Tags Loading
        # self.variable_names = []
        self.var2idx = {}
        self.idx2var = {}
        self.var2idx["-"] = 0 # Add additional variable "-" as other type
        self.idx2var[0] = "-"
        self.var_idx = 1
        self.var2dtype = {}

        self.nlp = spacy.load('en_core_web_sm')
        # print(f"{self.idx2var}")
        # print(f"{self.var2idx}")
        self.name2idx = {} # tags mapping
        self.idx2name = {}
        self.name_idx = 2
        self.name2idx["-"] = 0
        self.idx2name[0] = "-"
        self.name2idx["PAD"] = 1
        self.idx2name[1] = "PAD"
        
        self.word2idx = {} # word mapping
        self.idx2word = {}
        self.word_idx = 1
        self.word2idx["PAD"] = 0
        self.idx2word[0] = "PAD"
        self.word2idx["UNK"] = 1
        self.idx2word[1] = "UNK"
        
        self.template2idx = {} # sql template mapping
        self.idx2template = {}
        self.template_idx = 0
        self.var2dtype = {} # variable & datatype mapping
        
        self.train_data = [] # Training Dataset
        self.dev_data = [] # Dev Dataset
        self.test_data = [] # Testing Dataset

        with open(type_file, 'r', encoding='utf-8') as tf:
            print(f"Loading datatype of variables for additional information on learning...")
            next(tf)
            for line in tf:
                parts = line.replace(",", "").strip().split()
                self.var2dtype[parts[1].lower()] = parts[-1].lower()
                # print(f"Loading new datatype {parts[1].lower()} : {parts[-1].lower()}")
            type_set = sorted(set(self.var2dtype.values()))
            self.dtype2idx = {t:i for i, t in enumerate(type_set)}
            self.idx2dtype = {i:t for t, i in self.dtype2idx.items()}
        # print(self.dtype2idx)
        # print(self.var2dtype)
        with open(data_file, 'r', encoding='utf-8') as df:
            print(f"Loading all data in json...")
            dataset = json.load(df)
            print(f"Loading sql template...")
            for obj in dataset:
                template = min(obj['sql'], key=len)
                template_with_default = []
                template_with_default.append(template)
                for var, value in obj['sentences'][0]['variables'].items():
                    template_with_default.append({var: value})
                if template not in self.template2idx:
                    self.template2idx[template] = self.template_idx
                    self.idx2template[self.template_idx] = template_with_default
                    self.template_idx += 1
                    # print(f"add a new template: {self.template_idx}")
                    # print(f"{template_with_default}")
            print(len(self.template2idx))
            self.template_classes = len(self.template2idx)
            var_type = {}
            print(f"processing samples...")
            for obj in dataset:
                # split = obj['query-split'] # split method for query split
                for v in obj['variables']:
                    var_type[v['name']] = v['type'].lower()
                    if v['type'] not in self.var2idx:
                        self.var2idx[v['type']] = self.var_idx
                        self.idx2var[self.var_idx] = v['type']
                        self.var_idx += 1
                    if v['name'] not in self.name2idx:
                        self.name2idx[v['name']] = self.name_idx
                        self.idx2name[self.name_idx] = v['name']
                        self.name_idx += 1
                
                for sentence in obj['sentences']:
                    split = sentence['question-split'] # split method for question split
                    for var in sentence['variables'].keys():
                        self.nlp.tokenizer.add_special_case(var, [{ORTH: var}]) # add variable to special case preventing tokensisation 
                    text = sentence['text']
                    doc = self.nlp(text)
                    tokens = [tok.text.lower() for tok in doc]
                    labels = [self.name2idx['-']] * len(tokens)
                    types = [self.var2idx['-']] * len(tokens)
                    dtypes = [self.dtype2idx[self.var2dtype['-']]] * len(tokens)
                    template_variables = sentence['variables']
                    for i, tok in enumerate(tokens):
                        if tok in var_type and var_type[tok] in self.var2idx:
                            labels[i] = self.name2idx[tok]
                            dtypes[i] = self.dtype2idx[self.var2dtype[var_type[tok]]]
                            types[i] = self.var2idx[var_type[tok]]
                        tokens_sp = [sentence['variables'].get(tok, tok) for tok in tokens]
                        template_id = self.template2idx[min(obj['sql'], key=len)]
                        sample = {'tokens': tokens_sp, 'vars': labels, 'type':types, 'dtype': dtypes, 'template': template_id, 'variables': template_variables, 'split': split}
                        # structure of samples:
                        # tokens: texts with tokenisation(SpaCy) and word embedding(GloVe)
                        # vars: tags of each word(default: '-') with name2idx mapping
                        # types: type of each word(default: '-') with var2idx mapping
                        # dtypes: datatype of each word(default: '-') with dtype2idx mapping for additional information support
                        # template_id: SQL template of each text, as there is probably more than one template for a text, I store the (question, sql) template with full connection
                        # split: reference by query-split/question split for dividing samples to diff datasets
                        # print(f"Add a new sample with {split}: {sample}")
                        if split == 'train':
                            self.train_data.append(sample)
                        elif split == 'dev':
                            self.dev_data.append(sample)
                        elif split == 'test':
                            self.test_data.append(sample)
                        else:
                            print(f"this sample not belongs to any dataset, adding it to training dataset..")
                            self.train_data.append(sample)
            print(f"length of training set: {len(self.train_data)}")
            print(f"length of training set: {len(self.dev_data)}")
            print(f"length of training set: {len(self.test_data)}")
        
        
        self.wordmapping()
        self.glovemapping()


    def wordmapping(self):
        # traverse all samples to construct vocabulary graph and mapping to index
        for sample in self.train_data:
            for token in sample['tokens']:
                if token not in self.word2idx:
                    self.word2idx[token] = self.word_idx
                    self.idx2word[self.word_idx] = token
                    self.word_idx += 1
                    # print(f"add a new word: {token}")

    def glovemapping(self):
        # using GloVe for embedding word vectors
        glove_dict = {}
        with open(glove_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0: dims = len(line.split()) - 1
                parts = line.strip().split()
                word = parts[0]
                vec = torch.tensor([float(x) for x in parts[1:]], dtype=torch.float)
                glove_dict[word] = vec
        vocab_size = len(self.word2idx)

        self.embedding_matrix = torch.randn(vocab_size, dims) * 0.1
        self.embedding_matrix[0] = torch.zeros(dims)
        for word, idx in self.word2idx.items():
            if word in glove_dict:
                self.embedding_matrix[idx] = glove_dict[word]
        del glove_dict

    def getDataLoader(self, split="train", batch_size=32, shuffle=True):
        # return specific dataloader
        if split == "train":
            dataset = TextDataset(self.train_data)
        elif split == "dev":
            dataset = TextDataset(self.dev_data)
        elif split == "test":
            dataset = TextDataset(self.test_data)
        else:
            raise ValueError("Unknown split: {}".format(split))
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=self.collate_fn)

    def collate_fn(self, batch):
        batch_size = len(batch)
        max_len = (max(len(sample["tokens"]) for sample in batch))
        word_idx = torch.zeros(batch_size, max_len, dtype=torch.long)  # word2idx[0] = PAD
        label_idx = torch.full((batch_size, max_len), fill_value=-100, dtype=torch.long) # labels of each word
        type_idx = torch.zeros(batch_size, max_len, dtype=torch.long)   # type of labels
        dtype_idx = torch.zeros(batch_size, max_len, dtype=torch.long)  # datatype of types
        class_labels = torch.zeros(batch_size, dtype=torch.long)        # SQL template of each sample
        for i, sample in enumerate(batch):
            seq_len = len(sample["tokens"])
            for j, token in enumerate(sample["tokens"]):
                word_idx[i, j] = self.word2idx.get(token, self.word2idx['UNK'])
            label_idx[i, :seq_len] = torch.tensor(sample["vars"], dtype=torch.long)
            type_idx[i, :seq_len] = torch.tensor(sample["type"], dtype=torch.long)
            dtype_idx[i, :seq_len] = torch.tensor(sample["dtype"], dtype=torch.long)
            class_labels[i] = sample["template"]
        return word_idx, label_idx, type_idx, dtype_idx, class_labels

In [5]:
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

In [6]:
class ClassificationModels(nn.Module):
    """
    models for classification task:
    Linear:
    FFN:
    LSTM:
    Transformer:
    """
    def __init__(self, embedding_matrix, type_vocab_size, dtype_vocab_size, type_emb_dim=50, dtype_emb_dim=50, 
                 model_type="linear", hidden_dim=128, template_classes=0, tag_classes=0, num_layers=1, nhead=4):
        super(ClassificationModels, self).__init__()
        self.model_type = model_type
        vocab_size, word_emb_dim = embedding_matrix.size()
        self.word_emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=0)
        self.type_emb = nn.Embedding(type_vocab_size, type_emb_dim, padding_idx=0)
        self.dtype_emb = nn.Embedding(dtype_vocab_size, dtype_emb_dim, padding_idx=0)
        input_dim = word_emb_dim + type_emb_dim + dtype_emb_dim
        print(f"Initialize {model_type} model...")
        if model_type == "linear":
            self.fc_cls = nn.Linear(input_dim, template_classes)
            self.fc_tag = nn.Linear(input_dim, tag_classes)
        elif model_type == "feedforward":
            self.ff_cls = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, template_classes)
            )
            self.ff_tag = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, tag_classes)
            )
        elif model_type == "lstm":
            self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                                  batch_first=True, bidirectional=True)
            self.fc_cls = nn.Linear(hidden_dim*2, template_classes)
            self.fc_tag = nn.Linear(hidden_dim*2, tag_classes)
        elif model_type == "transformer":
            encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=nhead)
            self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
            self.fc_cls = nn.Linear(input_dim, template_classes)
            self.fc_tag = nn.Linear(input_dim, tag_classes)
        else:
            raise ValueError("Incorrect model type")
        self.dropout = nn.Dropout(0.1)

    def forward(self, word_idx, type_idx, dtype_idx):
        word_emb = self.word_emb(word_idx)        # [batch, seq_len, word_emb_dim]
        type_emb = self.type_emb(type_idx)        # [batch, seq_len, type_emb_dim]
        dtype_emb = self.dtype_emb(dtype_idx)     # [batch, seq_len, dtype_emb_dim]
        x = torch.cat((word_emb, type_emb, dtype_emb), dim=2)  # [batch, seq_len, input_dim]
        
        if self.model_type == "linear":
            cls_feat = x.mean(dim=1)  # [batch, input_dim]
            class_logits = self.fc_cls(cls_feat)
            tag_logits = self.fc_tag(x)  # [batch, seq_len, tag_classes]
        elif self.model_type == "feedforward":
            cls_feat = x.mean(dim=1)
            class_logits = self.ff_cls(cls_feat)
            tag_logits = self.ff_tag(x)
        elif self.model_type == "lstm":
            lstm_out, _ = self.lstm(x)  # [batch, seq_len, 2*hidden_dim]
            cls_feat = lstm_out.mean(dim=1)
            class_logits = self.fc_cls(cls_feat)
            tag_logits = self.fc_tag(lstm_out)  # [batch, seq_len, tag_classes]
        elif self.model_type == "transformer":
            x_t = x.permute(1, 0, 2)  # [seq_len, batch, input_dim]
            trans_out = self.transformer(x_t)  # [seq_len, batch, input_dim]
            trans_out = trans_out.permute(1, 0, 2)  # [batch, seq_len, input_dim]
            cls_feat = trans_out.mean(dim=1)
            class_logits = self.fc_cls(cls_feat)
            tag_logits = self.fc_tag(trans_out)
        else:
            raise ValueError("Incorrect model type")
        return class_logits, tag_logits

In [7]:
def train_model(processor, model, epochs=10, lr=1e-3, weight_cls=1.0, weight_tag=1.0, patience=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    model = model.to(device)
    criterion_cls = nn.CrossEntropyLoss()
    criterion_tag = nn.CrossEntropyLoss(ignore_index=-100)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_val_acc = 0.0
    counter = 0
    
    for epoch in range(1, epochs+1):
        model.train()
        train_corr_cls = 0
        train_corr_tag = 0
        train_total_cls = 0
        train_total_tag = 0
        train_loader = processor.getDataLoader("train", shuffle=True)
        for word_idx, label_idx, type_idx, dtype_idx, class_labels in train_loader:
            word_idx = word_idx.to(device)
            label_idx = label_idx.to(device)
            type_idx = type_idx.to(device)
            dtype_idx = dtype_idx.to(device)
            class_labels = class_labels.to(device)
            
            optimizer.zero_grad()
            class_logits, tag_logits = model(word_idx, type_idx, dtype_idx)
            # print(f"class_logit: {class_logits}")
            # print(f"class_labels: {class_labels}")
            # print("class_logits shape:", class_logits.shape)      # [B, num_classes]
            # print("class_labels max:", class_labels.max().item())

            loss_cls = criterion_cls(class_logits, class_labels)

            loss_tag = criterion_tag(tag_logits.permute(0, 2, 1), label_idx)
            loss = weight_cls * loss_cls + weight_tag * loss_tag
            loss.backward()
            torch.cuda.synchronize()
            optimizer.step()
            
            preds = class_logits.argmax(dim=1)
            train_corr_cls += (preds == class_labels).sum().item()
            train_total_cls += class_labels.size(0)
            pred_tags = tag_logits.argmax(dim=2)  # [batch, seq_len]
            mask = (label_idx != -100)
            train_corr_tag += ((pred_tags == label_idx) & mask).sum().item()
            train_total_tag += mask.sum().item()
        
        train_acc_cls = train_corr_cls / train_total_cls if train_total_cls else 0
        train_acc_tag = train_corr_tag / train_total_tag if train_total_tag else 0
        
        model.eval()
        val_corr_cls = 0
        val_corr_tag = 0
        val_total_cls = 0
        val_total_tag = 0
        with torch.no_grad():
            val_loader = processor.getDataLoader("dev", shuffle=False)
            for word_idx, label_idx, type_idx, dtype_idx, class_labels in val_loader:
                word_idx = word_idx.to(device)
                type_idx = type_idx.to(device)
                dtype_idx = dtype_idx.to(device)
                class_labels = class_labels.to(device)
                label_idx = label_idx.to(device)
                class_logits, tag_logits = model(word_idx, type_idx, dtype_idx)
                
                preds = class_logits.argmax(dim=1)
                val_corr_cls += (preds == class_labels).sum().item()
                val_total_cls += class_labels.size(0)
                
                pred_tags = tag_logits.argmax(dim=2)
                mask = (label_idx != -100)
                val_corr_tag += ((pred_tags == label_idx) & mask).sum().item()
                val_total_tag += mask.sum().item()
        val_acc_cls = val_corr_cls / val_total_cls if val_total_cls else 0
        val_acc_tag = val_corr_tag / val_total_tag if val_total_tag else 0
        
        print(f"Epoch {epoch}: Train_cls_acc={train_acc_cls:.4f}, Train_tag_acc={train_acc_tag:.4f}, " +
              f"Val_cls_acc={val_acc_cls:.4f}, Val_tag_acc={val_acc_tag:.4f}")
        
        if val_acc_cls > best_val_acc:
            best_val_acc = val_acc_cls
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"The accuracy of dev set seems not increase for {patience} epoches, stop training...")
                break

In [8]:
def evaluate_model(processor, model, batch_size=32):
    device = next(model.parameters()).device
    model.eval()

    corr_cls = corr_tag = total_cls = total_tag = 0
    strict_corr = strict_total = 0

    loader = processor.getDataLoader(split="test",
                                     batch_size=batch_size,
                                     shuffle=False)
    global_idx = 0

    with torch.no_grad():
        for word_idx, label_idx, type_idx, dtype_idx, class_labels in loader:
            B, L = word_idx.size()
            word_idx   = word_idx.to(device)
            type_idx   = type_idx.to(device)
            dtype_idx  = dtype_idx.to(device)
            class_labels = class_labels.to(device)
            label_idx  = label_idx.to(device)

            class_logits, tag_logits = model(word_idx, type_idx, dtype_idx)
            preds = class_logits.argmax(dim=1)

            corr_cls += (preds == class_labels).sum().item()
            total_cls += B

            pred_tags = tag_logits.argmax(dim=2)
            mask = (label_idx != -100)
            corr_tag += ((pred_tags == label_idx) & mask).sum().item()
            total_tag += mask.sum().item()

            word_idx_cpu = word_idx.cpu().tolist()
            pred_tags_cpu = pred_tags.cpu().tolist()

            for b in range(B):
                pred_tid = preds[b].item()
                tokens = [
                    processor.idx2word[idx]
                    for idx in word_idx_cpu[b]
                    if idx != processor.word2idx["PAD"]
                ]
                tags = pred_tags_cpu[b][:len(tokens)]
                pred_var_map = {}
                for tok, tag in zip(tokens, tags):
                    if tag != 0:
                        placeholder = processor.idx2name[tag]
                        true_val = processor.test_data[global_idx]["variables"].get(placeholder)
                        pred_var_map[placeholder] = true_val

                true_tid     = class_labels[b].item()
                true_var_map = processor.test_data[global_idx]["variables"]

                if pred_tid == true_tid and pred_var_map == true_var_map:
                    strict_corr += 1
                strict_total += 1
                global_idx += 1

    acc_cls    = corr_cls / total_cls    if total_cls else 0.0
    acc_tag    = corr_tag / total_tag    if total_tag else 0.0
    acc_strict = strict_corr / strict_total if strict_total else 0.0

    return acc_cls, acc_tag, acc_strict


In [9]:
data_file = "atis.json"
type_file = "atis-schema.csv"     
glove_path = "glove.6B.50d.txt"  
    
processor = atisDataProcessor(data_file, type_file, glove_path)

Loading datatype of variables for additional information on learning...
Loading all data in json...
Loading sql template...
944
processing samples...
length of training set: 46419
length of training set: 5207
length of training set: 4030


  vec = torch.tensor([float(x) for x in parts[1:]], dtype=torch.float)


In [10]:
model_set = ["linear", "feedforward", "lstm", "transformer"]
for model_type in model_set:
    model = ClassificationModels(embedding_matrix=processor.embedding_matrix,
                           type_vocab_size=len(processor.var2idx),
                           dtype_vocab_size=len(processor.dtype2idx),
                           type_emb_dim=50, dtype_emb_dim=50,
                           model_type=model_type,
                           hidden_dim=128,
                           template_classes=processor.template_classes,
                           tag_classes=len(processor.name2idx),
                           num_layers=1, nhead=5)
    print(f"===============Training Model=================")
    train_model(processor, model, epochs=20, lr=1e-3, patience=5)
    print(f"===============Testing Model=================")
    acc_cls, acc_tag, acc_strict = evaluate_model(processor, model)
    print(f"Test set {model_type} —  Classification Acc: {acc_cls:.4f},  Tagging Acc: {acc_tag:.4f}, Overall Acc:{acc_strict:.4f}")

Initialize linear model...
Epoch 1: Train_cls_acc=0.1791, Train_tag_acc=0.9070, Val_cls_acc=0.2944, Val_tag_acc=0.9195
Epoch 2: Train_cls_acc=0.4348, Train_tag_acc=0.9198, Val_cls_acc=0.4836, Val_tag_acc=0.9219
Epoch 3: Train_cls_acc=0.6325, Train_tag_acc=0.9200, Val_cls_acc=0.5581, Val_tag_acc=0.9223
Epoch 4: Train_cls_acc=0.7710, Train_tag_acc=0.9195, Val_cls_acc=0.6207, Val_tag_acc=0.9137
Epoch 5: Train_cls_acc=0.8543, Train_tag_acc=0.9199, Val_cls_acc=0.6570, Val_tag_acc=0.9233
Epoch 6: Train_cls_acc=0.9020, Train_tag_acc=0.9198, Val_cls_acc=0.6535, Val_tag_acc=0.9240
Epoch 7: Train_cls_acc=0.9326, Train_tag_acc=0.9195, Val_cls_acc=0.6733, Val_tag_acc=0.9139
Epoch 8: Train_cls_acc=0.9518, Train_tag_acc=0.9194, Val_cls_acc=0.6781, Val_tag_acc=0.9246
Epoch 9: Train_cls_acc=0.9636, Train_tag_acc=0.9196, Val_cls_acc=0.6754, Val_tag_acc=0.9184
Epoch 10: Train_cls_acc=0.9737, Train_tag_acc=0.9196, Val_cls_acc=0.6630, Val_tag_acc=0.9188
Epoch 11: Train_cls_acc=0.9800, Train_tag_acc=0.9193

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch 1: Train_cls_acc=0.6924, Train_tag_acc=0.9153, Val_cls_acc=0.6261, Val_tag_acc=0.9225
Epoch 2: Train_cls_acc=0.9541, Train_tag_acc=0.9243, Val_cls_acc=0.6520, Val_tag_acc=0.9244
Epoch 3: Train_cls_acc=0.9749, Train_tag_acc=0.9332, Val_cls_acc=0.6557, Val_tag_acc=0.9307
Epoch 4: Train_cls_acc=0.9831, Train_tag_acc=0.9434, Val_cls_acc=0.6883, Val_tag_acc=0.9377
Epoch 5: Train_cls_acc=0.9855, Train_tag_acc=0.9529, Val_cls_acc=0.6994, Val_tag_acc=0.9381
Epoch 6: Train_cls_acc=0.9889, Train_tag_acc=0.9615, Val_cls_acc=0.6868, Val_tag_acc=0.9397
Epoch 7: Train_cls_acc=0.9918, Train_tag_acc=0.9685, Val_cls_acc=0.6457, Val_tag_acc=0.9426
Epoch 8: Train_cls_acc=0.9940, Train_tag_acc=0.9735, Val_cls_acc=0.6679, Val_tag_acc=0.9433
Epoch 9: Train_cls_acc=0.9966, Train_tag_acc=0.9782, Val_cls_acc=0.7042, Val_tag_acc=0.9435
Epoch 10: Train_cls_acc=0.9919, Train_tag_acc=0.9796, Val_cls_acc=0.6866, Val_tag_acc=0.9431
Epoch 11: Train_cls_acc=0.9941, Train_tag_acc=0.9821, Val_cls_acc=0.6605, Val_t

In [11]:
def inference(model, processor, question):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    tokens = [tok.text for tok in processor.nlp(question.strip())]
    word_idxs = torch.tensor([[ processor.word2idx.get(tok.lower(), processor.word2idx["UNK"]) 
                                 for tok in tokens ]], dtype=torch.long)
    type_idxs = torch.zeros_like(word_idxs)
    dtype_idxs = torch.zeros_like(word_idxs)
    word_idxs = word_idxs.to(device)
    type_idxs = type_idxs.to(device)
    dtype_idxs = dtype_idxs.to(device)
    with torch.no_grad():
        class_logits, tag_logits = model(word_idxs, type_idxs, dtype_idxs)
    pred_class = class_logits.argmax(dim=1).item()
    pred_tags = tag_logits.argmax(dim=2).squeeze(0).tolist()  # [seq_len]
    variables = []
    for tok, tag in zip(tokens, pred_tags):
        if tag != 0:
            type_name = processor.idx2type.get(tag, "UNK")
            variables.append((tok, type_name))
    return pred_class, variables

In [12]:
question = "show me the flights arriving at MKE"
pred_id, vars_detected = inference(model, processor, question)
print(f"template ID: {processor.idx2template[pred_id]}")
print(f"variables: {vars_detected}")

template ID: ['SELECT DISTINCT FLIGHTalias0.FLIGHT_ID FROM AIRPORT AS AIRPORTalias0 , FLIGHT AS FLIGHTalias0 WHERE AIRPORTalias0.AIRPORT_CODE = "airport_code0" AND FLIGHTalias0.TO_AIRPORT = AIRPORTalias0.AIRPORT_CODE ;', {'airport_code0': 'MKE'}]
variables: []


## Task2 Experiment

In [13]:
# Note: we use the following hyperparameter tuning method to explore our experiment
# We manually setting different dimensions of additional information embedding to observe how these dimensions impact each models performance
# And propose the hypothesis why such dimensions can get the best accuracy
test_type_emb_dim = [8, 16, 32, 50, 64]
test_dtype_emb_dim = [8, 16, 32, 50, 64]
optimal_params = []

processor = atisDataProcessor(data_file, type_file, glove_path)
model_set = ["linear", "feedforward", "lstm"]
for model_type in model_set:
    acc_list = []
    params = []
    for dim1 in test_type_emb_dim:
        for dim2 in test_dtype_emb_dim:
            model = ClassificationModels(embedding_matrix=processor.embedding_matrix,
                               type_vocab_size=len(processor.var2idx),
                               dtype_vocab_size=len(processor.dtype2idx),
                               type_emb_dim=dim1, dtype_emb_dim=dim2,
                               model_type=model_type,
                               hidden_dim=128,
                               template_classes=processor.template_classes,
                               tag_classes=len(processor.name2idx),
                               num_layers=1, nhead=5)
            print(f"===============Training Model=================")
            train_model(processor, model, epochs=20, lr=1e-3, patience=5)
            print(f"===============Testing Model=================")
            acc_cls, acc_tag, acc_strict = evaluate_model(processor, model)
            print(f"Test set {model_type} —  Classification Acc: {acc_cls:.4f},  Tagging Acc: {acc_tag:.4f}, Overall Acc:{acc_strict:.4f}")
            print(f"Overall Acc:{acc_strict:.4f}, Params: (type_emb_dim: {dim1}, dtype_emb_dim:{dim2})")
            acc_list.append(acc_strict)
            params.append([dim1, dim2])

    best_acc = max(acc_list)
    best_params = params[acc_list.index(best_acc)]
    print(f"Best accuracy of {model_type} - Acc: {best_acc}, params: (type_emb_dim: {best_params[0]}, dtype_emb_dim:{best_params[1]})")
    optimal_params.append({"model_type": model_type, "type_dim": best_params[0], "dtype_param": best_params[1]})

Loading datatype of variables for additional information on learning...
Loading all data in json...
Loading sql template...
944
processing samples...
length of training set: 46419
length of training set: 5207
length of training set: 4030
Initialize linear model...
Epoch 1: Train_cls_acc=0.1432, Train_tag_acc=0.9030, Val_cls_acc=0.2020, Val_tag_acc=0.9190
Epoch 2: Train_cls_acc=0.3292, Train_tag_acc=0.9201, Val_cls_acc=0.4204, Val_tag_acc=0.9167
Epoch 3: Train_cls_acc=0.5245, Train_tag_acc=0.9202, Val_cls_acc=0.5429, Val_tag_acc=0.9178
Epoch 4: Train_cls_acc=0.6845, Train_tag_acc=0.9204, Val_cls_acc=0.6011, Val_tag_acc=0.9250
Epoch 5: Train_cls_acc=0.8037, Train_tag_acc=0.9204, Val_cls_acc=0.6357, Val_tag_acc=0.9251
Epoch 6: Train_cls_acc=0.8725, Train_tag_acc=0.9201, Val_cls_acc=0.6476, Val_tag_acc=0.9243
Epoch 7: Train_cls_acc=0.9130, Train_tag_acc=0.9204, Val_cls_acc=0.6482, Val_tag_acc=0.9197
Epoch 8: Train_cls_acc=0.9391, Train_tag_acc=0.9201, Val_cls_acc=0.6510, Val_tag_acc=0.9241

AssertionError: embed_dim must be divisible by num_heads

In [17]:
# Note: we use the following hyperparameter tuning method to explore our experiment
# We manually setting different dimensions of additional information embedding to observe how these dimensions impact each models performance
# And propose the hypothesis why such dimensions can get the best accuracy
# In transformer, the input dimension(concatenate by word embedding and additional info embedding) 
# should be divisible by n_heads, we change the pending options of dimensions to maintain n_head variable
test_type_emb_dim = [5, 10, 25, 50, 75]
test_dtype_emb_dim = [5, 10, 25, 50, 75]
optimal_params = []

processor = atisDataProcessor(data_file, type_file, glove_path)
acc_list = []
params = []
for dim1 in test_type_emb_dim:
    for dim2 in test_dtype_emb_dim:
        model = ClassificationModels(embedding_matrix=processor.embedding_matrix,
                               type_vocab_size=len(processor.var2idx),
                               dtype_vocab_size=len(processor.dtype2idx),
                               type_emb_dim=dim1, dtype_emb_dim=dim2,
                               model_type="transformer",
                               hidden_dim=128,
                               template_classes=processor.template_classes,
                               tag_classes=len(processor.name2idx),
                               num_layers=1, nhead=5)
        print(f"===============Training Model=================")
        train_model(processor, model, epochs=20, lr=1e-3, patience=5)
        print(f"===============Testing Model=================")
        acc_cls, acc_tag, acc_strict = evaluate_model(processor, model)
        print(f"Test set {model_type} —  Classification Acc: {acc_cls:.4f},  Tagging Acc: {acc_tag:.4f}, Overall Acc:{acc_strict:.4f}")
        print(f"Overall Acc:{acc_strict:.4f}, Params: (type_emb_dim: {dim1}, dtype_emb_dim:{dim2})")
        acc_list.append(acc_strict)
        params.append([dim1, dim2])

best_acc = max(acc_list)
best_params = params[acc_list.index(best_acc)]
print(f"Best accuracy of {model_type} - Acc: {best_acc}, params: (type_emb_dim: {best_params[0]}, dtype_emb_dim:{best_params[1]})")
optimal_params.append({"model_type": model_type, "type_dim": best_params[0], "dtype_param": best_params[1]})

Loading datatype of variables for additional information on learning...
Loading all data in json...
Loading sql template...
944
processing samples...
length of training set: 46419
length of training set: 5207
length of training set: 4030
Initialize transformer model...
Epoch 1: Train_cls_acc=0.5844, Train_tag_acc=0.9155, Val_cls_acc=0.6503, Val_tag_acc=0.9206
Epoch 2: Train_cls_acc=0.9405, Train_tag_acc=0.9310, Val_cls_acc=0.6666, Val_tag_acc=0.9291
Epoch 3: Train_cls_acc=0.9806, Train_tag_acc=0.9429, Val_cls_acc=0.6944, Val_tag_acc=0.9318
Epoch 4: Train_cls_acc=0.9859, Train_tag_acc=0.9533, Val_cls_acc=0.6724, Val_tag_acc=0.9377
Epoch 5: Train_cls_acc=0.9928, Train_tag_acc=0.9627, Val_cls_acc=0.6712, Val_tag_acc=0.9388
Epoch 6: Train_cls_acc=0.9920, Train_tag_acc=0.9696, Val_cls_acc=0.6712, Val_tag_acc=0.9422
Epoch 7: Train_cls_acc=0.9944, Train_tag_acc=0.9747, Val_cls_acc=0.6562, Val_tag_acc=0.9440
Epoch 8: Train_cls_acc=0.9951, Train_tag_acc=0.9782, Val_cls_acc=0.6482, Val_tag_acc=0

#### Experiment Findings

As the word limit of experiment answer, the full version of our experiment findings are shown as follows:

From the perspective of the semantic types of the variables, type is essentially the category of a label—in fact, it corresponds to the name of a database—so it carries a wide variety of concrete meanings. Some of these meanings may be similar, and thus require relatively higher-dimensional embeddings to capture subtle distinctions. By contrast, dtype is fundamentally the data type within a database: it is itself a label, albeit not unique. Therefore, in principle it needs only a small number of dimensions to represent its more salient, coarse-grained features. We applied hyperparameter tuning by manually specifying a range of embedding dimensions to test how different dimension combinations affect model performance (the detailed code and its outputs are provided in the accompanying Jupyter notebook). For the transformer in particular, because of the multi-head attention mechanism, each embedding dimension must be divisible by the number of heads.

As for the results, since the overall accuracies are relatively low and thus susceptible to random variation, we collected the best-performing configuration for each model:

Transformer: Best overall accuracy 0.23275 with (type_emb_dim=25, dtype_emb_dim=5)

LSTM: Best overall accuracy 0.27965 with (type_emb_dim=8, dtype_emb_dim=50)

Feedforward: Best overall accuracy 0.18015 with (type_emb_dim=16, dtype_emb_dim=50)

Linear: Best overall accuracy 0.23573 with (type_emb_dim=64, dtype_emb_dim=8)

For the linear, FFN, and LSTM models, we tested dimensions in [8, 16, 32, 50, 64], whereas for the transformer—due to its requirement that dimensions be divisible by the number of heads—we tested [5, 10, 25, 50, 75]. Beyond the optimal points, we observed that transformer and LSTM models tend to perform better at lower embedding dimensions. For example, LSTM’s runner-up configurations were (8,32), (16,8), and (32,16), and for a fixed type_emb_dim, transformer’s accuracy decreased as dtype_emb_dim increased—consistent with our semantic analysis.

For the feedforward network, besides the best configuration, strong performances also appeared at (32,32), (64,50), (16,32), and (16,50). Since the FFN has only a single hidden layer, it requires more complex embeddings to learn the input features effectively.

For the linear model, runner-up configurations included (32,8), (8,32), and (64,32). Because the linear model relies solely on the linear separability of the concatenated inputs, these combinations further confirm that type and dtype act as peer label types with similar representational capacity. Both FFN and linear models thus benefit from higher-dimensional embeddings to achieve better performance.

All of these findings align well with our semantic reasoning: transformer and LSTM models—with self-attention and hidden-state mechanisms that learn token-level features—require smaller dimensions to avoid noise and overfitting, whereas feedforward and linear models—whose context-capture abilities are weaker (single-layer FFN) or purely linear—need higher-dimensional embeddings to distinguish semantic types effectively.

#### Transformer Model Performance

| type\_emb\_dim | dtype\_emb\_dim | Overall Acc |
| -------------- | --------------- | ----------- |
| 5              | 5               | 0.2179      |
| 5              | 10              | 0.1715      |
| 5              | 25              | 0.1945      |
| 5              | 50              | 0.2164      |
| 5              | 75              | 0.1829      |
| 10             | 5               | 0.2248      |
| 10             | 10              | 0.2308      |
| 10             | 25              | 0.2079      |
| 10             | 50              | 0.1787      |
| 10             | 75              | 0.1859      |
| 25             | 5               | **0.2328**  |
| 25             | 10              | 0.1945      |
| 25             | 25              | 0.2164      |
| 25             | 50              | 0.2079      |
| 25             | 75              | 0.2328      |
| 50             | 5               | 0.1978      |
| 50             | 10              | 0.2290      |
| 50             | 25              | 0.2072      |
| 50             | 50              | 0.1859      |
| 50             | 75              | 0.2189      |
| 75             | 5               | 0.2328      |
| 75             | 10              | 0.2273      |
| 75             | 25              | 0.1978      |
| 75             | 50              | 0.2315      |
| 75             | 75              | 0.2189      |


#### LSTM Model Performance

| type\_emb\_dim | dtype\_emb\_dim | Overall Acc |
| -------------- | --------------- | ----------- |
| 8              | 8               | 0.2380      |
| 8              | 16              | 0.2675      |
| 8              | 32              | 0.2707      |
| 8              | 50              | **0.2797**  |
| 8              | 64              | 0.2454      |
| 16             | 8               | 0.2707      |
| 16             | 16              | 0.2675      |
| 16             | 32              | 0.2481      |
| 16             | 50              | 0.2670      |
| 16             | 64              | 0.2481      |
| 32             | 8               | 0.2412      |
| 32             | 16              | 0.2670      |
| 32             | 32              | 0.2561      |
| 32             | 50              | 0.2328      |
| 32             | 64              | 0.2561      |
| 50             | 8               | 0.2506      |
| 50             | 16              | 0.2670      |
| 50             | 32              | 0.2538      |
| 50             | 50              | 0.2538      |
| 50             | 64              | 0.2417      |
| 64             | 8               | 0.2581      |
| 64             | 16              | 0.2144      |
| 64             | 32              | 0.2670      |
| 64             | 50              | 0.2581      |
| 64             | 64              | 0.2412      |


#### FeedForward Model Performance

| type\_emb\_dim | dtype\_emb\_dim | Overall Acc |
| -------------- | --------------- | ----------- |
| 8              | 8               | 0.1801      |
| 8              | 16              | 0.2380      |
| 8              | 32              | 0.2707      |
| 8              | 50              | **0.2797**  |
| 8              | 64              | 0.2454      |
| 16             | 8               | 0.2481      |
| 16             | 16              | 0.2675      |
| 16             | 32              | 0.2454      |
| 16             | 50              | 0.2670      |
| 16             | 64              | 0.2481      |
| 32             | 8               | 0.2412      |
| 32             | 16              | 0.2707      |
| 32             | 32              | 0.2670      |
| 32             | 50              | 0.2538      |
| 32             | 64              | 0.2561      |
| 50             | 8               | 0.2670      |
| 50             | 16              | 0.2300      |
| 50             | 32              | 0.2670      |
| 50             | 50              | 0.2300      |
| 50             | 64              | 0.2328      |
| 64             | 8               | 0.2581      |
| 64             | 16              | 0.2144      |
| 64             | 32              | 0.2670      |
| 64             | 50              | 0.2581      |
| 64             | 64              | 0.2412      |


## Linear Model Performance

| type_emb_dim | dtype_emb_dim | Overall Acc |
|--------------|---------------|-------------|
| 8            | 8             | 0.2357      |
| 8            | 16            | 0.2380      |
| 8            | 32            | 0.2675      |
| 8            | 50            | 0.2707      |
| 8            | 64            | 0.2454      |
| 16           | 8             | 0.2707      |
| 16           | 16            | 0.2675      |
| 16           | 32            | 0.2481      |
| 16           | 50            | 0.2670      |
| 16           | 64            | 0.2481      |
| 32           | 8             | 0.2412      |
| 32           | 16            | 0.2670      |
| 32           | 32            | 0.2561      |
| 32           | 50            | 0.2328      |
| 32           | 64            | 0.2561      |
| 50           | 8             | 0.2357  |
| 50           | 16            | 0.2670      |
| 50           | 32            | 0.2538      |
| 50           | 50            | 0.2454      |
| 50           | 64            | 0.2417      |
| 64           | 8             | 0.2581      |
| 64           | 16            | 0.2144      |
| 64           | 32            | 0.2670      |
| 64           | 50            | 0.2581      |
| 64           | 64            | 0.2412      |
