In [18]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
import spacy
import pandas as pd
import numpy as np
from spacy.symbols import ORTH

In [16]:
class atisDataProcessor:
    def __init__(self, tags_file, data_file, type_file, glove_path):
        # Tags Loading
        self.variable_names = []
        self.var2idx = {}
        self.idx2var = {}
        self.var2idx["-"] = 0 # Add additional variable "-" as other type
        self.idx2var[0] = "-"
        self.var_idx = 1
        try:
            with open(tags_file, 'r', encoding='utf-8') as tgf:
                print(f"Tags Loading.....")
                for line in tgf:
                    parts = line.strip().split()
                    if len(parts) >= 2:
                        # self.variable_names.append(parts[1].lower())
                        if parts[1].lower() not in self.var2idx:
                            self.var2idx[parts[1].lower()] = self.var_idx
                            self.idx2var[self.var_idx] = parts[1].lower()
                            self.var_idx += 1
                        print(f"Tags insert {parts[1].lower()} in {self.var_idx}")
        except Exception as e:
            print(f"Error: cannot read schema file {tags_file}")
        
        nlp = spacy.load('en_core_web_sm')
        print(f"{self.idx2var}")
        print(f"{self.var2idx}")
        self.name2idx = {} # tags mapping
        self.idx2name = {}
        self.name_idx = 2
        self.name2idx["-"] = 0
        self.idx2name[0] = "-"
        self.name2idx["PAD"] = 1
        self.idx2name[1] = "PAD"
        
        self.word2idx = {} # word mapping
        self.idx2word = {}
        self.word_idx = 1
        self.word2idx["PAD"] = 0
        self.idx2word[0] = "PAD"
        self.word2idx["UNK"] = 1
        self.idx2word[1] = "UNK"
        
        self.template2idx = {} # sql template mapping
        self.idx2template = {}
        self.template_idx = 0
        self.var2dtype = {} # variable & datatype mapping
        
        self.train_data = [] # Training Dataset
        self.dev_data = [] # Dev Dataset
        self.test_data = [] # Testing Dataset
        
        try:
            with open(type_file, 'r', encoding='utf-8') as tf:
                print(f"Loading datatype of variables for additional information on learning...")
                next(tf)
                for line in tf:
                    parts = line.replace(",", "").strip().split()
                    self.var2dtype[parts[1].lower()] = parts[-1].lower()
                    type_set = sorted(set(self.var2dtype.values()))
                    self.dtype2idx = {t:i for i, t in enumerate(type_set)}
                    self.idx2dtype = {i:t for t, i in self.dtype2idx.items()}
                    print(f"Loading new datatype {parts[1].lower()} : {parts[-1].lower()}")
        except Exception as e:
            print(f"Error: cannot read schema file {type_file}")
            
        with open(data_file, 'r', encoding='utf-8') as df:
            print(f"Loading all data in json...")
            dataset = json.load(df)
            print(f"Loading sql template...")
            for obj in dataset:
                for sql in obj['sql']:
                    self.template2idx[sql] = self.template_idx
                    self.idx2template[self.template_idx] = sql
                    self.template_idx += 1
                    print(f"Loading new template, index: {self.template_idx}")
            var_type = {}
            print(f"processing samples...")
            for obj in dataset:
                split = obj['query-split']
                for v in obj['variables']:
                    var_type[v['name']] = v['type'].lower()
                    if v['name'] not in self.name2idx:
                        self.name2idx[v['name']] = self.name_idx
                        self.idx2name[self.name_idx] = v['name']
                        self.name_idx += 1
                for sentence in obj['sentences']:
                    for var in sentence['variables'].keys():
                        nlp.tokenizer.add_special_case(var, [{ORTH: var}]) # add variable to special case preventing tokensisation 
                    text = sentence['text']
                    doc = nlp(text)
                    tokens = [tok.text.lower() for tok in doc]
                    labels = [self.name2idx['-']] * len(tokens)
                    types = [self.var2idx['-']] * len(tokens)
                    dtypes = [self.dtype2idx[self.var2dtype['-']]] * len(tokens)
                    for i, tok in enumerate(tokens):
                        if tok in var_type and var_type[tok] in self.var2idx:
                            labels[i] = self.name2idx[tok]
                            dtypes[i] = self.dtype2idx[self.var2dtype[var_type[tok]]]
                            types[i] = self.var2idx[var_type[tok]]
                    tokens_sp = [sentence['variables'].get(tok, tok) for tok in tokens]
                    for sql in obj['sql']:
                        template_id = self.template2idx[sql]
                        sample = {'tokens': tokens_sp, 'vars': labels, 'type':types, 'dtype': dtypes, 'template': template_id, 'split': split}
                        # structure of samples:
                        # tokens: texts with tokenisation(SpaCy) and word embedding(GloVe)
                        # vars: tags of each word(default: '-') with name2idx mapping
                        # types: type of each word(default: '-') with var2idx mapping
                        # dtypes: datatype of each word(default: '-') with dtype2idx mapping for additional information support
                        # template_id: SQL template of each text, as there is probably more than one template for a text, I store the (question, sql) template with full connection
                        # split: reference by query-split/question split for dividing samples to diff datasets
                        print(f"Add a new sample with {split}: {sample}")
                        if split == 'train':
                            self.train_data.append(sample)
                        elif split == 'dev':
                            self.dev_data.append(sample)
                        elif split == 'test':
                            self.test_data.append(sample)
                        else:
                            print(f"this sample not belongs to any dataset, adding it to training dataset..")
                            self.train_data.append(sample)
            print(f"length of training set: {len(self.train_data)}")
            print(f"length of training set: {len(self.dev_data)}")
            print(f"length of training set: {len(self.test_data)}")
            self.wordmapping()
            self.glovemapping()


    def wordmapping(self):
        # traverse all samples to construct vocabulary graph and mapping to index
        for sample in self.train_data:
            for token in sample['tokens']:
                if token not in self.word2idx:
                    self.word2idx[token] = self.word_idx
                    self.idx2word[self.word_idx] = token
                    self.word_idx += 1
                    print(f"add a new word: {token}")

    def glovemapping(self):
        # using GloVe for embedding word vectors
        glove_dict = {}
        with open('glove_path', 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0: dims = len(line.split()) - 1
                parts = line.strip().split()
                word = parts[0]
                vec = torch.tensor([float(x) for x in parts[1:]], dtype=torch.float)
                glove_dict[word] = vec
        vocab_size = len(self.word2idx)

        self.embedding_matrix = torch.randn(vocab_size, dims) * 0.1
        self.embedding_matrix[0] = torch.zeros(dims)
        for word, idx in self.word2idx.items():
            if word in glove_dict:
                self.embedding_matrix[idx] = glove_dict[word]
        del glove_dict

    def getDataLoader(self, split="train", batch_size=32, shuffle=True):
        # return specific dataloader
        if split == "train":
            dataset = TextDataset(self.train_data)
        elif split == "dev":
            dataset = TextDataset(self.dev_data)
        elif split == "test":
            dataset = TextDataset(self.test_data)
        else:
            raise ValueError("Unknown split: {}".format(split))
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=self.collate_fn)

    def collate_fn(self, batch):
        batch_size = len(batch)
        max_len = (max(len(sample["tokens"]) for sample in batch))
        word_idx = torch.zeros(batch_size, max_len, dtype=torch.long)  # word2idx[0] = PAD
        label_idx = torch.full((batch_size, max_len), fill_value=-100, dtype=torch.long) # labels of each word
        type_idx = torch.zeros(batch_size, max_len, dtype=torch.long)   # type of labels
        dtype_idx = torch.zeros(batch_size, max_len, dtype=torch.long)  # datatype of types
        class_labels = torch.zeros(batch_size, dtype=torch.long)        # SQL template of each sample
        for i, sample in enumerate(batch):
            seq_len = len(sample["tokens"])
            for j, token in enumerate(sample["tokens"]):
                word_idx[i, j] = self.word2idx.get(token, self.word2idx['UNK'])
            label_idx[i, :seq_len] = torch.tensor(sample["vars"], dtype=torch.long)
            type_idx[i, :seq_len] = torch.tensor(sample["type"], dtype=torch.long)
            dtype_idx[i, :seq_len] = torch.tensor(sample["dtype"], dtype=torch.long)
            class_labels[i] = sample["template"]
        return word_idx, label_idx, type_idx, dtype_idx, class_labels

In [65]:
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

In [27]:

with open('atis.json', 'r', encoding='utf-8') as df:
    dataset = json.load(df)
    for obj in dataset:
        for sentence in obj['sentences']:
            print(sentence)

{'text': 'list all the flights that arrive at airport_code0 from various cities', 'question-split': 'train', 'variables': {'airport_code0': 'MKE'}}
{'text': 'what flights from any city land at airport_code0', 'question-split': 'train', 'variables': {'airport_code0': 'MKE'}}
{'text': 'show me the flights into airport_code0', 'question-split': 'train', 'variables': {'airport_code0': 'DAL'}}
{'text': 'show me the flights arriving at airport_code0', 'question-split': 'train', 'variables': {'airport_code0': 'DAL'}}
{'text': 'list all the flights that arrive at airport_code0', 'question-split': 'train', 'variables': {'airport_code0': 'MKE'}}
{'text': 'list all the arriving flights at airport_code0', 'question-split': 'train', 'variables': {'airport_code0': 'MKE'}}
{'text': 'what flights land at airport_code0', 'question-split': 'train', 'variables': {'airport_code0': 'MKE'}}
{'text': 'show me the flights to airport_code0', 'question-split': 'train', 'variables': {'airport_code0': 'DAL'}}
{'t

In [11]:
var2type = {}
with open('atis-schema.csv', 'r', encoding='utf-8') as tf:
    print(f"Loading datatype of variables for additional information on learning...")
    next(tf)
    for line in tf:
        parts = line.replace(",", "").strip().split()
        var2type[parts[1].lower()] = parts[-1].lower()
        type_set = sorted(set(var2type.values()))
        dtype2idx = {t:i for i, t in enumerate(type_set)}
        idx2dtype = {i:t for t, i in dtype2idx.items()}
        print(f"Loading new datatype {parts[1].lower()} : {parts[-1].lower()}")
print(dtype2idx)
print(idx2dtype)

Loading datatype of variables for additional information on learning...
Loading new datatype aircraft_code : varchar(3)
Loading new datatype aircraft_description : varchar(50)
Loading new datatype manufacturer : varchar(30)
Loading new datatype basic_type : varchar(30)
Loading new datatype engines : int(11)
Loading new datatype propulsion : varchar(10)
Loading new datatype wide_body : varchar(3)
Loading new datatype wing_span : int(11)
Loading new datatype length : int(11)
Loading new datatype weight : int(11)
Loading new datatype capacity : int(11)
Loading new datatype pay_load : int(11)
Loading new datatype cruising_speed : int(11)
Loading new datatype range_miles : int(11)
Loading new datatype pressurized : varchar(3)
Loading new datatype - : -
Loading new datatype airline_code : varchar(2)
Loading new datatype airline_name : text
Loading new datatype note : text
Loading new datatype - : -
Loading new datatype airport_code : varchar(3)
Loading new datatype airport_name : text
Loadin

In [17]:
processor = atisDataProcessor(tags_file='atis-fields.txt', data_file='atis.json', type_file='atis-schema.csv', glove_path='glove.6B.50d.txt')

Tags Loading.....
Tags insert aircraft_code in 2
Tags insert aircraft_description in 3
Tags insert basic_type in 4
Tags insert capacity in 5
Tags insert cruising_speed in 6
Tags insert engines in 7
Tags insert length in 8
Tags insert manufacturer in 9
Tags insert pay_load in 10
Tags insert pressurized in 11
Tags insert propulsion in 12
Tags insert range_miles in 13
Tags insert weight in 14
Tags insert wide_body in 15
Tags insert wing_span in 16
Tags insert airline_code in 17
Tags insert airline_name in 18
Tags insert note in 19
Tags insert airport_code in 20
Tags insert airport_location in 21
Tags insert airport_name in 22
Tags insert country_name in 23
Tags insert minimum_connect_time in 24
Tags insert state_code in 25
Tags insert time_zone_code in 26
Tags insert airport_code in 26
Tags insert city_code in 27
Tags insert direction in 28
Tags insert miles_distant in 29
Tags insert minutes_distant in 30
Tags insert city_code in 30
Tags insert city_name in 31
Tags insert country_name in 

FileNotFoundError: [Errno 2] No such file or directory: 'glove_path'

In [19]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -------- ------------------------------- 2.6/12.8 MB 16.9 MB/s eta 0:00:01
     -------------------- ------------------- 6.6/12.8 MB 16.8 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 15.9 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 16.4 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
