In [1]:
import json
from collections import Counter, defaultdict
import pickle
from utils import *
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import argparse
from torchtext.vocab import GloVe

2023-03-20 10:13:52.097961: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0/lib64
2023-03-20 10:13:52.141382: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0/lib64
2023-03-20 10:13:52.141410: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1835] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
class Text2SQLDataset(Dataset):
    def __init__(self, file_path, data_prefix = "train"):
        self.file_path = file_path
        self.data = pd.read_excel(os.path.join(file_path, f"{data_prefix}_data.xlsx"))
        print("Dataset Length =", len(self.data))
        with open(os.path.join(file_path, "encoder.vocab"), "r") as file:
            vocab = file.readlines()
        self.encoder_vocab = vocab
        
        with open(os.path.join(file_path, "decoder.vocab"), "r") as file:
            vocab = file.readlines()
        self.decoder_vocab = vocab
        
        with open(os.path.join(file_path, "encoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(file_path, "encoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
            
        self.en_word2idx = word2idx
        self.en_idx2word = idx2word
        
        with open(os.path.join(file_path, "decoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(file_path, "decoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
            
        self.de_word2idx = word2idx
        self.de_idx2word = idx2word
        print("Encoder Vocab Size = {}, Decoder Vocab Size = {}".format(len(self.en_word2idx), len(self.de_word2idx)))
        
    def __len__(self):        
        return len(self.data)
    
    def __getitem__(self, idx):
#         print(idx, "\n")
        try:
            query = ["<sos>"]
            question = ["<sos>"]
            query = ["<sos>"] + tokenize_query(self.data.loc[idx, "query"]) + ["<eos>"]
            question =  ["<sos>"] + tokenize_question(self.data.loc[idx, "question"]) + ["<eos>"]

            query = [self.en_word2idx[q] if q in self.en_word2idx else self.en_word2idx["<unk>"] for q in query]
            question = [self.en_word2idx[q] if q in self.en_word2idx else self.en_word2idx["<unk>"] for q in question]

            sample = {'question': question, 'query': query}
        except:
            print(idx)
            
        return sample
    
def collate(batch):
    
    max_len_ques = max([len(sample['question']) for sample in batch])
    max_len_query = max([len(sample['query']) for sample in batch])
    
    ques_lens = torch.zeros(len(batch), dtype=torch.long)
    padded_ques = torch.zeros((len(batch), max_len_ques), dtype=torch.long)
    
    query_lens = torch.zeros(len(batch), dtype=torch.long)
    padded_query = torch.zeros((len(batch), max_len_query), dtype=torch.long)
    
    for idx in range(len(batch)):
        
        query = batch[idx]['query']
        question = batch[idx]['question']
        
        ques_len = len(question)
        query_len = len(query)
        ques_lens[idx] = ques_len
        query_lens[idx] = query_len
        
        padded_ques[idx, :ques_len] = torch.LongTensor(question)
        padded_query[idx, :query_len] = torch.LongTensor(query)
        
    return {'question': padded_ques, 'query': padded_query, 'ques_lens': query_lens, 'query_lens': query_lens}

train_dataset = Text2SQLDataset("./processed_data/", "train")

Dataset Length = 7754
Encoder Vocab Size = 2040, Decoder Vocab Size = 2342


In [15]:
SPECIAL_TOKENS = ["<pad>", "<unk>", "<sos>", "<eos>", "<num_value>", "<str_value>"]
SQL_KEYWORDS = ["t"+str(i+1) for i in range(10)] + [".", ",", "(", ")", "in", "not", "and", "between", "or", "where",
            "except", "union", "intersect",
            "group", "by", "order", "limit", "having","asc", "desc",
            "count", "sum", "avg", "max", "min",
           "<", ">", "=", "!=", ">=", "<=",
            "like",
            "distinct","*",
            "join", "on", "as", "select", "from"
           ] 
SQL_KEYWORDS = dict(zip(SQL_KEYWORDS, [10]*len(SQL_KEYWORDS)))
class GloveEmbeddings():
    def __init__(self, embed_dim, word2idx):
        self.embed_dim = embed_dim
        self.word2idx = word2idx
        self.special_tokens = SPECIAL_TOKENS
        self.vocab_size = len(word2idx)
    
    def get_embedding_matrix(self):
        # Load pre-trained GloVe embeddings
        glove = GloVe(name='6B', dim=self.embed_dim)
        embedding_matrix = torch.zeros((self.vocab_size, self.embed_dim))

        embedding_matrix[0] = torch.zeros(self.embed_dim)    # Padding token
        for i in range(1,len(SPECIAL_TOKENS)):            
            embedding_matrix[i] = torch.randn(self.embed_dim)    # Start-of-sentence token
            
        for k, v in self.word2idx.items():
            if k in SPECIAL_TOKENS:
                continue
            else:            
                if k in glove.stoi:
                    embedding_matrix[v] = torch.tensor(glove.vectors[glove.stoi[k]])
                else:
                    embedding_matrix[v] = embedding_matrix[1]
#                     print("unknown token", v)

        return embedding_matrix


class LSTMEncoder(nn.Module):
    def __init__(self, input_size, embed_dim, hidden_units=1024, num_layers=1, p = 0.5, bidirectional=False, embed_matrix=None):
        super(LSTMEncoder, self).__init__()
        self.input_size = input_size
        self.embed_dim = embed_dim
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.bidirectional = bidirectional
        self.embed_matrix = None
        if self.embed_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=0)
        else:
            self.embedding = nn.Embedding(input_size, self.embed_dim, padding_idx=0)
        self.LSTM = nn.LSTM(embed_dim, hidden_units, num_layers = num_layers, dropout=p, batch_first=True, bidirectional=bidirectional)
        
    def forward(self, x):
#         print("ENCODER INPUT SHAPE", x.shape)
        x = self.dropout(self.embedding(x))
#         print("ENCODER EMBEDDING SHAPE", x.shape)
        
        encoder_out, (ht, ct) = self.LSTM(x)        
#         print("ENCODER OUTPUT SHAPE: encoder_out, ht, ct", encoder_out.shape, ht.shape, ct.shape)
        
        return encoder_out, (ht, ct)
    
class LSTMDecoder(nn.Module):
    def __init__(self, input_size, embed_dim, hidden_units=1024, num_layers=1, p = 0.5):
        super(LSTMDecoder, self).__init__()
        self.input_size = input_size
        self.embed_dim = embed_dim
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, self.embed_dim, padding_idx=0)
        self.LSTM = nn.LSTM(embed_dim, hidden_units, num_layers = num_layers, dropout=p, batch_first=True)
        self.fc = nn.Linear(hidden_units, input_size)
        
    def forward(self, x, h0_c0):
#         print("|== Decoder Input Shape: x, h0_c0", x.shape, len(h0_c0), h0_c0[0].shape, h0_c0[1].shape)
        x = self.dropout(self.embedding(x))
#         print("|== Decoder Embeddings Shape: x", x.shape)
        x = x.unsqueeze(1)
#         print("|== Decoder Embeddings unsqueezed(0) Shape: x", x.shape)
        decoder_out, (ht, ct) = self.LSTM(x, h0_c0)
#         print("|== Decoder Output Shape Shape: decoder_out, ht, ct", decoder_out.shape, ht.shape, ct.shape)
        
        out = self.fc(decoder_out)
#         print("|== Decoder FC OUT Shape: out", out.shape)
        
        return out, (ht, ct)
    
class Seq2Seq(nn.Module):
    def __init__(self, args):
        super(Seq2Seq, self).__init__()
        self.args = args
        self.model_type = args.model_type
        self.embed_dim = args.embed_dim        
        self.encoder_hidden_units = args.en_hidden
        self.decoder_hidden_units = args.de_hidden
        self.encoder_num_layers = args.en_num_layers
        self.decoder_num_layers = args.de_num_layers
        self.processed_data = args.processed_data
        self.encoder_word2idx = self.get_encoder_word2idx()
        self.decoder_word2idx = self.get_decoder_word2idx()
        self.encoder_input_size = len(self.encoder_word2idx)
        self.decoder_input_size = len(self.decoder_word2idx)
        self.encoder = self.get_encoder()
        self.decoder = self.get_decoder()
        
        

    def get_encoder_word2idx(self):
        with open(os.path.join(self.processed_data, "encoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(self.processed_data, "encoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
            
#         self.en_word2idx = word2idx
#         self.en_idx2word = idx2word        
        
        return word2idx
    
    def get_decoder_word2idx(self):
        
        with open(os.path.join(self.processed_data, "decoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(self.processed_data, "decoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
            
#         self.de_word2idx = word2idx
#         self.de_idx2word = idx2word
        
        return word2idx
    
    def get_encoder(self):
        print("Loading GloVe embeddings...")
        glove = GloveEmbeddings(self.embed_dim, self.encoder_word2idx)
        embedding_matrix = glove.get_embedding_matrix()
        print("Loading Encoder...")
        encoder = LSTMEncoder(input_size = self.encoder_input_size, embed_dim = self.embed_dim, 
                              hidden_units=self.encoder_hidden_units, num_layers=self.encoder_num_layers, p = 0.3, bidirectional=False, embed_matrix=embedding_matrix)
        
        return encoder
    
    def get_decoder(self):
        
        if self.model_type == "Seq2Seq":
            print("Loading Seq2Seq LSTM Decoder...")
            decoder = LSTMDecoder(input_size = self.decoder_input_size, embed_dim = self.embed_dim, 
                              hidden_units=self.decoder_hidden_units, num_layers=self.decoder_num_layers, p = 0.3)
        
        elif self.model_type == "Seq2SeqAttn":
            pass
        else:
            pass
        return decoder
        
    def forward(self, question, query, tf_ratio=0.5):
        batch_size = question.shape[0]
        target_len = query.shape[1]
        
        _, (hidden, cell) = self.encoder(question)
        
        target_vocab_size = self.decoder_input_size
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(device)
        
        x = query[:,0]
        for t in range(1, target_len):
            output, (hidden, cell) = self.decoder(x, (hidden, cell))
#             print("Seq2seq out shape", output.shape)
            output = output.squeeze(1)
            outputs[:,t,:] = output
            x = output.argmax(dim=1)
        
        return outputs
        
        
        

    
    


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = Seq2Seq(args).to(device)
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
schedulers = [
        optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1, last_epoch=- 1, verbose=False),
        optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, verbose=False)
        ]
scheduler =  schedulers[1]

train_loader = DataLoader(train_dataset, batch_size = args.batch_size, shuffle=True, 
                          num_workers=args.num_workers, collate_fn=collate)
for epoch in range(args.epochs):
    print("Epoch:", epoch)
    for i, data in enumerate(train_loader):
    #     print(data['question'].shape, data['query'].shape, data['ques_lens'].shape, data['query_lens'].shape)
        optimizer.zero_grad()
        question = data['question'].to(device)
        query = data['query'].to(device)
        output = model(question, query)
    #     print("output and target", output.shape, query.shape)
        output = output.reshape(-1, output.shape[2])
        query = query.reshape(-1)    
    #     print("reshaped output and target", output.shape, query.shape)
        loss = criterion(output, query)
        loss.backward()
        
        optimizer.step()
        if i % 100 == 0:
            print(loss.item())
    scheduler.step()
#     break
    

cuda
Loading GloVe embeddings...
Loading Encoder...
Loading Seq2Seq LSTM Decoder...
Epoch: 0




7.7632012367248535
None
2.829655170440674
None
2.369840145111084
None
2.7263875007629395
None
2.661227226257324
None
Epoch: 1
2.6697356700897217
None
2.628690004348755
None
2.8142495155334473
None
2.4986259937286377
None
2.4327619075775146
None
Epoch: 2
2.422717332839966
None
2.607929229736328
None
2.367811918258667
None
2.3216230869293213
None
2.4766297340393066
None
Epoch: 3
2.3596885204315186
None
2.4122962951660156
None
2.5834128856658936
None
2.5238888263702393
None
2.4882805347442627
None
Epoch: 4
2.20483136177063
None
2.545891284942627
None
2.416046619415283
None
2.2803456783294678
None
2.310915470123291
None
Epoch: 5
2.8579797744750977
None
2.047302007675171
None
2.1798903942108154
None
2.186638116836548
None
2.6594316959381104
None
Epoch: 6
2.064645290374756
None
2.768460988998413
None
2.412048816680908
None
2.1769721508026123
None
2.1773533821105957
None
Epoch: 7
2.028174877166748
None
2.273200273513794
None
1.9562876224517822
None
1.6715636253356934
None
2.1585137844085693
N

In [133]:
!nvidia-smi

Mon Mar 20 10:11:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 470.63.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 5000     Off  | 00000000:C1:00.0 Off |                  Off |
| 33%   29C    P8    16W / 230W |   1375MiB / 16125MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
class ARGS():
    def __init__(self):
        self.model_type = "Seq2Seq"
        self.data_dir = "./data"
        self.batch_size = 16
        self.num_workers = 8
        self.epochs = 100
        self.en_hidden = 512
        self.de_hidden = 512
        self.en_num_layers = 1
        self.de_num_layers = 1
        self.embed_dim = 300
        self.processed_data = "./processed_data/"
        
args = ARGS()

def get_parser():
    """
    Generate a parameter parser
    """
    # parse parameters
    parser = argparse.ArgumentParser(description="Text2SQL")
    
    # model type
    parser.add_argument("--model_type", type=str, default="Seq2Seq", help="Select the model you want to run from [Seq2Seq, Seq2SeqAttn].")

    # path to data files.
    parser.add_argument("--data_dir", type=str, default="./data", help="Path to dataset directory.")

    # path to result files.
    parser.add_argument("--result_dir", type=str, default="./results", help="Path to dataset directory.")

    # path to model checkpoints.
    parser.add_argument("--checkpoint_dir", type=str, default="./checkpoints", help="Path to model checkpoints.")
    
    # path to model checkpoints.
    parser.add_argument("--processed_data", type=str, default="./processed_data", help="Path to processed data.")

    # batch size training
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size to be used during training.")

    # number of workers for dataloader
    parser.add_argument("--num_workers", type=int, default=8, help="Number of workers used for dataloading.")

    # max number of epochs
    parser.add_argument("--epochs", type=int, default=100, help="Number of workers used for dataloading.")

    parser.add_argument("--en_hidden", type=int, default=512, help="Encoder Hidden Units")
    
    parser.add_argument("--de_hidden", type=int, default=512, help="Decoder Hidden Units")

    parser.add_argument("--en_num_layers", type=int, default=1, help="Number of lstm layers in encoder.")
    
    parser.add_argument("--de_num_layers", type=int, default=1, help="Number of lstm layers in decoder.")    
    
    parser.add_argument("--embed_dim", type=int, default=300, help="Embeddings dimension for both encoder and decoder.")

    return parser
# parser = get_parser()
# args = parser.parse_args()
# args.data_dir = os.path.relpath(args.data_dir)
# print(args)


'Seq2Seq'