# Sequence To SQL

Welcome to our project in the Advanced Natural Language Processing course

We try to build it with the data provided in https://github.com/salesforce/WikiSQL

Remove this: https://towardsdatascience.com/text-to-sql-learning-to-query-tables-with-natural-language-7d714e60a70d?gi=6b6c7e91e298

In [None]:
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm, trange
from torch import nn

In [None]:
import torch

# Choose the compute device once: the current CUDA device when one is
# available, otherwise the string 'cpu'.
if not torch.cuda.is_available():
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'
else:
    device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(device)
    print('Current device:', gpu_name)

# Data collection and Review
Clone the data from the WikiSQL git repository and install them.

Take a look inside the data

In [None]:
import json

def read_json_data_from_file(file: str):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of ``file`` is parsed with ``json.loads``. Blank
    lines (for example a trailing empty line) are skipped instead of
    aborting the whole read with a decode error.

    Args:
        file: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ret_data = []
    # Be explicit about the encoding instead of relying on the platform default.
    with open(file, encoding='utf-8') as json_file:
        # Read all lines up front so the progress bar knows the total.
        lines = json_file.readlines()

    for line in tqdm(lines):
        line = line.strip()
        if not line:
            # readlines() never yields '' -- an empty line arrives as '\n',
            # so it has to be stripped before it can be recognised.
            continue
        ret_data.append(json.loads(line))
    return ret_data

def convert_to_id_dict(data, id_key: str):
    """Index a sequence of dicts by the value stored under ``id_key``.

    Elements that lack ``id_key`` are reported on stdout and left out.
    When several elements share the same id, the last one wins.

    Args:
        data: Iterable of dict-like records.
        id_key: Key whose value becomes the key of the returned dict.

    Returns:
        A dict mapping ``element[id_key]`` to the element itself.
    """
    indexed = {}
    for record in data:
        if id_key not in record:
            print(f'Element {record} doenst contain key {id_key}')
            continue
        indexed[record[id_key]] = record
    return indexed

Let's see if we successfully deserialized the data into objects.

In [None]:
# Folder that holds the WikiSQL files, relative to this notebook.
data_folder = "../data"

# Load the dev split: the questions and the tables they refer to.
dev_req_data = read_json_data_from_file(data_folder + '/dev.jsonl')
dev_table_data = read_json_data_from_file(data_folder + '/dev.tables.jsonl')

request_count = len(dev_req_data)
table_count = len(dev_table_data)
print(f'We have {request_count} dev data with {table_count} tables.')
print('An example Request: ')
print(json.dumps(dev_req_data[0], indent=2))

### The fields represent the following:

* `phase`: the phase in which the dataset was collected. We collected WikiSQL in two phases.
* `question`: the natural language question written by the worker.
* `table_id`: the ID of the table to which this question is addressed.
* `sql`: the SQL query corresponding to the question. This has the following subfields:
  * `sel`: the numerical index of the column that is being selected. You can find the actual column from the table.
  * `agg`: the numerical index of the aggregation operator that is being used. You can find the actual operator from Query.agg_ops in lib/query.py.
  * `conds`: a list of triplets (column_index, operator_index, condition) where:
    * `column_index`: the numerical index of the condition column that is being used. You can find the actual column from the table.
    * `operator_index`: the numerical index of the condition operator that is being used. You can find the actual operator from Query.cond_ops in lib/query.py.
    * `condition`: the comparison value for the condition, in either string or float type.


## An example Table

In [None]:
# Show the first table record of the dev split as pretty-printed JSON.
print(json.dumps(dev_table_data[0], indent=2))

## Preprocess

The data is stored with indices, but we need the actual column names, so we enrich the requests with the data from their tables.

In [None]:
# Transform the table data into a dictionary indexed by each table's 'id'
dev_table_data_dict = convert_to_id_dict(dev_table_data, 'id')

In [None]:
# Get the preliminary data
# Maybe we want the other indexes as well

In [None]:
def get_table_column(data_list, tables_dict):
    """Enrich every request with information from the table it refers to.

    The requests are modified in place. Each element gains the keys
    ``columns``, ``types`` and ``table_name`` plus ``sql['sel_name']``, and
    the column index at position 0 of every ``sql['conds']`` entry is
    replaced by the column name.

    The function is safe to call more than once on the same requests:
    condition columns that were already translated to names are left alone.

    Args:
        data_list: Iterable of request dicts with ``table_id`` and ``sql``
            (containing ``sel`` and ``conds``).
        tables_dict: Mapping from table id to a table dict with ``header``
            and ``types``.

    Returns:
        A list holding the same (now enriched) request objects.

    Raises:
        KeyError: If a request references a table id missing from
            ``tables_dict`` or a required key is absent.
    """
    # Candidate fields for a human readable table name, in order of preference.
    title_keys = ('page_title', 'section_title', 'caption', 'name')

    ret_list = []
    for element in data_list:
        current_table = tables_dict[element['table_id']]
        columns = current_table['header']

        element['columns'] = columns
        element['types'] = current_table['types']
        element['sql']['sel_name'] = columns[element['sql']['sel']]

        # Always set a table name: consumers index request['table_name']
        # unconditionally, so fall back to the table id when the table
        # carries none of the title fields.
        element['table_name'] = next(
            (current_table[key] for key in title_keys if key in current_table),
            element['table_id'],
        )

        # For the where conditions: swap the column index for its name.
        for cond in element['sql']['conds']:
            # A second pass over the same request finds names here already;
            # indexing `columns` with a string would raise TypeError.
            if isinstance(cond[0], int):
                cond[0] = columns[cond[0]]
        ret_list.append(element)
    return ret_list


In [None]:
# get_table_column rewrites the request dicts in place, so start from
# freshly loaded data before enriching it.
dev_req_data = read_json_data_from_file(data_folder + '/dev.jsonl')
dev_table_data = read_json_data_from_file(data_folder + '/dev.tables.jsonl')

dev_prep_req_data = get_table_column(dev_req_data, dev_table_data_dict)

# Show the second to last request after enrichment.
print('Filed in with the Columns: ')
enriched_example = dev_prep_req_data[-2]
print(json.dumps(enriched_example, indent=2))


In [None]:
# TODO figure out a good padding size or how to do padding correctly

def get_question_answers(request, tokenizer):
    """Encode every column of a request together with its question.

    For each column a text of the form ``"<type> <table name> <column>"`` is
    paired with the question and passed to ``tokenizer.encode_plus``. A first
    pass measures the longest ``input_ids``; a second pass encodes again with
    ``padding='max_length'`` so that all encodings share that length.

    Args:
        request: Enriched request dict with ``table_name``, ``columns``,
            ``types`` and ``question``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).

    Returns:
        A list with one padded encoding per column, in column order.
    """
    table_name = request['table_name']  # should be name not id
    space_token = ' '
    columns = request['columns']
    req_question = request['question']  # might need to be tokenized

    def describe(position, column):
        # Textual stand-in for one column: type, table name, column name.
        column_type = request['types'][position]  # infer type somehow
        return space_token.join((column_type, table_name, column))

    # Pass 1: find the length of the longest unpadded encoding.
    longest = 0
    for position, column in enumerate(columns):
        probe = tokenizer.encode_plus(
            describe(position, column),
            req_question,
            add_special_tokens=True,
        )
        longest = max(longest, len(probe['input_ids']))

    # Pass 2: encode again, padding every column to that common length.
    return [
        tokenizer.encode_plus(
            describe(position, column),
            req_question,
            add_special_tokens=True,
            max_length=longest,
            padding='max_length',
            return_overflowing_tokens=True,
            return_attention_mask=True,
        )
        for position, column in enumerate(columns)
    ]

In [None]:
# Default padding length used when the caller does not pass ``max_length``.
pad_max_length = 50


def get_question_answers_def_length(request, tokenizer, max_length=None):
    """Encode every column of a request with the question at a fixed length.

    For each column a text of the form ``"<type> <table name> <column>"`` is
    paired with the question and encoded with ``tokenizer.encode_plus`` using
    ``padding='max_length'``.

    Args:
        request: Enriched request dict with ``table_name``, ``columns``,
            ``types`` and ``question``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_length: Length to pad to. ``None`` (the default) uses the
            module-level ``pad_max_length``.

    Returns:
        A list with one encoding per column, in column order.
    """
    if max_length is None:
        # Looked up at call time so that changing pad_max_length later
        # still takes effect, exactly as before this parameter existed.
        max_length = pad_max_length

    table_name = request['table_name']  # should be name not id
    space_token = ' '
    columns = request['columns']
    req_question = request['question']  # might need to be tokenized

    input_list = []
    for i, col in enumerate(columns):
        col_type = request['types'][i]  # infer type somehow
        column_representation = col_type + space_token + table_name + space_token + col
        # NOTE(review): no truncation is requested, so pairs longer than
        # max_length are presumably not shortened -- confirm.
        embedding = tokenizer.encode_plus(
            column_representation,
            req_question,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_overflowing_tokens=True,
            return_attention_mask=True,
        )
        input_list.append(embedding)
    return input_list

# Create a data loader that transforms the requests into model inputs

In [None]:
from enum import Enum
class Task(Enum):
    """Prediction tasks a dataset sample can be built for."""
    # Predict the selected column; the dataset uses the request's
    # sql['sel'] index as the target for this task.
    SELECT = 1

In [None]:
from torch.utils.data import Dataset, DataLoader

class WikiSQLDataset(Dataset):
    """Dataset that turns enriched requests into tensors for a given task.

    Each sample holds the original request, one row of ``input_ids``,
    ``token_type_ids`` and ``attention_mask`` per table column, and the
    task specific ``target``.
    """

    def __init__(self, requests, tokenizer, task: Task):
        """Store the requests, the tokenizer used to encode them and the task."""
        self.requests = requests
        self.tokenizer = tokenizer
        self.task = task

    def __len__(self):
        """Return the number of requests."""
        return len(self.requests)

    def __getitem__(self, item):
        """Build the sample for the request at position ``item``.

        Returns:
            A dict with the keys ``request``, ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``target``. ``target``
            is a one-element long tensor holding ``sql['sel']`` for
            ``Task.SELECT`` and ``None`` for any other task.
        """
        req = self.requests[item]
        encodings = get_question_answers(req, self.tokenizer)

        # Collect each field across all column encodings before tensorising.
        per_column = {
            field: [encoding[field] for encoding in encodings]
            for field in ('input_ids', 'token_type_ids', 'attention_mask')
        }

        if self.task == Task.SELECT:
            target = torch.tensor([req['sql']['sel']], dtype=torch.long)
        else:
            target = None

        sample = {'request': self.requests[item]}
        for field, rows in per_column.items():
            sample[field] = torch.tensor(rows)
        sample['target'] = target
        return sample

def get_data_loader(data_type, tokenizer, task, batch_size):
    """Load one data split from ``data_folder`` and wrap it in a DataLoader.

    Reads ``<data_type>.jsonl`` and ``<data_type>.tables.jsonl``, enriches the
    requests with their table columns and prints how many requests and
    tables were loaded.

    Args:
        data_type: Name of the split, used as the file name prefix.
        tokenizer: Tokenizer handed to the dataset.
        task: Task handed to the dataset.
        batch_size: Batch size of the returned loader.

    Returns:
        A ``DataLoader`` over a ``WikiSQLDataset`` of the enriched requests.
    """
    # TODO check if we can use DataLoader with batch size as done in the tutorial
    requests_path = f'{data_folder}/{data_type}.jsonl'
    tables_path = f'{data_folder}/{data_type}.tables.jsonl'

    loaded_req = read_json_data_from_file(requests_path)
    loaded_tables = read_json_data_from_file(tables_path)

    tables_by_id = convert_to_id_dict(loaded_tables, 'id')
    prep_req_data = get_table_column(loaded_req, tables_by_id)

    request_count, table_count = len(loaded_req), len(loaded_tables)
    print(f'We have {request_count} {data_type} data with {table_count} tables.')

    dataset = WikiSQLDataset(requests=prep_req_data, tokenizer=tokenizer, task=task)
    return DataLoader(dataset, batch_size=batch_size)

from transformers import BertTokenizer

# Folder that holds the data files.
data_folder = '../data'

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Smoke test: build a loader over the 'dev' split and pull one batch from it.
train_data_loader = get_data_loader(
    data_type='dev',
    tokenizer=tokenizer,
    task=Task.SELECT,
    batch_size=1,
)

batch_iterator = iter(train_data_loader)
data = next(batch_iterator)
print(data.keys())
data

# Build the model

In this section, we will look into **contextual embeddings**. 

For this we use [**pretrained BERT**](https://www.aclweb.org/anthology/N19-1423.pdf) provided via [HuggingFace](https://huggingface.co/).

Let's first install the HuggingFace python package:

In [None]:
# Do imports here
from transformers import BertTokenizer

# Load the tokenizer of the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Show the first enriched request, then every column encoding of it together
# with the token strings its ids map back to.
first_request = dev_prep_req_data[0]
print(first_request)
for tokens in get_question_answers(first_request, tokenizer):
    print(tokens)
    token_strings = tokenizer.convert_ids_to_tokens(tokens['input_ids'])
    print(token_strings)

In [None]:
# General model configuration
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Separator and classification tokens as defined by the tokenizer.
sep_token = tokenizer.sep_token
cls_token = tokenizer.cls_token

In [None]:
# reference is hydranet https://arxiv.org/pdf/2008.04759.pdf


In [None]:
from transformers import BertModel

# Load the pretrained BERT encoder named by PRE_TRAINED_MODEL_NAME.
model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Encode the second to last enriched request and split the per-column
# encodings into one list per field.
sample_request = dev_prep_req_data[-2]
req_embeddings = get_question_answers(sample_request, tokenizer)
print(req_embeddings)
input_ids = list(map(lambda encoding: encoding['input_ids'], req_embeddings))
token_type_ids = list(map(lambda encoding: encoding['token_type_ids'], req_embeddings))
attention_mask = list(map(lambda encoding: encoding['attention_mask'], req_embeddings))

In [None]:
# Print the three field lists, one per line.
print(input_ids, token_type_ids, attention_mask, sep='\n')

In [None]:
# Convert the per-column encodings into tensors, one row per column.
input_ids_tensor = torch.tensor(input_ids)
token_type_ids_tensor = torch.tensor(token_type_ids)
attention_mask_tensor = torch.tensor(attention_mask)

print(input_ids_tensor.shape)
print(token_type_ids_tensor.shape)
print(attention_mask_tensor.shape)

# Each keyword must receive its matching tensor: the mask and the segment
# ids were previously passed to each other's parameter.
outputs = model(
    input_ids=input_ids_tensor,  # The tokens representing our input text.
    attention_mask=attention_mask_tensor,  # Marks real tokens versus padding.
    token_type_ids=token_type_ids_tensor,  # The segment IDs to differentiate the column text from the question.
)
outputs

In [None]:
# Inspect the shapes of the last hidden state and of the pooled output.
print(f'last hidden state  : {outputs.last_hidden_state.shape}')
print(f'pooled output layer: {outputs.pooler_output.shape}')

In [None]:
from torch import nn
import torch.nn.functional as F

# weight = torch.rand(model.config.hidden_size)
# F.sigmoid(F.linear(outputs.pooler_output, weight))

# Try out a freshly initialised scoring head on the pooled BERT output:
# one score per row, squashed with a sigmoid and normalised across rows.
out = nn.Linear(model.config.hidden_size, 1)
pooled_scores = out(outputs.pooler_output)
squashed_scores = torch.sigmoid(pooled_scores)
out2 = torch.softmax(squashed_scores, dim=0)

torch.argmax(out2)

# Train

In [None]:
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup

# Number of passes over the training data.
EPOCHS = 10
# NOTE(review): this optimizer is built from the parameters of `model` (the
# plain BertModel loaded above), while the training loop further down trains
# `selection_ranker` -- confirm which model is meant to be optimised.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# NOTE(review): the step count is derived from the dev requests, although the
# scheduler is stepped once per batch of the loader passed to train_epoch --
# confirm.
total_steps = len(dev_prep_req_data) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
# Loss over the per-column scores; moved to the selected device.
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# Build the loaders for the SELECT task from the 'train' and 'dev' splits,
# one request per batch.
train_data_loader = get_data_loader(data_type='train', tokenizer = tokenizer, task = Task.SELECT, batch_size = 1)
val_data_loader = get_data_loader(data_type='dev', tokenizer = tokenizer, task = Task.SELECT, batch_size = 1)
# test_data_loader = get_data_loader('train', tokenizer)

In [None]:
class SelectionRanker(nn.Module):
    """Score every column of one request for being the selected column.

    BERT encodes each (column text, question) pair; its pooled output goes
    through dropout and a linear layer that yields one score per column.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one unnormalised score (logit) per column.

        Args:
            input_ids: Token ids with a leading batch dimension of size 1,
                expected as ``(1, n_columns, seq_len)``.
            attention_mask: Attention mask with the same layout.
            token_type_ids: Segment ids with the same layout.

        Returns:
            A tensor of shape ``(1, n_columns)`` holding the raw scores.
            The arg-max over dimension 1 is the predicted column.
        """
        # Drop the batch dimension so the columns of the single request
        # become the batch that BERT processes.
        outputs = self.bert(
            input_ids=input_ids.squeeze(0),
            attention_mask=attention_mask.squeeze(0),
            token_type_ids=token_type_ids.squeeze(0)
        )
        pooled = self.drop(outputs.pooler_output)
        logits = self.linear(pooled)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # so a sigmoid followed by a softmax here would squash the scores
        # twice and leave the loss with a floor it can never get below.
        return torch.transpose(logits, 0, 1)

# Instantiate the ranker and move it to the selected device.
selection_ranker = SelectionRanker()
selection_ranker = selection_ranker.to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; must return one row of scores per target.
        data_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``target``.
        loss_fn: Loss called as ``loss_fn(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Number of examples used to normalise the accuracy.

    Returns:
        A tuple ``(accuracy, mean_loss)``: the fraction of correct
        predictions as a Python float and the mean of the per-batch losses.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        token_type_ids = d["token_type_ids"].to(device)
        # The per-sample target has one element; drop that dimension so the
        # targets line up with the rows of `outputs`.
        targets = d["target"].to(device).squeeze(1)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # torch.max with `dim` returns (values, indices); only the indices
        # are predictions and can be compared with the targets.
        _, predictions = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += (predictions == targets).sum().item()
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # A plain int count divides into a float; int has no .double().
    return correct_predictions / n_examples, np.mean(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; must return one row of scores per target.
        data_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``target``.
        loss_fn: Loss called as ``loss_fn(outputs, targets)``.
        device: Device the batch tensors are moved to.
        n_examples: Number of examples used to normalise the accuracy.

    Returns:
        A tuple ``(accuracy, mean_loss)``: the fraction of correct
        predictions as a Python float and the mean of the per-batch losses.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            token_type_ids = d["token_type_ids"].to(device)
            # Drop the one-element target dimension so the targets line up
            # with the rows of `outputs`.
            targets = d["target"].to(device).squeeze(1)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

            # torch.max with `dim` returns (values, indices); only the
            # indices are predictions.
            _, predictions = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += (predictions == targets).sum().item()
            losses.append(loss.item())

    # A plain int count divides into a float; int has no .double().
    return correct_predictions / n_examples, np.mean(losses)

# The optimizer and the scheduler have to be built from the model that is
# actually trained below (`selection_ranker`), and the schedule has to span
# the number of optimisation steps: one per training batch and epoch.
optimizer = AdamW(selection_ranker.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        selection_ranker,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        # Number of examples, not number of batches.
        len(train_data_loader.dataset)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        selection_ranker,
        val_data_loader,
        loss_fn,
        device,
        len(val_data_loader.dataset)
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    if val_acc > best_accuracy:
        # Checkpoint the model that was trained, not the bare BERT `model`.
        torch.save(selection_ranker.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc


In [None]:
# Pull a single batch and run it through the ranker to inspect shapes and loss.
d = next(iter(train_data_loader))
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
token_type_ids = d["token_type_ids"].to(device)
targets = d["target"].to(device)

# Print the shapes of the batch that was just loaded, not of the
# `*_tensor` variables left over from the earlier BERT smoke test.
print(input_ids.shape)
print(token_type_ids.shape)
print(attention_mask.shape)

outputs = selection_ranker(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)

# NOTE(review): this rebinds the global `loss_fn` that the training loop
# uses, and nn.NLLLoss expects log-probabilities as input -- confirm that
# both are intended.
loss_fn = nn.NLLLoss().to(device)

print(outputs.shape, outputs.squeeze(1).shape)
print(targets.shape, targets.squeeze(1).shape)

print(outputs.shape, outputs.squeeze(1))
print(targets.shape, targets.squeeze(1))
# targets has shape [1, 1] rather than [1] because the dataset returns a
# target of shape [1] per sample and the DataLoader adds the batch
# dimension in front of it.

target = torch.tensor([2], dtype=torch.long).to(device)
print(target.shape, target)


loss = loss_fn(outputs, target)