In [9]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import transformers
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, ConcatDataset
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset
from functools import partial
from transformers import get_linear_schedule_with_warmup, AutoConfig 
from transformers import BartTokenizer,BartModel,BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model
from transformers import BartConfig
from transformers import AutoTokenizer
from transformers import AdamW
from torch.autograd import Variable
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
import socket
from os.path import basename
from functools import reduce
import re
import itertools
import json
from pathlib import Path
import glob
import sys
import subprocess
import argparse
import sys

torch.manual_seed(0)


  Referenced from: /Users/jakobtolstrup/opt/anaconda3/lib/python3.9/site-packages/torchvision/image.so
  Expected in: /Users/jakobtolstrup/opt/anaconda3/lib/python3.9/site-packages/torch/lib/libtorch_cpu.dylib
 in /Users/jakobtolstrup/opt/anaconda3/lib/python3.9/site-packages/torchvision/image.so
  warn(f"Failed to load image Python extension: {e}")


<torch._C.Generator at 0x7f7fa1366630>

In [10]:
# Report the library versions this notebook was executed with.
print(f"my version of transformers is {transformers.__version__}")
print(f"my version of pytorch is {torch.__version__}")
print(f"my version of pytorch_lightning is {pl.__version__}")

my version of transformers is 4.15.0
my version of pytorch is 1.10.0
my version of pytorch_lightning is 1.9.3


In [11]:
### States
# Global switches read by the cells below.
test_state = False  # when True, the dataset smoke-test cells build each dataset once
tensorflow_active = True  # NOTE(review): not read anywhere in the visible cells - confirm it is still needed
use_gpu = False  # when True, the training/generation cells use CUDA
train_state = False  # NOTE(review): not read anywhere in the visible cells - confirm it is still needed

In [12]:
class TextToGraphQLDataset(Dataset):
    """Tokenized (question + schema -> GraphQL query) pairs.

    Everything is read and tokenized eagerly in ``__init__``:

    * every ``<root_path>/Schemas/<name>/simpleSchema.json`` is flattened into a
      list of schema tokens stored in ``self.name_to_schema[<name>]``;
    * every element of ``<root_path>/dataset/<type_path>`` is turned into one
      tokenized source (prompt) and one tokenized target (query).

    Args:
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer in this notebook).
        type_path: file name of the split inside ``<root_path>/dataset/``.
        block_size: length every target is padded/truncated to.
        root_path: directory that contains the ``dataset/`` and ``Schemas/`` folders.
    """

    PROMPT_PREFIX = 'translate English to GraphQL: '

    def __init__(self, tokenizer, type_path='train.json', block_size=102,
                 root_path='./SPEGQL-dataset/'):
        # Zero-argument super(): the previous `super(TextToGraphQLDataset, )` built an
        # unbound super object and never reached the base-class initializer.
        super().__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []
        self.schema_ids = []
        self.max_len = 0  # kept for backward compatibility; never updated in this class
        self.name_to_schema = {}

        dataset_path = os.path.join(root_path, 'dataset', type_path)
        schema_pattern = os.path.join(root_path, 'Schemas', '**', 'simpleSchema.json')

        for schema_path in glob.glob(schema_pattern):
            with open(schema_path, 'r', encoding='utf-8') as s:
                data = json.load(s)

            # <t> TypeName { field field ... } </t> for every type, then <a> args </a>.
            # Built with a plain loop so a schema without types yields an empty
            # prefix instead of making reduce() raise on an empty sequence.
            schema_tokens = []
            for t in data['types']:
                schema_tokens += ['<t>', t['name'], '{']
                schema_tokens += [f['name'] for f in t['fields']]
                schema_tokens += ['}', '</t>']
            schema_tokens += ['<a>'] + [a['name'] for a in data['arguments']] + ['</a>']

            # The schema is keyed by the name of the folder that holds simpleSchema.json.
            schema_name = basename(str(Path(schema_path).parent))
            self.name_to_schema[schema_name] = schema_tokens

        with open(dataset_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for element in data:
            # Single source of truth for the prompt format (also used at inference time).
            question_with_schema = self.get_question_with_schema(element['question'], element['schemaId'])
            # NOTE(review): with a single sequence `padding=True` presumably adds no padding,
            # so sources keep their own length and are padded per batch later - confirm.
            tokenized_s = tokenizer.encode_plus(question_with_schema, max_length=1024, padding=True, truncation=True, return_tensors='pt')
            self.source.append(tokenized_s)

            tokenized_t = tokenizer.encode_plus(element['query'], max_length=block_size, padding='max_length', truncation=True, return_tensors='pt')
            self.target.append(tokenized_t)
            self.schema_ids.append(element['schemaId'])

    def get_question_with_schema(self, question, schemaId):
        """Return the model prompt for `question` against the schema named `schemaId`.

        Raises:
            KeyError: if `schemaId` is not a key of ``self.name_to_schema``.
        """
        return self.PROMPT_PREFIX + question + ' ' + ' '.join(self.name_to_schema[schemaId])

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.source)

    def __getitem__(self, index):
        """Return one sample as a dict of tensors with the leading batch axis removed."""
        # squeeze(0) only drops the batch axis; a bare squeeze() would also collapse a
        # length-1 sequence into a 0-dim tensor.
        source_ids = self.source[index]['input_ids'].squeeze(0)
        target_ids = self.target[index]['input_ids'].squeeze(0)
        src_mask = self.source[index]['attention_mask'].squeeze(0)

        return {
            'source_ids': source_ids,
            'source_mask': src_mask,
            'target_ids': target_ids,
            'target_ids_y': target_ids,
        }

# Expose the class as an attribute of the `__main__` module under its own name.
# NOTE(review): presumably needed so objects pickled with a `__main__.TextToGraphQLDataset`
# reference can be loaded from other entry points - confirm.
sys.modules["__main__"].TextToGraphQLDataset = TextToGraphQLDataset

In [13]:
if test_state:
    # Smoke test: build the GraphQL dataset once, then touch its size and first sample.
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    dataset = TextToGraphQLDataset(
        tokenizer=tokenizer,
        type_path='train.json',
        block_size=102,
    )

    length = len(dataset)
    item = dataset[0]
    print("TextToGraphQLDataset test done")

In [14]:
class MaskGraphQLDataset(Dataset):
    """Masked-token recovery samples built from the GraphQL queries of one split.

    Each query is tokenized to `block_size` ids. For every position from 1 up to
    (not including) the first EOS token, one sample is produced: the source is the
    full id sequence with that position replaced by the tokenizer's mask id, and
    the target is the re-encoded original token padded/truncated to 4 ids.

    NOTE(review): position 0 is never masked (the loop starts at 1) - confirm this
    is intended for a tokenizer that does not prepend a start token.

    Args:
        tokenizer: object exposing ``encode``, ``convert_ids_to_tokens``,
            ``eos_token_id`` and ``mask_token_id``.
        type_path: file name of the split inside `dataset_dir`.
        block_size: length every source sequence is padded/truncated to.
        dataset_dir: directory that contains the split files.

    Raises:
        ValueError: if the tokenizer has no mask token configured.
    """

    def __init__(self, tokenizer, type_path='train.json', block_size=64,
                 dataset_dir='./SPEGQL-dataset/dataset/'):
        # Zero-argument super(): `super(MaskGraphQLDataset, )` was an unbound super object.
        super().__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []

        if tokenizer.mask_token_id is None:
            # Fail with a clear message instead of a TypeError on tensor assignment below.
            raise ValueError("tokenizer has no mask token; add one before building MaskGraphQLDataset")

        path = os.path.join(dataset_dir, type_path)
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for example in data:
            encoded_query = tokenizer.encode(example['query'], max_length=block_size, padding='max_length', truncation=True, return_tensors='pt').squeeze(0)
            for pos in range(1, encoded_query.shape[0]):
                target_id = encoded_query[pos].item()
                if target_id == tokenizer.eos_token_id:
                    # Everything after EOS is padding; nothing left to mask.
                    break

                # Fresh copy per sample so only this one position is masked.
                masked_source = encoded_query.clone()
                masked_source[pos] = tokenizer.mask_token_id

                decoded_target = ''.join(tokenizer.convert_ids_to_tokens([target_id]))
                encoded_target = tokenizer.encode(decoded_target, return_tensors='pt', max_length=4, padding='max_length', truncation=True).squeeze(0)

                # Append exactly once: the previous version had two consecutive `if`
                # blocks with the same effect, so every sample was stored twice.
                if torch.numel(encoded_target) > 0:
                    self.source.append(masked_source)
                    self.target.append(encoded_target)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.source)

    def __getitem__(self, index):
        """Return the masked source ids and the ids of the token that was masked out."""
        return {'source_ids': self.source[index],
                'target_id': self.target[index]}

In [15]:
if test_state:
    # Smoke test: build the masked GraphQL dataset with a tokenizer that has the
    # same extra tokens registered as T5MultiSPModel.add_special_tokens adds.
    tokenizer = AutoTokenizer.from_pretrained("t5-base")

    special_tokens_dict = tokenizer.special_tokens_map # NOTE(review): original author suspected this dict may need to be copied before it is modified - confirm
    special_tokens_dict['mask_token'] = '<mask>'
    special_tokens_dict['additional_special_tokens'] = ['<t>', '</t>', '<a>', '</a>']
    tokenizer.add_tokens(['{', '}', '<c>', '</c>'])
    tokenizer.add_special_tokens(special_tokens_dict)
    # No model is created in this smoke test, so there are no embeddings to resize here.
    print(tokenizer.mask_token)

    dataset = MaskGraphQLDataset(tokenizer=tokenizer, type_path='train.json', block_size=64)
    print("MaskGraphQLDataset test done")

In [16]:
class SpiderDataset(Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, tokenizer, type_path='train_spider.json', block_size=102):
        'Initialization'
        super(SpiderDataset, ).__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []
        spider_path = './spider/'
        path = spider_path + type_path
        # TODO open up tables.json
        # its a list of tables
        # group by db_id 
        # grab column name from column_names_original ( each column name is a list of two. and the 2nd index {1} is the column name )
        # grab table names from table_names (^ same as above )
        # concat both with the english question (table names + <c> + column names + <q> english question)
        # tokenize

        # Maybe try making making more structure 
        # in the concat by using primary_keys and foreign_keys 

        tables_path = spider_path + 'tables.json'

        with open(path, 'r') as f, open(tables_path, 'r') as t:
          databases = json.load(t)
          data = json.load(f)

          #groupby db_id 
          grouped_dbs = {}
          for db in databases:
            grouped_dbs[db['db_id']] = db
          # print(grouped_dbs)
          # end grop tables

          for element in data:
            db = grouped_dbs[element['db_id']]

            # tables_names = " ".join(db['table_names_original'])
            db_tables = db['table_names_original']

            # columns_names = " ".join([column_name[1] for column_name in db['column_names_original'] ])
            tables_with_columns = ''
            for table_id, group in itertools.groupby(db['column_names_original'], lambda x: x[0]):
              if table_id == -1:
                continue

              columns_names = " ".join([column_name[1] for column_name in group ])
              tables_with_columns += '<t> ' + db_tables[table_id] + ' <c> ' + columns_names + ' </c> ' + '</t> '


            # group columns with tables. 

            db_with_question = 'translate English to SQL: ' + element['question'] + ' ' + tables_with_columns
            # question_with_schema = 'translate English to GraphQL: ' + element['question']  + ' ' + ' '.join(self.name_to_schema[element['schemaId']]) + ' </s>'

            tokenized_s = tokenizer.batch_encode_plus([db_with_question],max_length=1024, padding='max_length', truncation=True,return_tensors='pt')
            # what is the largest example size?
            # the alternative is to collate
            #might need to collate
            self.source.append(tokenized_s)

            tokenized_t = tokenizer.batch_encode_plus([element['query']],max_length=block_size, padding='max_length', truncation=True,return_tensors='pt')
            self.target.append(tokenized_t)


  def __len__(self):
        'Denotes the total number of samples'
        return len(self.source)

  def __getitem__(self, index):
        'Generates one sample of data'
        source_ids = self.source[index]['input_ids'].squeeze()
        target_ids = self.target[index]['input_ids'].squeeze()
        src_mask = self.source[index]['attention_mask'].squeeze()
        return { 'source_ids': source_ids,
                'source_mask': src_mask,
                'target_ids': target_ids,
                'target_ids_y': target_ids}


# # In[38]:



In [17]:
if test_state:
    # Smoke test: build the Spider dataset once, then touch its size and first sample.
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    dataset = SpiderDataset(
        tokenizer=tokenizer,
        type_path='train_spider.json',
        block_size=102,
    )

    length = len(dataset)
    item = dataset[0]
    print("SpiderDataset test done")

In [18]:
class CoSQLMaskDataset(Dataset):
    """Masked-token recovery samples built from the SQL queries of a CoSQL split.

    The split file is a list of elements, each with an ``'interaction'`` list whose
    items carry a ``'query'`` string. Each query is tokenized to `block_size` ids;
    for every position from 1 up to (not including) the first EOS token, one sample
    is produced: the source is the id sequence with that position replaced by the
    tokenizer's mask id, and the target is the re-encoded original token.

    Args:
        tokenizer: object exposing ``encode``, ``convert_ids_to_tokens``,
            ``eos_token_id`` and ``mask_token_id``.
        type_path: file name of the split inside `dataset_dir`.
        block_size: length every source sequence is padded/truncated to.
        dataset_dir: directory that contains the split files.

    Raises:
        ValueError: if the tokenizer has no mask token configured.
    """

    def __init__(self, tokenizer, type_path='cosql_train.json', block_size=64,
                 dataset_dir='./cosql_dataset/sql_state_tracking/'):
        # Zero-argument super(): `super(CoSQLMaskDataset, )` was an unbound super object.
        super().__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []

        if tokenizer.mask_token_id is None:
            # Fail with a clear message instead of a TypeError on tensor assignment below.
            raise ValueError("tokenizer has no mask token; add one before building CoSQLMaskDataset")

        path = os.path.join(dataset_dir, type_path)
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for element in data:
            for interaction in element['interaction']:
                encoded_query = tokenizer.encode(interaction['query'], max_length=block_size, padding='max_length', truncation=True, return_tensors='pt').squeeze(0)
                for pos in range(1, encoded_query.shape[0]):
                    target_id = encoded_query[pos].item()
                    if target_id == tokenizer.eos_token_id:
                        # Everything after EOS is padding; nothing left to mask.
                        break

                    # Fresh copy per sample so only this one position is masked.
                    masked_source = encoded_query.clone()
                    masked_source[pos] = tokenizer.mask_token_id

                    decoded_target = ''.join(tokenizer.convert_ids_to_tokens([target_id]))
                    # Padded/truncated to exactly 4 ids (max_length=4, padding='max_length').
                    encoded_target = tokenizer.encode(decoded_target, return_tensors='pt', max_length=4, padding='max_length', truncation=True).squeeze(0)
                    self.target.append(encoded_target)
                    self.source.append(masked_source)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.source)

    def __getitem__(self, index):
        """Return the masked source ids and the ids of the token that was masked out."""
        return {'source_ids': self.source[index],
                'target_id': self.target[index]}

In [19]:
if test_state:
    # Smoke test: build the masked CoSQL dataset with a tokenizer that has the
    # same extra tokens registered as T5MultiSPModel.add_special_tokens adds.
    tokenizer = AutoTokenizer.from_pretrained("t5-base")

    special_tokens_dict = tokenizer.special_tokens_map # NOTE(review): original author suspected this dict may need to be copied before it is modified - confirm
    special_tokens_dict['mask_token'] = '<mask>'
    special_tokens_dict['additional_special_tokens'] = ['<t>', '</t>', '<a>', '</a>']
    tokenizer.add_tokens(['{', '}', '<c>', '</c>'])
    tokenizer.add_special_tokens(special_tokens_dict)
    # No model is created in this smoke test, so there are no embeddings to resize here.
    print(tokenizer.mask_token)

    dataset = CoSQLMaskDataset(tokenizer=tokenizer , type_path='cosql_train.json', block_size=64)

    # Touch the size and the first sample to make sure indexing works.
    length = dataset.__len__()
    item = dataset.__getitem__(0)
    print("CoSQLMaskDataset test done")

In [20]:
class T5MultiSPModel(pl.LightningModule):
    """T5-base Lightning module with two training modes selected by ``task``.

    * ``task == 'finetune'``: supervised training on (question + schema -> query)
      pairs from TextToGraphQLDataset and SpiderDataset.
    * any other value (default ``'denoise'``): masked-token recovery on
      MaskGraphQLDataset and CoSQLMaskDataset.

    Args:
        hyperparams: object with an ``lr`` attribute (learning rate).
        task: training mode, see above.
        test_flag: ``'graphql'`` tests on the GraphQL split, anything else on Spider.
        train_sampler: stored on the instance; not used by the dataloaders below.
        batch_size: batch size of all three dataloaders.
        temperature, top_k, top_p, num_beams: forwarded to ``generate`` in `_generate_step`.
    """

    def __init__(self, hyperparams, task='denoise', test_flag='graphql', train_sampler=None, batch_size=2,temperature=1.0,top_k=50, top_p=1.0, num_beams=1 ):
        super().__init__()

        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.num_beams = num_beams

        self.hyperparams = hyperparams

        self.task = task
        self.test_flag = test_flag
        self.train_sampler = train_sampler
        self.batch_size = batch_size

        # Both tasks start from the same pretrained checkpoint (the former if/else
        # loaded the identical model in each branch).
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')

        self.criterion = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        self.add_special_tokens()

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Delegate to the wrapped T5 model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def add_special_tokens(self):
        """Register the schema/mask tokens on the tokenizer and resize the embeddings."""
        # Work on an explicit copy so the tokenizer's own mapping is never edited in place.
        special_tokens_dict = dict(self.tokenizer.special_tokens_map)
        special_tokens_dict['mask_token'] = '<mask>'
        special_tokens_dict['additional_special_tokens'] = ['<t>', '</t>', '<a>', '</a>']
        # Order matters: plain tokens first, then special tokens, so ids stay stable.
        self.tokenizer.add_tokens(['{', '}', '<c>', '</c>'])
        self.tokenizer.add_special_tokens(special_tokens_dict)
        self.model.resize_token_embeddings(len(self.tokenizer))

    def _step(self, batch):
        """Compute the loss for one batch in the current task mode."""
        pad_token_id = self.tokenizer.pad_token_id
        source_ids = batch["source_ids"]

        if self.task == 'finetune':
            target_ids = batch["target_ids"]
            attention_mask = batch["source_mask"]
        else:
            target_ids = batch['target_id']
            # The mask datasets pad every source to a fixed length but carry no mask,
            # so derive one here; otherwise the encoder attends to padding positions.
            attention_mask = (source_ids != pad_token_id).long()

        # Padding positions in the labels are set to -100 so the loss ignores them.
        labels = target_ids.clone()
        labels[target_ids == pad_token_id] = -100

        outputs = self(source_ids, attention_mask=attention_mask, labels=labels)
        # With labels supplied, the first element of the model output is the loss.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Return the training loss and log it as ``train_loss``."""
        loss = self._step(batch)

        # Log through self.log so the value reaches the logger; the "log" key in the
        # returned dict is kept only for backward compatibility with older callers.
        self.log("train_loss", loss)
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Return the validation loss and log it as ``val_loss``."""
        loss = self._step(batch)

        # Logged here because the epoch-end hook below receives no outputs when the
        # trainer invokes it, so it cannot be relied on to report the metric.
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def on_validation_epoch_end(self, outputs=None):
        """Average ``val_loss`` over explicitly passed step outputs.

        When called without outputs this is a no-op; the metric is already logged
        from `validation_step`. Passing a list of step outputs keeps the previous
        return value for code that calls this method directly.
        """
        if not outputs:
            return None
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {'progress_bar': tensorboard_logs, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Create the AdamW optimizer (and a linear schedule that is stored but not returned)."""
        batches_per_epoch = len(self.train_dataloader())
        limit = self.trainer.limit_train_batches
        if isinstance(limit, float):
            # A float limit is a fraction of the epoch ...
            batches_per_epoch = int(batches_per_epoch * limit)
        else:
            # ... an int limit is an absolute number of batches.
            batches_per_epoch = min(batches_per_epoch, int(limit))
        t_total = batches_per_epoch * self.trainer.max_epochs

        no_decay = ["bias", "LayerNorm.weight"]
        # NOTE(review): both groups use weight_decay=0.0, so the split currently has
        # no effect - confirm whether a non-zero decay was intended for the first group.
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {"params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hyperparams.lr, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=t_total
        )
        # The scheduler is only stored; it is not returned, so nothing in this class steps it.
        self.lr_scheduler = scheduler
        return [optimizer]

    def _generate_step(self, batch):
        """Generate predictions for a batch and decode predictions and targets to text."""
        # NOTE(review): temperature/top_k/top_p are passed without do_sample; presumably
        # they only take effect when sampling is enabled - confirm.
        generated_ids = self.model.generate(
            batch["source_ids"],
            attention_mask=batch["source_mask"],
            num_beams=self.num_beams,
            max_length=1000,
            temperature=self.temperature,
            top_k=self.top_k,
            top_p=self.top_p,
            length_penalty=1.0,
            early_stopping=True,
        )

        preds = [
            self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for g in generated_ids
        ]
        target = [
            self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for t in batch["target_ids"]
        ]
        return (preds, target)

    def test_step(self, batch, batch_idx):
        """Return the test loss plus either exact-match accuracy or the raw predictions."""
        preds, target = self._generate_step(batch)
        loss = self._step(batch)
        if self.test_flag == 'graphql':
            # NOTE(review): `exact_match` is not imported in the visible part of this
            # notebook - confirm it is available before running the GraphQL test.
            accuracy = exact_match.exact_match_accuracy(preds,target)
            return {"test_loss": loss, "test_accuracy": torch.tensor(accuracy)}
        else:
            return {"test_loss": loss, "preds": preds, "target": target }

    def test_epoch_end(self, outputs):
        """Aggregate test outputs; for non-GraphQL runs also write the predictions to disk."""
        avg_loss = torch.stack([x["test_loss"] for x in outputs]).mean()
        self.log("test_loss", avg_loss)

        if self.test_flag == 'graphql':
            avg_acc = torch.stack([x["test_accuracy"] for x in outputs]).mean()
            self.log("test_acc", avg_acc)
            tensorboard_logs = {"test_loss": avg_loss, "test_acc": avg_acc}
            return {"progress_bar": tensorboard_logs, "log": tensorboard_logs}

        else:
            output_test_predictions_file = os.path.join(os.getcwd(), "test_predictions.txt")
            # The context manager closes the file; no explicit close() is needed.
            with open(output_test_predictions_file, "w+", encoding="utf-8") as p_writer:
                for output_batch in outputs:
                    p_writer.writelines(s + "\n" for s in output_batch["preds"])
            tensorboard_logs = {"test_loss": avg_loss}
            return {"progress_bar": tensorboard_logs, "log": tensorboard_logs}

    def prepare_data(self):
        """Build the datasets for the current task and store them on the instance."""
        if self.task == 'finetune':
            self.train_dataset_g = TextToGraphQLDataset(self.tokenizer)
            self.val_dataset_g = TextToGraphQLDataset(self.tokenizer, type_path='dev.json')
            # The test split reuses dev.json, i.e. the same file as validation.
            self.test_dataset_g = TextToGraphQLDataset(self.tokenizer, type_path='dev.json')

            self.train_dataset_s = SpiderDataset(self.tokenizer)
            self.val_dataset_s = SpiderDataset(self.tokenizer, type_path='dev.json')
            self.test_dataset_s = SpiderDataset(self.tokenizer, type_path='dev.json')

            self.train_dataset = ConcatDataset([self.train_dataset_g,self.train_dataset_s])
            self.val_dataset = ConcatDataset([self.val_dataset_g, self.val_dataset_s])
            if self.test_flag == 'graphql':
                self.test_dataset = self.test_dataset_g
            else:
                self.test_dataset = self.test_dataset_s

        else:
            # No test dataset is created in this mode, so test_dataloader() is only
            # usable after prepare_data() has run with task == 'finetune'.
            train_dataset_g = MaskGraphQLDataset(self.tokenizer)
            val_dataset_g = MaskGraphQLDataset(self.tokenizer, type_path='dev.json')

            train_dataset_s = CoSQLMaskDataset(self.tokenizer)
            val_dataset_s = CoSQLMaskDataset(self.tokenizer, type_path='cosql_dev.json')

            self.train_dataset = ConcatDataset([train_dataset_g, train_dataset_s])
            self.val_dataset = ConcatDataset([val_dataset_g,val_dataset_s])

    @staticmethod
    def custom_collate_fn(batch):
        """Right-pad every field of the samples with zeros to the batch maximum and stack them.

        NOTE(review): zero is used as the fill value for every key, which assumes the
        tokenizer's pad id is 0 - confirm for the tokenizer in use.
        """
        collated_batch = {}
        # Every key gets the same treatment (the former if/else branches were identical).
        for key in batch[0].keys():
            max_length = max(len(sample[key]) for sample in batch)
            padded_tensors = [
                torch.cat([sample[key], torch.zeros(max_length - len(sample[key]), dtype=torch.long)], dim=0)
                for sample in batch
            ]
            collated_batch[key] = torch.stack(padded_tensors, dim=0)

        return collated_batch

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.custom_collate_fn, num_workers=0)

    def val_dataloader(self):
        """Sequential loader over the validation dataset."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self.custom_collate_fn, num_workers=0)

    def test_dataloader(self):
        """Sequential loader over the test dataset."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=self.custom_collate_fn, num_workers=0)

In [21]:

# Reload the TensorBoard notebook extension (use `%load_ext tensorboard` on first load).
%reload_ext tensorboard

# Show TensorBoard inline, reading runs from the lightning_logs/ directory.
%tensorboard --logdir lightning_logs/



In [22]:
# Learning rate handed to the model (original note: "for 3 epochs").
hyperparams = argparse.Namespace(lr=0.0004365158322401656)

# Build the Lightning module with its default task and a batch size of 32.
system = T5MultiSPModel(hyperparams, batch_size=32)
print("We initialize the T5MultiSPModel(hyperparams,batch_size=32)")

We initialize the T5MultiSPModel(hyperparams,batch_size=32)


In [23]:
# Initialize the logger (TensorBoard logger rooted at lightning_logs/)
logger = TensorBoardLogger("lightning_logs/")
# Pass the logger to the Trainer
# NOTE(review): the training cells further down assign new Trainer objects to
# `trainer` without passing `logger`, so this instance is not the one that calls fit().
trainer = pl.Trainer(logger=logger)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [24]:
# Reuse cached pre-training weights when they exist; otherwise train and cache them.
if os.path.exists('model_weights.pth'):
    # map_location='cpu' lets weights saved on a GPU machine load on a CPU-only one;
    # load_state_dict then copies them into the model's existing parameters.
    system.load_state_dict(torch.load('model_weights.pth', map_location='cpu'))
    print("model weights loaded from model_weights.pth")

else:
    # If the weights file doesn't exist, train the model and save the weights after training
    print("lets train this model!")
    if use_gpu:
        # `devices=1` replaces the deprecated `gpus=1` argument.
        trainer = Trainer(accelerator='gpu', devices=1, max_epochs=1, log_every_n_steps=1, limit_train_batches=0.2)
    else:
        trainer = Trainer(max_epochs=1, log_every_n_steps=1, limit_train_batches=0.2)
    trainer.fit(system)
    torch.save(system.state_dict(), 'model_weights.pth')

model weights loaded from model_weights.pth


In [25]:
# Build the datasets for the model's current task so they can be indexed directly below.
system.prepare_data()

In [26]:
# Sanity check: generate from the first validation sample.
inputs = system.val_dataset[0]
# Decoded source text, kept in a variable for manual inspection (it was previously discarded).
source_text = system.tokenizer.decode(inputs['source_ids'])

if use_gpu:
    system.model = system.model.cuda()
else:
    system.model = system.model.cpu()

# Put the input on the same device as the model; without this, generation fails
# with a device mismatch as soon as use_gpu is enabled.
source_ids = inputs['source_ids'].unsqueeze(0).to(system.model.device)
generated_ids = system.model.generate(source_ids, num_beams=5, repetition_penalty=1.0, max_length=56, early_stopping=True)

hyps = [system.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]

print("hyps")
print(hyps)

hyps
['{']


In [27]:
# Reuse cached fine-tuned weights when they exist; otherwise fine-tune and cache them.
if os.path.exists('fine_tuned_model_weights.pth'):
    # Load the model weights if the file exists
    print("Model is allready fine-tuned, loading weights...")
    # map_location='cpu' lets weights saved on a GPU machine load on a CPU-only one.
    system.load_state_dict(torch.load('fine_tuned_model_weights.pth', map_location='cpu'))
    print("fine_tuned_model_weights.pth loaded")

else:
    print("Let's fine-tune this model!")
    # The model was created with the default task; switch to the supervised objective
    # so that fit() trains on the text-to-query datasets instead of the mask datasets.
    system.task = 'finetune'
    # `progress_bar_refresh_rate` is no longer accepted by Trainer in the installed
    # pytorch_lightning version, and `gpus=1` is deprecated in favour of
    # accelerator='gpu', devices=1.
    if use_gpu:
        trainer = Trainer(accelerator='gpu', devices=1, max_epochs=5, val_check_interval=0.5)
    else:
        trainer = Trainer(max_epochs=5, val_check_interval=0.5)
    trainer.fit(system)
    torch.save(system.state_dict(), 'fine_tuned_model_weights.pth')
  
from pytorch_lightning.callbacks import ModelCheckpoint

Model is allready fine-tuned, loading weights...
fine_tuned_model_weights.pth loaded


# Evaluation

In [28]:
def predict(prompt, schemaId):
    """Generate a query for `prompt` against the schema named `schemaId`.

    The schema is looked up first in ``system.train_dataset_g`` and then in
    ``system.val_dataset_g`` (both are created by ``system.prepare_data()`` when
    ``system.task == 'finetune'``).

    Args:
        prompt: natural-language question.
        schemaId: key of the schema in a dataset's ``name_to_schema`` mapping.

    Returns:
        dict with a single key ``"prediction"`` holding the decoded top hypothesis.

    Raises:
        KeyError: if neither dataset knows `schemaId`.
    """
    # Membership test instead of indexing: indexing raised KeyError on the first
    # dataset before the fallback could be tried, and the fallback itself referenced
    # a `dev_dataset` attribute that the model never defines.
    for dataset_name in ("train_dataset_g", "val_dataset_g"):
        dataset = getattr(system, dataset_name)
        if schemaId in dataset.name_to_schema:
            input_string = dataset.get_question_with_schema(prompt, schemaId)
            break
    else:
        raise KeyError(f"Unknown schemaId: {schemaId!r}")

    # truncation=True makes the max_length cut explicit and silences the tokenizer warning.
    inputs = system.tokenizer.batch_encode_plus([input_string], max_length=1024, truncation=True, return_tensors='pt')['input_ids']

    if use_gpu:
        inputs = inputs.cuda()
    generated_ids = system.model.generate(inputs, num_beams=3, repetition_penalty=1.0, max_length=1000, early_stopping=True)

    hyps = [system.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
    dict_res = {"prediction": hyps[0]}
    return dict_res

In [31]:
# Widen pandas' display so the full query strings are visible in the output below.
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping to next line
pd.set_option('display.max_colwidth', None)  # Show the full content of each cell

import pandas as pd

# Create a dataframe called test_df from semantic_match_test_queries.csv

test_df = pd.read_csv('semantic_match_test_queries.csv')

# Not the last expression of the cell, so this preview is not displayed.
test_df.head()

# For every row, call are_queries_semantically_same with `queries` and
# `queries_shifted_order`, and store the outcome in a new column called `result`.
# NOTE(review): are_queries_semantically_same is not defined in the visible cells -
# confirm the cell that defines it runs before this one.

test_df['result'] = test_df.apply(lambda row: are_queries_semantically_same(row['queries'], row['queries_shifted_order']), axis=1)

test_df

Unnamed: 0,queries,queries_shifted_order,result
0,query { hiring { employee_id is_full_time shop_id start_from } },query { hiring { employee_id shop_id is_full_time start_from } },True
1,query { teacher ( order_by : { age : asc } ) { name } },query { teacher ( order_by : { age : asc } ) { name } },True
2,query { pets ( where : { pet_age : { _gt : 1 } } ) { petid weight } },query { pets ( where : { pet_age : { _gt : 1 } } ) { weight petid } },True
3,query { singer_aggregate { aggregate { count } } },query { singer_aggregate { aggregate { count } } },True
4,query { singer { birth_year citizenship } },query { singer { citizenship birth_year } },True
5,query { ref_template_types { template_type_code template_type_description } },query { ref_template_types { template_type_description template_type_code } },True
6,"query { airports ( where : { city : { _eq : ""Anthony"" } } ) { airportcode airportname }}","query { airports ( where : { city : { _eq : ""Anthony"" } } ) { airportname airportcode }}",True
7,"query { singer ( where : { song_name : { _like : ""%Hey%"" } } ) { name country } }","query { singer ( where : { song_name : { _like : ""%Hey%"" } } ) { country name } }",True
8,query { players ( order_by : { birth_date : asc } ) { first_name last_name } },query { players ( order_by : { birth_date : asc } ) { last_name first_name } },True
9,"query { battle ( where : { bulgarian_commander : { _neq : ""Boril"" } } ) { name result } }","query { battle ( where : { bulgarian_commander : { _neq : ""Boril"" } } ) { result name } }",True


In [32]:
import json
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

def custom_test(dataset_path):
    """Evaluate `predict` on a JSON dataset and print exact- and semantic-match accuracy.

    Args:
        dataset_path: path to a JSON file holding a list of objects with the keys
            ``"question"``, ``"schemaId"`` and ``"query"``.

    Prints the two accuracies with four decimals; returns nothing. An empty dataset
    is reported as accuracy 0.0 for both metrics.
    """
    # Load the dev dataset
    with open(dataset_path, "r", encoding="utf-8") as f:
        dev_data = json.load(f)

    exact_match = []
    semantic_match = []

    for example in dev_data:
        query = example["query"]
        prediction = predict(example["question"], example["schemaId"])["prediction"]

        exact_match.append(1 if prediction == query else 0)
        semantic_match.append(1 if are_queries_semantically_same(prediction, query) else 0)

    def calculate_accuracy(match_results):
        # Accuracy against an all-ones reference is simply the share of matches.
        # F1/precision/recall against that reference were computed before but never
        # reported, and they emitted undefined-metric warnings when nothing matched.
        if not match_results:
            return 0.0
        return float(np.mean(match_results))

    exact_accuracy = calculate_accuracy(exact_match)
    semantic_accuracy = calculate_accuracy(semantic_match)

    print("Exact match:")
    print(f"Accuracy: {exact_accuracy:.4f}")

    print("\nSemantic match:")
    print(f"Accuracy: {semantic_accuracy:.4f}")



In [33]:
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np

def custom_test_df(df):
    """Evaluate `predict` on every row of `df` and record the outcome in place.

    Args:
        df: DataFrame with the columns "question", "schemaId" and "query".
            It is modified in place: the columns "exact_match",
            "semantic_match" (0/1 flags) and "predicted_query" are created
            or overwritten once all rows have been processed.

    Returns:
        A tuple ``(exact_accuracy, semantic_accuracy)`` holding the fraction
        of rows whose prediction equals the reference query verbatim and the
        fraction accepted by `are_queries_semantically_same`. Both values are
        nan when `df` has no rows.
    """
    predictions = []
    exact_match = []
    semantic_match = []

    for _, row in df.iterrows():
        prompt = row["question"]
        schemaId = row["schemaId"]
        query = row["query"]

        prediction = predict(prompt, schemaId)["prediction"]

        # Each comparison is evaluated once and reused for both the running
        # tally and the per-row column.
        predictions.append(prediction)
        exact_match.append(1 if prediction == query else 0)
        semantic_match.append(1 if are_queries_semantically_same(prediction, query) else 0)

    # Whole-column assignment is positional, so it stays correct even when the
    # index contains duplicate labels (label-based per-row writes would not).
    df["exact_match"] = exact_match
    df["semantic_match"] = semantic_match
    df["predicted_query"] = predictions

    def calculate_accuracy(match_results):
        # Entries are 0/1 flags, so their mean is the fraction of matches.
        if not match_results:
            return float("nan")
        return sum(match_results) / len(match_results)

    return calculate_accuracy(exact_match), calculate_accuracy(semantic_match)


In [34]:
# Select the fine-tuning task on the module, then rebuild its data.
# NOTE(review): presumably prepare_data() reads system.task, which is why the
# attribute is set first — confirm against the module definition.
system.task = 'finetune'
system.prepare_data()

In [35]:
# Smoke test: run one hand-picked question against one schema and print the
# raw return value of predict() (the recorded output shows a dict with a
# single 'prediction' key).
hardcoded_schemaId = "battle_death"
hardcoded_prompt = "How many ships ended up being not 'Captured'?"

result = predict(hardcoded_prompt, hardcoded_schemaId)
print("this is the result")
print(result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


this is the result
{'prediction': 'query { ship_aggregate ( where : { tonnage : { _lt : "Captured" } } ) { aggregate { count } } }'}


In [36]:
# Location of the dev split in JSON form, the input expected by custom_test().
dev_dataset_path = 'SPEGQL-dataset/dataset/dev.json'

# Full JSON-based evaluation, left disabled here.
#custom_test(dev_dataset_path)


In [38]:
# Evaluate the model on the dev split stored in dev_df.csv.

import pandas as pd

# Load the CSV and add the two result columns as empty-string placeholders;
# custom_test_df() fills them in (and adds 'predicted_query') in place.
dev_df = pd.read_csv('dev_df.csv').assign(exact_match="", semantic_match="")

# Optional quick run on a random 2% sample instead of the full frame:
# dev_df_20 = dev_df.sample(frac=0.02)

# The results can be persisted afterwards with:
# dev_df.to_csv('dev_df_with_results.csv')

# Last expression of the cell, so the (exact, semantic) accuracy pair is shown.
custom_test_df(dev_df)

(0.3819444444444444, 0.39814814814814814)

In [39]:
# Widen pandas' display settings so long query strings are shown in full.
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping to next line
pd.set_option('display.max_colwidth', None)  # Show the full content of each cell

# Disabled checks for the sampled frame (dev_df_20 is only created when the
# sampling line in the evaluation cell is enabled):

# Number of rows in the sample:
# len(dev_df_20.index)

# Number of exact matches:
# dev_df_20['exact_match'].sum()

# Number of semantic matches:
# dev_df_20['semantic_match'].sum()

# Persist the evaluated frame, including the columns added by custom_test_df().
# NOTE: to_csv() is called with its defaults, so the index is written as an
# extra, unnamed first column.
dev_df.to_csv('dev_df_with_results.csv')


In [None]:
# I have this function that predicts the corresponding query to a question.
# I pass this data frame to the function. The data frame consists of the following columns:

# I wish to know how well it performs on the different properties: question_length_bucket, nesting_level, num_args, schema_length, schema_complexity
# Additionally, I wish to know how well it performs on the interactions of these properties. 


In [None]:
# Full test run through the Lightning trainer, left disabled.
# NOTE(review): the traceback recorded under this cell ends in
# "Can't get attribute 'TextToGraphQLDataset' on <module '__main__'>", raised
# from multiprocessing/spawn.py — presumably spawned DataLoader worker
# processes could not unpickle the notebook-defined dataset class; confirm
# before re-enabling.
# system.num_beams = 3
# system.test_flag = 'graphql'
# system.prepare_data()
# trainer.test(system)

Testing: 0it [00:00, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/jakobtolstrup/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/jakobtolstrup/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TextToGraphQLDataset' on <module '__main__' (built-in)>


In [56]:
from graphql import parse
from graphql.language.ast import ObjectValueNode, StringValueNode

def are_queries_semantically_equivalent(query1, query2):
    """Decide whether two GraphQL query strings request the same thing.

    Two queries are considered equivalent when both of the following hold:

    * The text between the first "(" and the last ")" is identical after
      whitespace outside string literals has been removed. Whitespace inside
      string literals is significant, so "Robbin CV" and "RobbinCV" differ.
      Argument order is NOT normalised; reordered arguments compare unequal.
    * The first definition of each parsed document selects the same nested
      set of field names, irrespective of the order the fields are listed in.

    Args:
        query1: First GraphQL query string.
        query2: Second GraphQL query string.

    Returns:
        True when the queries are equivalent under the rules above, else False.

    Raises:
        Whatever `parse` raises for a string that is not valid GraphQL. The
        documents are only parsed when the argument text compares equal.
    """
    # One alternative per token kind we must tell apart: a block string, a
    # quoted string (with backslash escapes), or a run of whitespace.
    string_or_space = re.compile(r'"""[\s\S]*?"""|"(?:\\.|[^"\\\n])*"|\s+')

    def extract_fields(selection_set):
        # Build {field name: nested dict or None} for a selection set.
        fields = {}
        for field in selection_set.selections:
            if hasattr(field, "selection_set") and field.selection_set:
                fields[field.name.value] = extract_fields(field.selection_set)
            else:
                fields[field.name.value] = None
        return fields

    def normalize_query(query):
        # Drop whitespace between tokens but keep string literals verbatim,
        # so values that differ only in inner spacing are not conflated.
        return string_or_space.sub(
            lambda match: match.group(0) if match.group(0).startswith('"') else "",
            query,
        )

    def extract_arguments(normalized_query):
        args_start = normalized_query.find("(")
        args_end = normalized_query.rfind(")")
        if args_start == -1 or args_end == -1:
            return None
        return normalized_query[args_start+1:args_end]

    # Check if the arguments are identical
    if extract_arguments(normalize_query(query1)) != extract_arguments(normalize_query(query2)):
        return False

    ast1 = parse(query1)
    ast2 = parse(query2)

    query1_dict = extract_fields(ast1.definitions[0].selection_set)
    query2_dict = extract_fields(ast2.definitions[0].selection_set)

    # dict equality already ignores key order at every nesting level, so no
    # explicit sorting of the field dictionaries is required.
    return query1_dict == query2_dict


In [57]:
# Reference queries. Entry i is compared against entry i of
# semantically_equivalent_original_queries by test_queries().
original_queries = [
    'query { matches_aggregate { aggregate { count } } }',
    'query { cartoon ( order_by : { title : asc } ) { title } }', 
    'query { highschooler ( where : { name : { _eq : \"Kyle\" } } ) { id } }',
    'query { stadium_aggregate { aggregate { avg { capacity } max { capacity } } } }',
    # NOTE(review): "flight no" selects two fields (flight, no); another entry
    # in this list uses "flightno" — confirm which spelling is intended.
    'query { flights ( where : { destairport : { _eq : "APG" } } ) { flight no } }',
    'query { singer ( where : { citizenship : { _neq : \"France\" } } ) { name } }',
    'query { paragraphs ( where : { paragraph_text : { _eq : \"Korea\" } } ) { other_details } }',
    'query { people ( where : { nationality : { _eq : \"Russia\" } } ) { name } }',
    'query { documents ( where : { template : { template_type_code : { _eq : \"BK\" } } } ) { document_name } }',
    'query { ship_aggregate ( where : { disposition_of_ship : { _eq : \"Captured\" } } ) { aggregate { count } } }',
    'query { flights ( where : { airlineByAirline : { airline : { _eq : \"United Airlines\" } } } ) { flightno } }',
    'query { votes_aggregate ( where : { state : { _eq : \"CA\" } } ) { aggregate { max { created } } } }',
    'query { visitor ( limit : 1 , order_by : { visits_aggregate : { max : { num_of_ticket : desc_nulls_last } } } ) { name age } }',
    'query { templates ( where : { ref_template_type : { template_type_description : { _eq : \"Presentation\" } } } ) { template_id } }',
    'query { templates ( where : { _or : [ { template_type_code : { _eq : "PP" } } , { template_type_code : { _eq : "PPT" } }] } ) { template_id } }',
    'query { rankings ( limit : 1 , order_by : { tours : desc_nulls_last } ) { player { country_code first_name } } }',
    'query { cars_data ( limit : 1 , order_by : { accelerate : asc } , where : { car_name : { model : { _eq : \"volvo\" } } } ) { cylinders } }',
    'query { singer_aggregate ( where : { country : { _eq : \"France\" } } ) { aggregate { avg { age } min { age } max { age } } } }',
    'query { singer ( where : { song_name : { _like : \"%Hey%\" } } ) { name country } }',
    'query { teacher ( where : { _or : [ { age : { _eq : \"32\" } } , { age : { _eq : \"33\" } } ] } ) { name } }',
    'query { teacher ( where : { course_arranges : { course : { course : { _eq : \"Math\" } } } } ) { name } }',
    'query { documents ( where : { document_name : { _eq : \"Robbin CV\" } } ) { document_id template_id document_description } }',
    'query { airlines_aggregate ( where : { _and : { airline : { _eq : \"United Airlines\" } , flights : { destairport : { _eq : \"ASY\" } } } } ) { aggregate { count } } }',
    'query { museum_aggregate ( where : { _or : [ { open_year : { _gt : \"2013\" } } , { open_year : { _lt : \"2008\" } } ] } ) { aggregate { count } } }'
]

# Rewrites of original_queries, index for index. The variations visible here
# are changed whitespace and, for some entries, a different order of the
# selected fields.
semantically_equivalent_original_queries = [
    'query { matches_aggregate { aggregate { count } } }',
    'query{cartoon(order_by:{title:asc}){title}}',
    'query { highschooler ( where : { name : { _eq : \"Kyle\" } } ){ id } }',
    'query{stadium_aggregate{aggregate{avg{capacity} max{capacity}}}}',
    'query { flights ( where : { destairport : { _eq : \"APG\" } } ) { flight no } }',
    'query{singer(where:{citizenship:{_neq:\"France\"}}){name}}',
    'query { paragraphs ( where : { paragraph_text : { _eq : \"Korea\" } } ) { other_details } }',
    'query{people(where:{nationality:{_eq:\"Russia\"}}){name}}',
    'query { documents ( where : { template : { template_type_code : { _eq : \"BK\" } } } ) { document_name } }',
    'query{ship_aggregate(where:{disposition_of_ship:{_eq:\"Captured\"}}){aggregate{count}}}',
    'query { flights ( where : { airlineByAirline : { airline : { _eq : \"United Airlines\" } } } ) { flightno } }',
    'query{votes_aggregate(where:{state:{_eq:\"CA\"}}){aggregate{max{created}}}}',
    'query { visitor ( limit : 1 , order_by : { visits_aggregate : { max : { num_of_ticket : desc_nulls_last } } } ) { age name } }',
    'query{templates(where:{ref_template_type:{template_type_description:{_eq:\"Presentation\"}}}){template_id}}',
    'query { templates ( where : { _or : [ { template_type_code : { _eq : \"PP\" } } , { template_type_code : { _eq : \"PPT\" } }] } ) { template_id } }',
    'query{rankings(limit:1,order_by:{tours:desc_nulls_last}){player{first_name country_code}}}',
    'query { cars_data ( limit : 1 , order_by : { accelerate : asc } , where : { car_name : { model : { _eq : \"volvo\" } } } ) { cylinders } }',
    'query{singer_aggregate(where:{country:{_eq:\"France\"}}){aggregate{avg{age}min{age}max{age}}}}',
    'query { singer ( where : { song_name : { _like : \"%Hey%\" } } ) { name country } }',
    'query{teacher(where:{_or:[{age:{_eq:\"32\"}},{age:{_eq:\"33\"}}]}){name}}',
    'query { teacher ( where : { course_arranges : { course : { course : { _eq : \"Math\" } } } } ) { name } }',
    'query{documents(where:{document_name:{_eq:\"Robbin CV\"}}){ document_description document_id template_id }}',
    'query { airlines_aggregate ( where : { _and : { airline : { _eq : \"United Airlines\" } , flights : { destairport : { _eq : \"ASY\" } } } } ) { aggregate { count } } }',
	'query { museum_aggregate ( where : { _or : [ { open_year : { _gt : \"2013\" } } , { open_year : { _lt : \"2008\" } } ] } ) { aggregate { count } } }'
]

# Reference side of the negative fixtures: entry i pairs with entry i of
# revised_queries_expect_failure. Some queries appear twice because they are
# paired with two different revisions.
original_queries_expect_failure = [
    'query { students ( limit : 1 , order_by : { date_left : asc } ) { first_name middle_name last_name } }',
    'query { students ( limit : 1 , order_by : { date_left : asc } ) { first_name middle_name last_name } }',
    'query { cartoon ( order_by : { title : asc } ) { title } }',
    'query { cartoon ( order_by : { title : asc } ) { title } }',
    'query { countrylanguage_aggregate ( where : { _and : { country : { indepyear : { _lt : \"1930\" } } , isofficial : { _eq : \"T\" } } } ) { aggregate { count } } }',
    'query { countrylanguage ( where : { language : { _neq : \"English\" } } , distinct_on : countrycode ) { countrycode } }',
    'query { matches_aggregate ( where : { _and : { winner_hand : { _eq : \"L\" } , tourney_name : { _eq : \"WTA Championships\" } } } ) { aggregate { count } } }',
    'query { matches_aggregate ( where : { _and : { winner_hand : { _eq : \"L\" } , tourney_name : { _eq : \"WTA Championships\" } } } ) { aggregate { count } } }',
    'query { documents_aggregate { aggregate { count } } }',
    'query { documents { document_id document_name document_description } }',
    'query { people ( order_by : { name : asc } ) { name birth_date } }',
    'query { ref_template_types ( where : { template_type_code : { _eq : \"AD\" } } ) { template_type_description } }',
    'query { countrylanguage ( where : { _and : { country : { headofstate : { _eq : \"Beatrix\" } } , isofficial : { _eq : \"T\" } } } ) { language } }',
    'query { templates ( where : { _or : [ { template_type_code : { _eq : \"PP\" } } , { template_type_code : { _eq : \"PPT\" } } ] } ) { template_id } }',
]

# Revised side of the negative fixtures: each entry changes an argument value,
# an operator, or the selected fields relative to the entry at the same index
# in original_queries_expect_failure, so the pair should NOT be judged
# equivalent.
revised_queries_expect_failure = [
    'query { students ( limit : 1 , order_by : { date_left : desc } ) { first_name middle_name last_name } }',
    'query { students ( limit : 2 , order_by : { date_left : asc } ) { first_name middle_name last_name } }',
    'query { cartoon ( order_by : { title : desc } ) { title } }',
    'query { cartoon { title } }',
    'query { countrylanguage_aggregate ( where : { _and : { country : { indepyear : { _gt : \"1930\" } } , isofficial : { _eq : \"T\" } } } ) { aggregate { count } } }',
    'query { countrylanguage ( where : { language : { _neq : \"French\" } } , distinct_on : countrycode ) { countrycode } }',
    'query { matches_aggregate ( where : { _and : { winner_hand : { _eq : \"R\" } , tourney_name : { _eq : \"WTA Championships\" } } } ) { aggregate { count } } }',
    'query { matches_aggregate ( where : { _and : { winner_hand : { _eq : \"L\" } , tourney_name : { _eq : \"Australian Open\" } } } ) { aggregate { count } } }',
    'query { documents_aggregate { aggregate { sum } } }',
    'query { documents { document_id document_name } }',
    'query { people ( order_by : { birth_date : desc } ) { name email } }',
    'query { ref_template_types ( where : { template_type_code : { _neq : "AD" } } ) { template_type_code } }',
    # NOTE(review): this entry uses "eq" without the leading underscore that
    # the other operators carry — confirm whether that is intentional.
    'query { countrylanguage ( where : { _or : [ { country : { headofstate : { eq : \"Beatrix\" } } }, { isofficial : { _neq : \"T\" } } ] } ) { language } }',
    'query { templates ( where : { _and : [ { template_type_code : { _neq : "PP" } } , { template_type_code : { _neq : "PPT" } } ] } ) { template_id template_type_code } }'
]

def test_queries(original_queries, revised_queries):
    total_tests = len(original_queries)
    passed_tests = 0

    for i in range(total_tests):
        original_query = original_queries[i]
        revised_query = revised_queries[i]

        if are_queries_semantically_equivalent(original_query, revised_query):
            print(f"Test {i + 1}: PASSED")
            passed_tests += 1
        else:
            print(f"Test {i + 1}: FAILED")

    print(f"\nTotal tests: {total_tests}")
    print(f"Passed tests: {passed_tests}")
    print(f"Failed tests: {total_tests - passed_tests}")

# Positive fixtures: every pair is expected to be reported as PASSED.
test_queries(original_queries, semantically_equivalent_original_queries)
# Negative fixtures (disabled): test_queries() prints PASSED when a pair is
# judged equivalent, so a correct comparison would presumably show FAILED for
# every pair here.
#test_queries(original_queries_expect_failure, revised_queries_expect_failure)


Test 1: PASSED
Test 2: PASSED
Test 3: PASSED
Test 4: PASSED
Test 5: PASSED
Test 6: PASSED
Test 7: PASSED
Test 8: PASSED
Test 9: PASSED
Test 10: PASSED
Test 11: PASSED
Test 12: PASSED
Test 13: PASSED
Test 14: PASSED
Test 15: PASSED
Test 16: PASSED
Test 17: PASSED
Test 18: PASSED
Test 19: PASSED
Test 20: PASSED
Test 21: PASSED
Test 22: PASSED
Test 23: PASSED
Test 24: PASSED

Total tests: 24
Passed tests: 24
Failed tests: 0
