In [24]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import transformers
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, ConcatDataset
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset
from functools import partial
from transformers import get_linear_schedule_with_warmup, AutoConfig 
from transformers import BartTokenizer,BartModel,BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model
from transformers import BartConfig
from transformers import AutoTokenizer
from transformers import AdamW
from torch.autograd import Variable
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
import socket
from os.path import basename
from functools import reduce
import re
import itertools
import json
from pathlib import Path
import glob
import sys
import subprocess
import argparse
import sys

# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(0)


<torch._C.Generator at 0x7fa0282fd0d0>

In [25]:
# Report the versions of the three main libraries used by this notebook.
print(f"my version of transformers is {transformers.__version__}")
print(f"my version of pytorch is {torch.__version__}")
print(f"my version of pytorch_lightning is {pl.__version__}")

my version of transformers is 4.15.0
my version of pytorch is 1.10.0
my version of pytorch_lightning is 1.9.3


In [26]:
### States
# test_state: when True, the smoke-test cells below build each dataset and run the sample checks.
test_state = True
# tensorflow_active: not referenced by any other cell visible in this notebook.
tensorflow_active = True
# use_gpu: when True, later cells create GPU trainers and move the model / inputs to CUDA.
use_gpu = False

In [27]:
class TextToGraphQLDataset(Dataset):
    """Pre-tokenized (question + schema) -> GraphQL query pairs.

    Every ``simpleSchema.json`` found one directory below
    ``./SPEGQL-dataset/Schemas/`` is flattened into a token list and stored in
    ``name_to_schema`` under the name of its parent directory. Each example of
    ``./SPEGQL-dataset/dataset/<type_path>`` is then tokenized eagerly.

    Args:
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        type_path: file name of the split inside the dataset directory.
        block_size: length the target query is padded / truncated to.
    """

    PROMPT_PREFIX = 'translate English to GraphQL: '

    def __init__(self, tokenizer, type_path='train.json', block_size=102):
        super().__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []
        self.schema_ids = []
        root_path = './SPEGQL-dataset/'
        dataset_path = root_path + 'dataset/' + type_path
        schemas_path = root_path + 'Schemas/'

        # Kept because it was a public attribute; nothing in this class updates it.
        self.max_len = 0
        self.name_to_schema = {}
        # Without recursive=True, '**' behaves like '*': one directory level is matched.
        for schema_path in glob.glob(schemas_path + '**/' + 'simpleSchema.json'):
            with open(schema_path, 'r', encoding='utf-8') as schema_file:
                schema = json.load(schema_file)
            schema_name = Path(schema_path).parent.name
            self.name_to_schema[schema_name] = self._schema_to_tokens(schema)

        with open(dataset_path, 'r', encoding='utf-8') as dataset_file:
            examples = json.load(dataset_file)

        for element in examples:
            # Single source of truth for the prompt format (shared with inference).
            question_with_schema = self.get_question_with_schema(element['question'], element['schemaId'])
            tokenized_s = tokenizer.encode_plus(question_with_schema, max_length=1024, padding=True, truncation=True, return_tensors='pt')
            self.source.append(tokenized_s)

            tokenized_t = tokenizer.encode_plus(element['query'], max_length=block_size, padding='max_length', truncation=True, return_tensors='pt')
            self.target.append(tokenized_t)
            self.schema_ids.append(element['schemaId'])

    @staticmethod
    def _schema_to_tokens(schema):
        """Flatten one schema dict into ``<t> type { fields } </t> ... <a> args </a>`` tokens.

        Works for a schema whose ``types`` list is empty (only the argument
        section is emitted).
        """
        tokens = []
        for type_def in schema['types']:
            tokens += ['<t>', type_def['name'], '{']
            tokens += [field['name'] for field in type_def['fields']]
            tokens += ['}', '</t>']
        tokens += ['<a>'] + [argument['name'] for argument in schema['arguments']] + ['</a>']
        return tokens

    def get_question_with_schema(self, question, schemaId):
        """Return the model prompt for ``question`` against the schema ``schemaId``.

        Raises:
            KeyError: if ``schemaId`` is not a key of ``name_to_schema``.
        """
        return self.PROMPT_PREFIX + question + ' ' + ' '.join(self.name_to_schema[schemaId])

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.source)

    def __getitem__(self, index):
        """Return the tensors of one example with the leading batch axis removed."""
        # squeeze(0) drops only the batch axis added by return_tensors='pt',
        # so a one-token sequence stays 1-D.
        source_ids = self.source[index]['input_ids'].squeeze(0)
        target_ids = self.target[index]['input_ids'].squeeze(0)
        src_mask = self.source[index]['attention_mask'].squeeze(0)

        return {
            'source_ids': source_ids,
            'source_mask': src_mask,
            'target_ids': target_ids,
            'target_ids_y': target_ids,
        }

# Expose the class as an attribute of the `__main__` module object.
# NOTE(review): presumably a workaround for spawned worker processes failing to unpickle
# the dataset (cf. the "Can't get attribute 'TextToGraphQLDataset'" traceback captured in
# the last cell) — confirm it is still required.
sys.modules["__main__"].TextToGraphQLDataset = TextToGraphQLDataset

In [28]:
# Smoke test: build the GraphQL training split and fetch its size and first sample.
if test_state:
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    dataset = TextToGraphQLDataset(tokenizer=tokenizer, block_size=102, type_path='train.json')

    length = len(dataset)
    item = dataset[0]
    print("TextToGraphQLDataset test done")

TextToGraphQLDataset test done


In [29]:
class MaskGraphQLDataset(Dataset):
    """Single-token masking samples built from the GraphQL queries.

    For each query of ``./SPEGQL-dataset/dataset/<type_path>`` the encoded
    sequence is copied once per token position (starting at position 1 and
    stopping at the first EOS token); in each copy that one position is
    replaced by ``tokenizer.mask_token_id`` and the re-encoded original token
    becomes the target. Every (source, target) pair is stored exactly once.

    Args:
        tokenizer: object exposing ``encode``, ``convert_ids_to_tokens``,
            ``eos_token_id`` and ``mask_token_id``.
        type_path: file name of the split inside the dataset directory.
        block_size: length the query is padded / truncated to.
    """

    def __init__(self, tokenizer, type_path='train.json', block_size=64):
        super().__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []
        path = './SPEGQL-dataset/dataset/' + type_path
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for example in data:
            encoded_query = tokenizer.encode(
                example['query'], max_length=block_size, padding='max_length',
                truncation=True, return_tensors='pt').squeeze(0)

            # NOTE(review): position 0 is never masked (the range starts at 1) — confirm
            # this is intended for a tokenizer that emits no leading BOS token.
            for pos in range(1, encoded_query.shape[0]):
                target_id = encoded_query[pos].item()
                if target_id == tokenizer.eos_token_id:
                    break

                # Each sample masks exactly one position of an otherwise intact copy.
                masked_source = encoded_query.clone()
                masked_source[pos] = tokenizer.mask_token_id

                decoded_target = ''.join(tokenizer.convert_ids_to_tokens([target_id]))
                encoded_target = tokenizer.encode(
                    decoded_target, return_tensors='pt', max_length=4,
                    padding='max_length', truncation=True).squeeze(0)

                if torch.numel(encoded_target) > 0:
                    self.source.append(masked_source)
                    self.target.append(encoded_target)

    def __len__(self):
        """Number of masked samples."""
        return len(self.source)

    def __getitem__(self, index):
        """Return the masked source ids and the encoded target token for one sample."""
        return {'source_ids': self.source[index],
                'target_id': self.target[index]}

In [30]:
# Smoke test: extend the T5 tokenizer with the notebook's marker tokens, then build
# the masked GraphQL training split.
if test_state:
    tokenizer = AutoTokenizer.from_pretrained("t5-base")

    # The special-token map is read before the plain tokens are added, then extended
    # with the mask token and the schema markers.
    special_tokens_dict = {
        **tokenizer.special_tokens_map,
        'mask_token': '<mask>',
        'additional_special_tokens': ['<t>', '</t>', '<a>', '</a>'],
    }
    tokenizer.add_tokens(['{', '}', '<c>', '</c>'])
    tokenizer.add_special_tokens(special_tokens_dict)
    print(tokenizer.mask_token)

    dataset = MaskGraphQLDataset(tokenizer=tokenizer, block_size=64, type_path='train.json')
    print("MaskGraphQLDataset test done")

<mask>
MaskGraphQLDataset test done


In [31]:
class SpiderDataset(Dataset):
    """Pre-tokenized (question + database schema) -> SQL pairs from ``./spider/``.

    ``tables.json`` is indexed by ``db_id``; for each example of ``<type_path>``
    the tables and columns of its database are serialized after the question and
    the result is tokenized eagerly.

    Args:
        tokenizer: object exposing ``batch_encode_plus`` (a Hugging Face tokenizer).
        type_path: file name of the split inside ``./spider/``.
        block_size: length the target SQL query is padded / truncated to.
    """

    def __init__(self, tokenizer, type_path='train_spider.json', block_size=102):
        super().__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []
        spider_path = './spider/'
        path = spider_path + type_path
        tables_path = spider_path + 'tables.json'

        # Explicit UTF-8, like the other dataset loaders in this notebook, so the
        # result does not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f, open(tables_path, 'r', encoding='utf-8') as t:
            databases = json.load(t)
            data = json.load(f)

        grouped_dbs = {db['db_id']: db for db in databases}

        for element in data:
            tables_with_columns = self._serialize_schema(grouped_dbs[element['db_id']])
            db_with_question = 'translate English to SQL: ' + element['question'] + ' ' + tables_with_columns

            # NOTE: every source is padded to the full 1024 tokens here.
            tokenized_s = tokenizer.batch_encode_plus([db_with_question], max_length=1024, padding='max_length', truncation=True, return_tensors='pt')
            self.source.append(tokenized_s)

            tokenized_t = tokenizer.batch_encode_plus([element['query']], max_length=block_size, padding='max_length', truncation=True, return_tensors='pt')
            self.target.append(tokenized_t)

    @staticmethod
    def _serialize_schema(db):
        """Render one database as ``<t> table <c> col col </c> </t> `` per table.

        Consecutive entries of ``column_names_original`` that share a table index
        form one group; the group with index -1 is skipped.
        """
        table_names = db['table_names_original']
        pieces = []
        for table_id, columns in itertools.groupby(db['column_names_original'], key=lambda column: column[0]):
            if table_id == -1:
                continue
            column_names = " ".join(column[1] for column in columns)
            pieces.append('<t> ' + table_names[table_id] + ' <c> ' + column_names + ' </c> ' + '</t> ')
        return ''.join(pieces)

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.source)

    def __getitem__(self, index):
        """Return the tensors of one example with the leading batch axis removed."""
        source_ids = self.source[index]['input_ids'].squeeze(0)
        target_ids = self.target[index]['input_ids'].squeeze(0)
        src_mask = self.source[index]['attention_mask'].squeeze(0)
        return {'source_ids': source_ids,
                'source_mask': src_mask,
                'target_ids': target_ids,
                'target_ids_y': target_ids}


# # In[38]:



In [32]:
# Smoke test: build the Spider training split and fetch its size and first sample.
if test_state:
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    dataset = SpiderDataset(tokenizer=tokenizer, block_size=102, type_path='train_spider.json')

    length = len(dataset)
    item = dataset[0]
    print("SpiderDataset test done")

SpiderDataset test done


In [33]:
class CoSQLMaskDataset(Dataset):
    """Single-token masking samples built from the CoSQL SQL queries.

    For every interaction of every element in
    ``./cosql_dataset/sql_state_tracking/<type_path>`` the encoded query is copied
    once per token position (starting at position 1 and stopping at the first EOS
    token); in each copy that one position is replaced by
    ``tokenizer.mask_token_id`` and the re-encoded original token is the target.

    Args:
        tokenizer: object exposing ``encode``, ``convert_ids_to_tokens``,
            ``eos_token_id`` and ``mask_token_id``.
        type_path: file name of the split inside the CoSQL directory.
        block_size: length the query is padded / truncated to.
    """

    def __init__(self, tokenizer, type_path='cosql_train.json', block_size=64):
        super().__init__()
        self.tokenizer = tokenizer

        self.source = []
        self.target = []
        path = './cosql_dataset/sql_state_tracking/' + type_path
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for element in data:
            for interaction in element['interaction']:
                encoded_query = tokenizer.encode(
                    interaction['query'], max_length=block_size, padding='max_length',
                    truncation=True, return_tensors='pt').squeeze(0)

                for pos in range(1, encoded_query.shape[0]):
                    target_id = encoded_query[pos].item()
                    if target_id == tokenizer.eos_token_id:
                        break

                    # Mask one position of a fresh copy; earlier positions stay intact.
                    masked_source = encoded_query.clone()
                    masked_source[pos] = tokenizer.mask_token_id

                    decoded_target = ''.join(tokenizer.convert_ids_to_tokens([target_id]))
                    # The target is padded / truncated to 4 ids.
                    encoded_target = tokenizer.encode(
                        decoded_target, return_tensors='pt', max_length=4,
                        padding='max_length', truncation=True).squeeze(0)

                    self.target.append(encoded_target)
                    self.source.append(masked_source)

    def __len__(self):
        """Number of masked samples."""
        return len(self.source)

    def __getitem__(self, index):
        """Return the masked source ids and the encoded target token for one sample."""
        return {'source_ids': self.source[index],
                'target_id': self.target[index]}

In [34]:
# Smoke test: extend the T5 tokenizer with the notebook's marker tokens, then build
# the masked CoSQL training split and fetch its size and first sample.
if test_state:
    tokenizer = AutoTokenizer.from_pretrained("t5-base")

    # The special-token map is read before the plain tokens are added, then extended
    # with the mask token and the schema markers.
    special_tokens_dict = {
        **tokenizer.special_tokens_map,
        'mask_token': '<mask>',
        'additional_special_tokens': ['<t>', '</t>', '<a>', '</a>'],
    }
    tokenizer.add_tokens(['{', '}', '<c>', '</c>'])
    tokenizer.add_special_tokens(special_tokens_dict)
    print(tokenizer.mask_token)

    dataset = CoSQLMaskDataset(tokenizer=tokenizer, block_size=64, type_path='cosql_train.json')

    length = len(dataset)
    item = dataset[0]
    print("CoSQLMaskDataset test done")

<mask>
CoSQLMaskDataset test done


In [35]:
class T5MultiSPModel(pl.LightningModule):
    """LightningModule around ``t5-base`` for two tasks.

    * ``task='finetune'``: question + schema -> query, trained on the
      concatenation of ``TextToGraphQLDataset`` and ``SpiderDataset``.
    * any other ``task`` (default ``'denoise'``): single-token masking, trained
      on ``MaskGraphQLDataset`` + ``CoSQLMaskDataset``.

    Args:
        hyperparams: namespace exposing ``lr``.
        task: selects the datasets and the batch keys used by ``_step``.
        test_flag: ``'graphql'`` tests on the GraphQL set and reports exact-match
            accuracy; anything else tests on Spider and writes predictions to
            ``test_predictions.txt`` in the working directory.
        train_sampler: stored on the instance; not used by the dataloaders below.
        batch_size: batch size of all three dataloaders.
        temperature, top_k, top_p, num_beams: forwarded to ``generate`` in
            ``_generate_step``.
    """

    def __init__(self, hyperparams, task='denoise', test_flag='graphql', train_sampler=None, batch_size=2, temperature=1.0, top_k=50, top_p=1.0, num_beams=1):
        super(T5MultiSPModel, self).__init__()

        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.num_beams = num_beams

        self.hyperparams = hyperparams

        self.task = task
        self.test_flag = test_flag
        self.train_sampler = train_sampler
        self.batch_size = batch_size

        # Both tasks start from the same pretrained checkpoint.
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')

        # Not used by _step (the model computes its own loss from `labels`).
        self.criterion = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        self.add_special_tokens()

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Delegate to the wrapped ``T5ForConditionalGeneration``."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def add_special_tokens(self):
        """Add the mask / schema marker tokens and resize the embedding matrix to match."""
        special_tokens_dict = self.tokenizer.special_tokens_map
        special_tokens_dict['mask_token'] = '<mask>'
        special_tokens_dict['additional_special_tokens'] = ['<t>', '</t>', '<a>', '</a>']
        self.tokenizer.add_tokens(['{', '}', '<c>', '</c>'])
        self.tokenizer.add_special_tokens(special_tokens_dict)
        self.model.resize_token_embeddings(len(self.tokenizer))

    def _step(self, batch):
        """Return the model loss for one batch.

        Pad positions in the target are replaced by -100 before being passed as
        ``labels``. Only the fine-tune task passes an attention mask; the denoise
        batches carry none.
        """
        if self.task == 'finetune':
            y = batch["target_ids"]
            attention_mask = batch["source_mask"]
        else:
            y = batch['target_id']
            attention_mask = None

        labels = y.clone()
        labels[y == self.tokenizer.pad_token_id] = -100
        outputs = self(batch["source_ids"], attention_mask=attention_mask, labels=labels)
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._step(batch)

        # The "log" key of the returned dict is kept for compatibility; self.log is
        # what sends the value to the attached logger.
        self.log("train_loss", loss)
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Compute, print and log the validation loss for one batch."""
        loss = self._step(batch)

        print(f'Validation step called, batch_idx: {batch_idx}, loss: {loss.item()}')

        self.log("val_loss", loss)
        return {"val_loss": loss}

    def on_validation_epoch_end(self, outputs=None):
        """Average ``val_loss`` over ``outputs`` when outputs are supplied.

        NOTE(review): with pytorch_lightning 1.9.x this hook is presumably invoked
        without arguments, in which case ``outputs`` is always None and the early
        return below is taken — confirm; ``validation_epoch_end(outputs)`` is the
        hook that receives the step outputs in that version.
        """
        if not outputs:
            print("Empty outputs list.")
            return
        print("outputs " + str(outputs))
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {'progress_bar': tensorboard_logs, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Return a single AdamW optimizer over all parameters.

        A linear schedule is created and stored on ``self.lr_scheduler`` but is not
        returned, so nothing in this class steps it. Both parameter groups use a
        weight decay of 0.0.
        """
        t_total = len(self.train_dataloader()) * self.trainer.max_epochs * self.trainer.limit_train_batches
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {"params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hyperparams.lr, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return [optimizer]

    def _generate_step(self, batch):
        """Generate for a batch and return ``(preds, target)`` as decoded strings."""
        generated_ids = self.model.generate(
            batch["source_ids"],
            attention_mask=batch["source_mask"],
            num_beams=self.num_beams,
            max_length=1000,
            temperature=self.temperature,
            top_k=self.top_k,
            top_p=self.top_p,
            length_penalty=1.0,
            early_stopping=True,
        )

        preds = [
            self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for g in generated_ids
        ]
        target = [
            self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for t in batch["target_ids"]
        ]
        return (preds, target)

    @staticmethod
    def _exact_match_accuracy(preds, target):
        """Fraction of predictions equal to their target after stripping whitespace."""
        if not preds:
            return 0.0
        hits = sum(1 for pred, ref in zip(preds, target) if pred.strip() == ref.strip())
        return hits / len(preds)

    def test_step(self, batch, batch_idx):
        """Return the test loss plus accuracy (GraphQL) or raw predictions (otherwise)."""
        preds, target = self._generate_step(batch)
        loss = self._step(batch)
        if self.test_flag == 'graphql':
            # No `exact_match` module is imported in this notebook, so the bare
            # reference raised NameError. Use it when the session provides one,
            # otherwise fall back to the string comparison above.
            # NOTE(review): confirm the fallback matches the external helper's definition.
            matcher = globals().get('exact_match')
            if matcher is not None:
                accuracy = matcher.exact_match_accuracy(preds, target)
            else:
                accuracy = self._exact_match_accuracy(preds, target)
            return {"test_loss": loss, "test_accuracy": torch.tensor(accuracy)}
        else:
            return {"test_loss": loss, "preds": preds, "target": target}

    def test_epoch_end(self, outputs):
        """Average the test metrics; for non-GraphQL runs also write the predictions to disk."""
        avg_loss = torch.stack([x["test_loss"] for x in outputs]).mean()

        if self.test_flag == 'graphql':
            avg_acc = torch.stack([x["test_accuracy"] for x in outputs]).mean()
            tensorboard_logs = {"test_loss": avg_loss, "test_acc": avg_acc}
            return {"progress_bar": tensorboard_logs, "log": tensorboard_logs}

        else:
            output_test_predictions_file = os.path.join(os.getcwd(), "test_predictions.txt")
            # The context manager closes the file; no explicit close() is needed.
            with open(output_test_predictions_file, "w+") as p_writer:
                for output_batch in outputs:
                    p_writer.writelines(s + "\n" for s in output_batch["preds"])
            tensorboard_logs = {"test_loss": avg_loss}
            return {"progress_bar": tensorboard_logs, "log": tensorboard_logs}

    def prepare_data(self):
        """Build the train / val (and, for fine-tuning, test) datasets for ``self.task``."""
        if self.task == 'finetune':
            self.train_dataset_g = TextToGraphQLDataset(self.tokenizer)
            self.val_dataset_g = TextToGraphQLDataset(self.tokenizer, type_path='dev.json')
            self.test_dataset_g = TextToGraphQLDataset(self.tokenizer, type_path='dev.json')

            self.train_dataset_s = SpiderDataset(self.tokenizer)
            self.val_dataset_s = SpiderDataset(self.tokenizer, type_path='dev.json')
            self.test_dataset_s = SpiderDataset(self.tokenizer, type_path='dev.json')

            self.train_dataset = ConcatDataset([self.train_dataset_g, self.train_dataset_s])
            self.val_dataset = ConcatDataset([self.val_dataset_g, self.val_dataset_s])
            if self.test_flag == 'graphql':
                self.test_dataset = self.test_dataset_g
            else:
                self.test_dataset = self.test_dataset_s

        else:
            train_dataset_g = MaskGraphQLDataset(self.tokenizer)
            val_dataset_g = MaskGraphQLDataset(self.tokenizer, type_path='dev.json')

            train_dataset_s = CoSQLMaskDataset(self.tokenizer)
            val_dataset_s = CoSQLMaskDataset(self.tokenizer, type_path='cosql_dev.json')

            self.train_dataset = ConcatDataset([train_dataset_g, train_dataset_s])
            self.val_dataset = ConcatDataset([val_dataset_g, val_dataset_s])

    @staticmethod
    def custom_collate_fn(batch):
        """Right-pad every field with zeros to the longest sample in the batch and stack.

        All keys are handled the same way, so masks and ids alike are padded with 0.
        """
        collated_batch = {}
        for key in batch[0].keys():
            max_length = max(len(sample[key]) for sample in batch)
            padded_tensors = [
                torch.cat([sample[key], torch.zeros(max_length - len(sample[key]), dtype=torch.long)], dim=0)
                for sample in batch
            ]
            collated_batch[key] = torch.stack(padded_tensors, dim=0)

        return collated_batch

    def train_dataloader(self):
        """Shuffled loader over the training set."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.custom_collate_fn, num_workers=0)

    def val_dataloader(self):
        """Loader over the validation set."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self.custom_collate_fn, num_workers=0)

    def test_dataloader(self):
        """Loader over the test set."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=self.custom_collate_fn, num_workers=0)

In [36]:

# %load_ext tensorboard
# Reload the TensorBoard notebook extension.
%reload_ext tensorboard

# Display TensorBoard inline, reading logs from the lightning_logs/ directory.
%tensorboard --logdir lightning_logs/



In [37]:
# Hyper-parameters read by T5MultiSPModel.configure_optimizers (only `lr` is used).
hyperparams = argparse.Namespace(lr=0.0004365158322401656)  # for 3 epochs

# The model is created with the default task ('denoise') and a batch size of 32.
system = T5MultiSPModel(hyperparams, batch_size=32)
print("We initialize the T5MultiSPModel(hyperparams,batch_size=32)")

We initialize the T5MultiSPModel(hyperparams,batch_size=32)


In [38]:
# Initialize the logger (TensorBoard event files are written under lightning_logs/).
logger = TensorBoardLogger("lightning_logs/")
# Pass the logger to the Trainer.
# NOTE(review): the training cells below assign new Trainer objects to `trainer`
# without passing `logger`, so this instance is not the one used for fitting.
trainer = pl.Trainer(logger=logger)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [39]:
# Denoising stage: reuse cached weights when present, otherwise train and cache them.
if os.path.exists('model_weights.pth'):
    # map_location='cpu' keeps a checkpoint that was written from a GPU loadable on a
    # CPU-only machine; load_state_dict then copies the tensors into the existing
    # parameters on whatever device they live on.
    system.load_state_dict(torch.load('model_weights.pth', map_location='cpu'))
    print("model weights loaded from model_weights.pth")

else:
    # If the weights file doesn't exist, train the model and save the weights after training
    print("lets train this model!")
    trainer_kwargs = dict(max_epochs=1, log_every_n_steps=1, limit_train_batches=0.2)
    if use_gpu:
        # accelerator/devices replaces the deprecated `gpus=1` argument.
        trainer_kwargs.update(accelerator='gpu', devices=1)
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(system)
    torch.save(system.state_dict(), 'model_weights.pth')

model weights loaded from model_weights.pth


In [40]:
# Build the datasets for the current `system.task` so that `system.val_dataset`
# exists for the sample generation in the next cell.
system.prepare_data()

In [41]:
# Generate from the first validation sample as a quick sanity check.
inputs = system.val_dataset[0]
system.tokenizer.decode(inputs['source_ids'])

if use_gpu:
    system.model = system.model.cuda()
else:
    system.model = system.model.cpu()

# The input must live on the same device as the model, otherwise generate() fails
# with a device mismatch when use_gpu is True.
source_ids = inputs['source_ids'].unsqueeze(0).to(system.model.device)
generated_ids = system.model.generate(source_ids, num_beams=5, repetition_penalty=1.0, max_length=56, early_stopping=True)

hyps = [system.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]

print("hyps")
print(hyps)

hyps
['{']


In [42]:
# Fine-tuning stage: reuse cached weights when present, otherwise fine-tune and cache them.
if os.path.exists('fine_tuned_model_weights.pth'):
    # Load the model weights if the file exists
    print("Model is allready fine-tuned, loading weights...")
    # map_location='cpu' keeps a checkpoint written from a GPU loadable on a CPU-only machine.
    system.load_state_dict(torch.load('fine_tuned_model_weights.pth', map_location='cpu'))
    print("fine_tuned_model_weights.pth loaded")

else:
    print("Let's fine-tune this model!")
    # `system` was created with the default task ('denoise'); switch it so that fit()
    # builds the text-to-query datasets and _step reads the fine-tune batch keys.
    system.task = 'finetune'
    # `progress_bar_refresh_rate` is omitted: the pytorch_lightning version printed at
    # the top of this notebook (1.9.3) no longer accepts that Trainer argument.
    trainer_kwargs = dict(max_epochs=5, val_check_interval=0.5)
    if use_gpu:
        trainer_kwargs.update(accelerator='gpu', devices=1)
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(system)
    torch.save(system.state_dict(), 'fine_tuned_model_weights.pth')
  
from pytorch_lightning.callbacks import ModelCheckpoint

Model is allready fine-tuned, loading weights...
fine_tuned_model_weights.pth loaded


In [52]:
def predict(prompt, schemaId):
    """Generate a GraphQL query for ``prompt`` against the schema ``schemaId``.

    The schema is looked up first in the training GraphQL dataset and then in the
    validation one (both are created by ``system.prepare_data()`` in fine-tune
    mode). The prompt and the shape of the encoded input are printed.

    Args:
        prompt: the English question.
        schemaId: schema name, a key of a dataset's ``name_to_schema``.

    Returns:
        dict: ``{"prediction": <decoded text of the first generated sequence>}``.

    Raises:
        KeyError: if neither dataset knows ``schemaId``.
    """
    for dataset in (system.train_dataset_g, system.val_dataset_g):
        # Membership test instead of indexing, so an id missing from the first
        # dataset falls through to the second instead of raising immediately.
        if schemaId in dataset.name_to_schema:
            input_string = dataset.get_question_with_schema(prompt, schemaId)
            break
    else:
        raise KeyError(f"unknown schemaId {schemaId!r}: not present in the train or dev GraphQL schemas")
    print(input_string)

    # truncation=True makes the truncation to max_length explicit.
    inputs = system.tokenizer.batch_encode_plus([input_string], max_length=1024, truncation=True, return_tensors='pt')['input_ids']
    print(inputs.shape)

    if use_gpu:
        inputs = inputs.cuda()
    generated_ids = system.model.generate(inputs, num_beams=3, repetition_penalty=1.0, max_length=1000, early_stopping=True)
    hyps = [system.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
    return {"prediction": hyps[0]}

In [61]:
import json
from graphql import parse, print_ast

def are_queries_semantically_same(query1, query2):
    """Return True when both queries select the same tree of field names.

    Each query is parsed and the selection set of its first definition is turned
    into nested ``{field name: sub-selection or None}`` dicts, which are compared
    with ``==``; sibling order therefore does not matter.

    NOTE(review): only field names are compared — arguments are not part of the
    comparison, so two queries that differ only in their arguments are reported
    as the same. Confirm this is the intended notion of "same".

    Returns:
        bool: False when either query fails to parse, or when a parsed document
        does not have the expected shape (no definitions, a first definition
        without a selection set, or a selection without a name).
    """
    def extract_fields(selection_set):
        return {
            field.name.value: extract_fields(field.selection_set) if field.selection_set else None
            for field in selection_set.selections
        }

    try:
        ast1 = parse(query1)
        ast2 = parse(query2)
    except Exception:
        # A query that cannot be parsed can never match.
        return False

    try:
        fields1 = extract_fields(ast1.definitions[0].selection_set)
        fields2 = extract_fields(ast2.definitions[0].selection_set)
    except (AttributeError, IndexError):
        # The text parsed but is not a plain field selection; treat it as a
        # mismatch instead of aborting the evaluation loop.
        return False

    # Dict equality is order-insensitive at every nesting level, so no sorting is needed.
    return fields1 == fields2


In [45]:
# Demo of are_queries_semantically_same. Everything, including the calls, sits under
# the test_state guard so the cell does not fail on undefined names when the flag is off.
if test_state:
  query1 = """
  query {
    user {
      id
      name
      email
      posts {
        title
        content
      }
    }
  }
  """

  query2 = """
  query {
    user {
      name
      id
      posts {
        content
        title
      }
      email
    }
  }
  """

  query3 = """
  query {
    user {
      id
      name
      posts {
        title
        content
      }
    }
  }
  """

  print(are_queries_semantically_same(query1, query2))  # Should print True
  print(are_queries_semantically_same(query1, query3))  # Should print False

  query4 = """
query {
matches_aggregate (where: { _and: { winner_hand: { _eq: "L" }, tourney_name: { _eq: "WTA Championships" } } }) {
aggregate {
count
}
}
}
"""

  query5 = """
query {
matches_aggregate (where: { _and: { tourney_name: { _eq: "WTA Championships" }, winner_hand: { _eq: "L" } } }) {
aggregate {
count
}
}
}
"""

  print(are_queries_semantically_same(query4, query5))  # Should print True

True
False
True


In [62]:
import json
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

def custom_test(dataset_path):
    """Run ``predict`` on every example of a dataset file and print match metrics.

    For each example the prediction is compared with the reference query twice:
    by string equality ("exact match") and with
    ``are_queries_semantically_same`` ("semantic match"). Accuracy, F1, precision
    and recall are printed for both.

    Args:
        dataset_path: path of a JSON list of examples with the keys
            ``question``, ``schemaId`` and ``query``.
    """
    # Read as UTF-8, the same way the dataset loader reads this file, so the
    # result does not depend on the platform's default encoding.
    with open(dataset_path, "r", encoding="utf-8") as f:
        dev_data = json.load(f)

    exact_match = []
    semantic_match = []

    for example in dev_data:
        prompt = example["question"]
        schemaId = example["schemaId"]
        query = example["query"]

        prediction = predict(prompt, schemaId)["prediction"]

        exact_match.append(1 if prediction == query else 0)
        semantic_match.append(1 if are_queries_semantically_same(prediction, query) else 0)

    def calculate_metrics(match_results):
        # NOTE(review): the match flags are passed in the y_true position and the
        # all-ones vector in the y_pred position; confirm this orientation is the
        # intended one for precision and recall.
        reference = np.ones(len(match_results))
        accuracy = accuracy_score(match_results, reference)
        f1 = f1_score(match_results, reference)
        precision = precision_score(match_results, reference)
        recall = recall_score(match_results, reference)
        return accuracy, f1, precision, recall

    # Compute both metric sets before printing anything.
    reports = [
        ("Exact match:", calculate_metrics(exact_match)),
        ("\nSemantic match:", calculate_metrics(semantic_match)),
    ]

    for title, (accuracy, f1, precision, recall) in reports:
        print(title)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")



In [47]:
# Switch the model to the fine-tuning task and rebuild its datasets; in that mode
# prepare_data() creates the train / val GraphQL datasets that predict() reads schemas from.
system.task = 'finetune'
system.prepare_data()

In [48]:
# Single hand-written example: one question asked against the "battle_death" schema.
hardcoded_schemaId = "battle_death"
hardcoded_prompt = "How many ships ended up being not 'Captured'?"

result = predict(hardcoded_prompt, hardcoded_schemaId)
print("this is the result")
print(result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


translate English to GraphQL: How many ships ended up being not 'Captured'? <t> battle { bulgarian_commander date id latin_commander name result ships ships_aggregate } </t> <t> battle_aggregate { aggregate nodes } </t> <t> death { caused_by_ship_id id injured killed note ship } </t> <t> death_aggregate { aggregate nodes } </t> <t> ship { battle deaths deaths_aggregate disposition_of_ship id location lost_in_battle name ship_type tonnage } </t> <t> ship_aggregate { aggregate nodes } </t> <a> distinct_on limit offset order_by where </a>
torch.Size([1, 147])
this is the result
{'prediction': 'query { ship_aggregate ( where : { tonnage : { _lt : "Captured" } } ) { aggregate { count } } }'}


In [63]:
# Run the exact-match / semantic-match evaluation over the SPEGQL dev split.
# predict() prints every prompt and input shape, so this cell's output is long.
dev_dataset_path = 'SPEGQL-dataset/dataset/dev.json'

custom_test(dev_dataset_path)


translate English to GraphQL: How many ships ended up being 'Captured'? <t> battle { bulgarian_commander date id latin_commander name result ships ships_aggregate } </t> <t> battle_aggregate { aggregate nodes } </t> <t> death { caused_by_ship_id id injured killed note ship } </t> <t> death_aggregate { aggregate nodes } </t> <t> ship { battle deaths deaths_aggregate disposition_of_ship id location lost_in_battle name ship_type tonnage } </t> <t> ship_aggregate { aggregate nodes } </t> <a> distinct_on limit offset order_by where </a>
torch.Size([1, 146])
translate English to GraphQL: List the name and tonnage ordered by in descending alphaetical order for the names. <t> battle { bulgarian_commander date id latin_commander name result ships ships_aggregate } </t> <t> battle_aggregate { aggregate nodes } </t> <t> death { caused_by_ship_id id injured killed note ship } </t> <t> death_aggregate { aggregate nodes } </t> <t> ship { battle deaths deaths_aggregate disposition_of_ship id location

In [None]:
# system.num_beams = 3
# system.test_flag = 'graphql'
# system.prepare_data()
# trainer.test(system)

Testing: 0it [00:00, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/jakobtolstrup/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/jakobtolstrup/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TextToGraphQLDataset' on <module '__main__' (built-in)>
