In [1]:
# Install dependencies
!pip install transformers datasets tokenizers sentencepiece torch sentence-transformers tqdm nltk pandas numpy
# Fetch NLTK's 'punkt' data package.
import nltk
nltk.download('punkt')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 11.9 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 6.6 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 47.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 43.4 MB/s 
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
import pandas as pd

# Load the sampled, preprocessed question/answer/context records from a local JSON file.
# NOTE(review): the evaluation cell reads the "passage", "question", "answer" and
# "source" fields of every record, so the file is expected to provide them.
sampled_question_answer_context_df = pd.read_json('./sampled_question_answer_context.json')

# sampled_question_answer_context_df.head()

In [None]:
# T5-base model fine-tuned on SQuAD v1 for Question generation 
from transformers import AutoTokenizer, T5ForConditionalGeneration

class T5BaseQG():
  """Question generator wrapping the "mrm8488/t5-base-finetuned-question-generation-ap" checkpoint."""

  def __init__(self, max_length = 128):
    """Load the tokenizer and model.

    Args:
      max_length: value forwarded as `max_length` to `model.generate`.
    """
    self.model_name = "mrm8488/t5-base-finetuned-question-generation-ap"
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
    self.max_length = max_length

  def generate_question(self, answer, context):
    """Generate one question for `answer` given `context`.

    Args:
      answer: answer text the question should target.
      context: passage containing the answer.

    Returns:
      The decoded question, with a leading "question: " removed when present.
    """
    tokenizer = self.tokenizer
    model = self.model
    input_text = "answer: %s  context: %s </s>" % (answer, context)
    features = tokenizer([input_text], return_tensors='pt')

    output = model.generate(input_ids=features['input_ids'],
                attention_mask=features['attention_mask'],
                max_length=self.max_length)

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    # Strip the prefix only when it is really there; slicing a fixed number of
    # characters would silently eat the start of an un-prefixed question.
    prefix = "question: "
    if decoded.startswith(prefix):
      return decoded[len(prefix):]
    return decoded

# Make T5-base the active question-generation model. Every model cell rebinds
# `qg_model`, so whichever of these cells ran last is the one that gets evaluated.
qg_model = T5BaseQG()
# Label used as the prefix of the results file name.
qg_model_name = 't5-base'

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [None]:
# T5-large model fine-tuned on SQuAD v1 for Question generation 

import torch
from transformers import PreTrainedTokenizerFast, T5ForConditionalGeneration

class T5LargeQG():
  """Question generator wrapping the "Sehong/t5-large-QuestionGeneration" checkpoint."""

  def __init__(self, max_length = 128):
    """Load the tokenizer and model.

    Args:
      max_length: value forwarded as `max_length` to `model.generate`.
    """
    self.model_name = "Sehong/t5-large-QuestionGeneration"
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained(self.model_name)
    self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
    self.max_length = max_length

  def generate_question(self, answer, content):
    """Generate one question for `answer` given the passage `content`.

    Args:
      answer: answer text the question should target.
      content: passage containing the answer.

    Returns:
      The decoded question after removing ' # # ' / ' ##' artifacts and
      collapsing double spaces.
    """
    tokenizer = self.tokenizer
    model = self.model
    input_text = "answer:%s content:%s </s>" % (answer, content)
    raw_input_ids = tokenizer.encode(input_text)
    # The encoded prompt is wrapped in explicit BOS/EOS ids before generation.
    input_ids = [tokenizer.bos_token_id] + raw_input_ids + [tokenizer.eos_token_id]
    # Forward the configured limit: it was stored in __init__ but never passed
    # on, so the constructor argument had no effect on generation.
    question_ids = model.generate(torch.tensor([input_ids]), max_length=self.max_length)
    # Take the single sequence of the batch by index; squeeze() would turn a
    # one-token output into a scalar instead of a list.
    decode = tokenizer.decode(question_ids[0].tolist(), skip_special_tokens=True)
    decode = decode.replace(' # # ', '').replace('  ', ' ').replace(' ##', '')
    return decode

# Make T5-large the active question-generation model (rebinds `qg_model`).
qg_model = T5LargeQG()
# Label used as the prefix of the results file name.
qg_model_name = 't5-large'

In [9]:
# BART-base model fine-tuned on SQuAD v1 for Question generation 
from transformers import AutoTokenizer, BartForConditionalGeneration

class BartBaseQG():
  """Question generator wrapping the "kaejo98/bart-base_question_generation" checkpoint."""

  def __init__(self):
    """Load the tokenizer and the BART model for the checkpoint."""
    checkpoint = "kaejo98/bart-base_question_generation"
    self.model_name = checkpoint
    self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    self.model = BartForConditionalGeneration.from_pretrained(checkpoint)

  def generate_question(self, answer, context):
    """Return the question generated for `answer` within the paragraph `context`."""
    prompt = "<generate_context_questions> <section> %s </section>  paragraph: %s" % (answer, context)
    encoded = self.tokenizer([prompt], return_tensors='pt')

    generated = self.model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=512,
    )

    # Only the first sequence of the batch is decoded.
    first_sequence = generated[0]
    return self.tokenizer.decode(first_sequence, skip_special_tokens=True)

# Make BART-base the active question-generation model (rebinds `qg_model`).
qg_model = BartBaseQG()
# Label used as the prefix of the results file name.
qg_model_name = 'bart-base'

In [26]:
# GPT2-base model fine-tuned for Question generation 
from transformers import GPT2Tokenizer, GPT2LMHeadModel

class GPT2SmallQG():
  """Question generator wrapping "danyaljj/gpt2_question_generation_given_paragraph_answer".

  The tokenizer is loaded from the stock "gpt2" vocabulary while the weights
  come from the fine-tuned checkpoint.
  """

  def __init__(self):
    """Load the GPT-2 tokenizer and the fine-tuned language model."""
    self.model_name = "danyaljj/gpt2_question_generation_given_paragraph_answer"
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self.model = GPT2LMHeadModel.from_pretrained(self.model_name)

  def generate_question(self, answer, context):
    """Generate one question for `answer` given the passage `context`.

    Args:
      answer: answer text the question should target.
      context: passage containing the answer.

    Returns:
      The text produced after the "<context> A: <answer> Q:" prompt, with
      surrounding whitespace stripped.
    """
    tokenizer = self.tokenizer
    model = self.model
    input_text = "%s A: %s Q:" % (context, answer)
    features = tokenizer([input_text], return_tensors='pt')

    outputs = model.generate(input_ids=features['input_ids'],
                attention_mask=features['attention_mask'], max_length=512)
    # The generated sequence starts with the prompt tokens. Drop them by
    # position instead of searching the decoded text for "Q: ", which picks the
    # wrong spot whenever the passage itself contains "Q: " (e.g. "FAQ: ...").
    prompt_length = features['input_ids'].shape[1]
    generated_question = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated_question.strip()

# Make GPT-2 the active question-generation model (rebinds `qg_model`).
qg_model = GPT2SmallQG()
# Label used as the prefix of the results file name.
qg_model_name = 'gpt2-small'

In [31]:
# Distilbart-qgen-6-6 
from transformers import AutoTokenizer, BartForConditionalGeneration

class DistilBartQG():
  """Question generator wrapping the "gpssohi/distilbart-qgen-6-6" checkpoint."""

  def __init__(self):
    """Load the tokenizer and the BART model for the checkpoint."""
    checkpoint = "gpssohi/distilbart-qgen-6-6"
    self.model_name = checkpoint
    self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    self.model = BartForConditionalGeneration.from_pretrained(checkpoint)

  def generate_question(self, answer, context):
    """Return the question generated for `answer` followed by `context`."""
    # Prompt layout: answer, a literal "<s>" separator, then the passage.
    prompt = "%s <s> %s" % (answer, context)
    encoded = self.tokenizer([prompt], return_tensors='pt')

    sequences = self.model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=512,
    )
    return self.tokenizer.decode(sequences[0], skip_special_tokens=True)

# Make DistilBART the active question-generation model (rebinds `qg_model`).
qg_model = DistilBartQG()
# Label used as the prefix of the results file name.
qg_model_name = 'distilbart'

In [3]:
# T5-base model fine-tuned on SQuAD v1 for question answering

from transformers import AutoModelWithLMHead, AutoTokenizer

class T5BaseQA():
  """Question-answering model wrapping the "valhalla/t5-base-squad" checkpoint."""

  def __init__(self):
    """Load the tokenizer and the sequence-to-sequence model."""
    # AutoModelWithLMHead is deprecated in Transformers; the seq2seq auto class
    # is its replacement for encoder-decoder checkpoints such as T5. Imported
    # locally so this cell does not depend on another cell's imports.
    from transformers import AutoModelForSeq2SeqLM

    self.model_name = "valhalla/t5-base-squad"
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

  def answer(self, question, context):
    """Answer `question` using the passage `context`.

    Args:
      question: question text.
      context: passage expected to contain the answer.

    Returns:
      The decoded answer string with special tokens removed.
    """
    tokenizer = self.tokenizer
    model = self.model
    input_text = "question: %s context: %s </s>" % (question, context)
    features = tokenizer([input_text], return_tensors='pt')
    out = model.generate(input_ids=features['input_ids'],
                attention_mask=features['attention_mask'])
    generated_answer = tokenizer.decode(out[0], skip_special_tokens=True)
    return generated_answer

# Shared question-answering model; the evaluation function calls qa_model.answer(...)
# on each generated question.
qa_model = T5BaseQA()

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [29]:
from sentence_transformers import SentenceTransformer, util
from nltk import ngrams
import re
import string

# Module-level sentence-embedding model, loaded once and reused by
# compute_semantic_similarity for every comparison.
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

# semantic similarity helper
def compute_semantic_similarity(sentence1, sentence2, threshold = 0.9):
  """Return the cosine similarity between the embeddings of two sentences.

  Both sentences are encoded with the module-level `sentence_transformer`.
  `threshold` is accepted for interface compatibility but is not used.
  """
  first_embedding, second_embedding = (
      sentence_transformer.encode(text, convert_to_tensor=True)
      for text in (sentence1, sentence2)
  )
  similarity_matrix = util.cos_sim(first_embedding, second_embedding)
  # Only the first entry of the similarity matrix is read.
  return float(similarity_matrix[0][0])

def ngram(document, max_n_gram = 3):
  """Return a list of n-gram sets for `document`, one set per order 1..max_n_gram.

  The document is tokenized by whitespace; entry `k` of the result holds the
  distinct (k+1)-grams as tuples.
  """
  tokens = document.split()
  return [set(ngrams(tokens, order)) for order in range(1, max_n_gram + 1)]

# PINC score helper
# https://github.com/kstats/MultiQuestionGeneration/blob/main/metrics/metric_diversity.py#L7
def compute_pinc_score(sentence1, sentence2, max_n_gram = 3):
  """Compute the PINC score of `sentence2` relative to `sentence1`.

  For each n-gram order, the share of `sentence2`'s distinct n-grams that do
  NOT appear in `sentence1` is computed; the score is the mean of those
  shares over all orders used. Returns 0 when either sentence has no words.
  """
  # The n-gram order cannot exceed the word count of either sentence.
  highest_order = min(len(sentence1.split()), len(sentence2.split()), max_n_gram)

  # A blank sentence has no n-grams at all, so the score is defined as 0.
  if highest_order == 0:
    return 0

  reference_sets = ngram(sentence1, highest_order)
  candidate_sets = ngram(sentence2, highest_order)

  score = 0
  for reference_grams, candidate_grams in zip(reference_sets, candidate_sets):
    # Count the candidate n-grams of this order also present in the reference.
    shared = len(candidate_grams & reference_grams)
    score += 1 - shared / len(candidate_grams)
  score *= 1 / highest_order
  return score

# normalizing text helper
# Reference: https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ (SQuAD v1)
def normalize_sentence(sentence):
  """Lowercase `sentence`, then strip punctuation, articles and extra whitespace."""
  lowered = sentence.lower()
  # Delete every ASCII punctuation character.
  without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
  # Replace stand-alone articles with a space; the gaps are collapsed below.
  without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation, flags=re.UNICODE)
  return ' '.join(without_articles.split())

In [32]:
# parallel processing helper
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import numpy as np

# Size of the worker pool: one process per CPU reported by cpu_count().
total_cpus = cpu_count()
# Row count, passed to tqdm as the progress-bar total.
dataset_length = len(sampled_question_answer_context_df)
# One dict per row; this is the iterable handed to pool.imap.
sampled_question_answer_context_list = sampled_question_answer_context_df.to_dict('records')

def compute_parallely(fn):
  """Apply `fn` to every sampled record in a process pool and save the results.

  Args:
    fn: callable taking one record dict and returning one result record.

  Side effects:
    Writes './<qg_model_name>-model_generated_result.json' (orient="records").
  """
  base_columns = ["question", "answer", "source", "generated_question", "generated_answer"]
  with Pool(total_cpus) as pool:
    results = list(tqdm(pool.imap(fn, sampled_question_answer_context_list), total=dataset_length))

  # Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
  # appending row by row copies the whole frame on every iteration.
  model_generated_result_df = pd.DataFrame(results)
  # Keep the base columns first and any extra result fields after them, which
  # is the column order the row-wise append used to produce.
  extra_columns = [column for column in model_generated_result_df.columns if column not in base_columns]
  model_generated_result_df = model_generated_result_df.reindex(columns=base_columns + extra_columns)
  model_generated_result_df.to_json('./' + qg_model_name + '-model_generated_result.json', orient="records")
  
# Evaluate using QA model and using sentence transformer (semantic similarity between generated question and actual question)
def evalute_using_semantic_similarity(data):
  """Generate a question and an answer for one record and score both.

  A question is generated from the record's answer and passage with
  `qg_model`, then answered with `qa_model`. After normalization, generated
  and reference texts are compared with `compute_semantic_similarity`.

  Args:
    data: record with "passage", "question", "answer" and "source" keys.

  Returns:
    A dict holding the reference and generated texts, the source, and the two
    similarity scores.
  """
  context = data["passage"]
  reference_question = data["question"]
  reference_answer = data["answer"]

  predicted_question = qg_model.generate_question(reference_answer, context)
  predicted_answer = qa_model.answer(predicted_question, context)

  record = {
      "generated_question": predicted_question,
      "question": reference_question,
      "answer": reference_answer,
      "generated_answer": predicted_answer,
      "source": data["source"],
  }
  record["question_semantic_similarity"] = compute_semantic_similarity(
      normalize_sentence(predicted_question), normalize_sentence(reference_question))
  record["answer_semantic_similarity"] = compute_semantic_similarity(
      normalize_sentence(predicted_answer), normalize_sentence(reference_answer))
  return record

# Run the evaluation over every sampled record with the currently bound
# qg_model / qa_model; the results file is named after qg_model_name.
compute_parallely(evalute_using_semantic_similarity)

  1%|          | 1/100 [00:08<13:46,  8.35s/it]
