### 0. Installing Transformers and Importing Dependencies

In [1]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.20.1-py3-none-any.whl (4.4 MB)
Collecting tqdm>=4.27
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Collecting filelock
  Downloading filelock-3.7.1-py3-none-any.whl (10 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp310-cp310-win_amd64.whl (3.3 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Collecting numpy>=1.17
  Downloading numpy-1.23.0-cp310-cp310-win_amd64.whl (14.6 MB)
Collecting regex!=2019.12.17
  Downloading regex-2022.6.2-cp310-cp310-win_amd64.whl (262 kB)
Collecting packaging>=20.0
  Using cached packaging-21.3-py3-none-any.whl (40 kB)
Collecting pyparsing!=3.0.5,>=2.0.2
  Using cached pyparsing-3.0.9-py3-none-any.whl (98 kB)
Installing collected packages: pyparsing, tqdm, packaging, filelock, tokenizers, regex, numpy, huggingface-hub, transformers
Successfully installed filelock-3.7.1 huggingface-hub-0.8.1 numpy-1.23.0

In [2]:
%config Completer.use_jedi = False # Enable Jupyter auto-complete function

### 1. Load Question Generating Pipeline

In [3]:
import argparse
import glob
import os
import json
import time
import logging
import random
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aviparna.biswas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
class QueGenerator():
  def __init__(self):
    self.que_model = T5ForConditionalGeneration.from_pretrained('./t5_que_gen_model/t5_base_que_gen/')
    self.ans_model = T5ForConditionalGeneration.from_pretrained('./t5_ans_gen_model/t5_base_ans_gen/')

    self.que_tokenizer = T5Tokenizer.from_pretrained('./t5_que_gen_model/t5_base_tok_que_gen/')
    self.ans_tokenizer = T5Tokenizer.from_pretrained('./t5_ans_gen_model/t5_base_tok_ans_gen/')
    
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    self.que_model = self.que_model.to(self.device)
    self.ans_model = self.ans_model.to(self.device)
  
  def generate(self, text):
    answers = self._get_answers(text)
    questions = self._get_questions(text, answers)
    output = [{'answer': ans, 'question': que} for ans, que in zip(answers, questions)]
    return output
  
  def _get_answers(self, text):
    # split into sentences
    sents = sent_tokenize(text)

    examples = []
    for i in range(len(sents)):
      input_ = ""
      for j, sent in enumerate(sents):
        if i == j:
            sent = "[HL] %s [HL]" % sent
        input_ = "%s %s" % (input_, sent)
        input_ = input_.strip()
      input_ = input_ + " </s>"
      examples.append(input_)
    
    batch = self.ans_tokenizer.batch_encode_plus(examples, max_length=512, pad_to_max_length=True, return_tensors="pt")
    with torch.no_grad():
      outs = self.ans_model.generate(input_ids=batch['input_ids'].to(self.device), 
                                attention_mask=batch['attention_mask'].to(self.device), 
                                max_length=32,
                                # do_sample=False,
                                # num_beams = 4,
                                )
    dec = [self.ans_tokenizer.decode(ids, skip_special_tokens=False) for ids in outs]
    answers = [item.split('[SEP]') for item in dec]
    answers = chain(*answers)
    answers = [ans.strip() for ans in answers if ans != ' ']
    return answers
  
  def _get_questions(self, text, answers):
    examples = []
    for ans in answers:
      input_text = "%s [SEP] %s </s>" % (ans, text)
      examples.append(input_text)
    
    batch = self.que_tokenizer.batch_encode_plus(examples, max_length=512, pad_to_max_length=True, return_tensors="pt")
    with torch.no_grad():
      outs = self.que_model.generate(input_ids=batch['input_ids'].to(self.device), 
                                attention_mask=batch['attention_mask'].to(self.device), 
                                max_length=32,
                                num_beams = 4)
    dec = [self.que_tokenizer.decode(ids, skip_special_tokens=False) for ids in outs]
    return dec 

### 2. Generate Questions

In [5]:
que_generator = QueGenerator()

In [18]:
text = "For other organizations, the attendance feature is working fine. This seems to be a permission issue from the admin portal.Please check from the admin portal whether the attendance features have been given to the users or not.Please feel free to call in case of any questions."

In [19]:
que_generator.generate(text)



[{'answer': '<pad> other organizations',
  'question': '<pad> For what type of organizations is the attendance feature working fine?</s> <pad> <pad> <pad> <pad> <pad>'},
 {'answer': '</s>',
  'question': '<pad> What is the permission issue with the attendance feature?</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'},
 {'answer': '<pad> permission issue',
  'question': '<pad> What does the admin portal think is the reason for the failure of the attendance feature?</s>'},
 {'answer': '</s>',
  'question': '<pad> What is the permission issue with the attendance feature?</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'}]

In [20]:
text2 = "The PCAOB staff provides guidance to help firms when implementing CAM requirements. Staff may update this guidance as needed."

In [21]:
que_generator.generate(text2)



[{'answer': '<pad> PCAOB',
  'question': '<pad> Who provides guidance to firms when implementing CAM requirements?</s> <pad> <pad> <pad> <pad>'},
 {'answer': '</s>',
  'question': '<pad> What does the PCAOB provide to firms when implementing CAM requirements?</s>'},
 {'answer': '<pad> Staff',
  'question': '<pad> Who provides guidance to firms when implementing CAM requirements?</s> <pad> <pad> <pad> <pad>'},
 {'answer': '</s> <pad> <pad>',
  'question': '<pad> What type of guidance does the PCAOB provide?</s> <pad> <pad> <pad> <pad> <pad>'}]

In [22]:
text3 = "NexAEI is a mobile-based attendance capturing system that helps you manage the decentralized attendance of your hybrid workforce. Transform the payroll experience in your organization by leveraging our advanced attendance capturing modes and easily integrate the data with your existing ERP system."

In [23]:
que_generator.generate(text3)



[{'answer': '<pad> NexAEI',
  'question': '<pad> What is the name of the mobile-based attendance capturing system?</s>'},
 {'answer': '</s>',
  'question': '<pad> What is the purpose of NexAEI?</s> <pad> <pad> <pad> <pad> <pad>'},
 {'answer': '<pad> ERP system',
  'question': '<pad> What does NexAEI integrate with?</s> <pad> <pad> <pad> <pad> <pad> <pad>'},
 {'answer': '</s> <pad>',
  'question': "<pad> What is NexAEI's advanced attendance capturing modes?</s> <pad>"}]