# Example of generating QAs from an ML book (using LMQG)

### Import packages

In [1]:
import os
import pandas as pd
import sys
sys.path.append(os.path.join(os.getcwd(), os.pardir, os.pardir))
from uniflow.client import Client
from uniflow.flow.constants import (OUTPUT_NAME, QAPAIR_DF_KEY, INPUT_FILE, ERROR_LIST, OUTPUT_FILE)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
  from .autonotebook import tqdm as notebook_tqdm


We will need the spaCy package `en_core_web_sm`, a small English pipeline trained on written web text (blogs, news, comments) that includes vocabulary, syntax, and entities. If you haven't installed this package, uncomment and run the line below:

In [2]:
# Uncomment the line below to download the model (the leading `!` runs it as a shell command).
# !python -m spacy download en_core_web_sm


### Load Data

In [3]:
# Resolve the input document relative to the notebook's working directory.
dir_cur = os.getcwd()

# Alternative plain-text input (disabled):
# fname = "umich.txt"
# input_file = os.path.join(dir_cur, "data", "raw_input", fname)

html_file = "22.11_information-theory.html"
# Pass each path component separately so os.path.join chooses the separator,
# instead of hard-coding "/" inside an f-string.
input_file = os.path.join(dir_cur, "data", "raw_input", html_file)

### Synthetic data generation

Note: it will take about 8 minutes to run this cell if you are on a single-GPU (V100) machine.

In [4]:
# Create the client for the "flow_data_gen_text" flow.
client = Client("flow_data_gen_text")

# client.run is given a list of input dicts; here a single entry keyed by INPUT_FILE.
input_list = [{INPUT_FILE: input_file}]
input_dict = input_list[0]

# Run the flow and keep the first result for inspection in the cells below.
output_list = client.run(input_list)
output_dict = output_list[0]

print(f"output_dict keys: {output_dict.keys()}")

INFO [preprocess_file_op]: Starting Preprocess HTML...
INFO [preprocess_file_op]: Preprocess HTML Complete!
INFO [preprocess_text_op]: Preprocessing text content input...
INFO [preprocess_text_op]: Preprocessing text content input...Done!
INFO [lmqg_op]: Initializing LMQGOp...
INFO [lmqg_op]: LMQGOp initialization complete!
INFO [lmqg_op]: Generating question and answer pairs for paragraph 1 of 237
100%|██████████| 1/1 [00:00<00:00, 650.99it/s]
100%|██████████| 1/1 [00:00<00:00, 1826.79it/s]
INFO [lmqg_op]: Generating question and answer pairs for paragraph 2 of 237
100%|██████████| 3/3 [00:00<00:00, 2341.88it/s]
100%|██████████| 3/3 [00:00<00:00, 2286.97it/s]
INFO [lmqg_op]: Generating question and answer pairs for paragraph 3 of 237
100%|██████████| 1/1 [00:00<00:00, 1793.97it/s]
INFO [lmqg_op]: Generating question and answer pairs for paragraph 4 of 237
100%|██████████| 1/1 [00:00<00:00, 1837.99it/s]
100%|██████████| 1/1 [00:00<00:00, 1797.05it/s]
INFO [lmqg_op]: Generating question

output_dict keys: dict_keys(['output', 'root'])


In [5]:
# How many output nodes did the flow return?
output_nodes = output_dict[OUTPUT_NAME]
len(output_nodes)

1

In [6]:
# Keys available on the first output node
first_node = output_dict[OUTPUT_NAME][0]
first_node.keys()

dict_keys(['QApair_df', 'error_list', 'output_file'])

In [10]:
# A max_colwidth of None makes pandas display the full contents of each column
pd.set_option("display.max_colwidth", None)

# Show the first 10 generated question-answer pairs.
qa_pairs = output_dict[OUTPUT_NAME][0][QAPAIR_DF_KEY]
qa_pairs[:10]

Unnamed: 0,Question,Answer
0,What is the term for a dive into information theory?,deep learning
1,What is the name of the appendix?,mathematics for deep learningnavigate_next
2,What is the name of the appendix?,mathematics for deep learningnavigate_next
3,What type of theory is used in deep learning?,information theory
4,What is a quick search?,quick search
5,What is a code?,code
6,What is the name of the source?,show source
7,What is the name of the preview version?,preview version
8,What is the name of the program that runs on a pytorch?,pytorch
9,What is the name of the website that hosts the internet?,mxnet


In [8]:
# Display the output file path reported by the first output node
output_path = output_dict[OUTPUT_NAME][0][OUTPUT_FILE]
output_path

'/home/ubuntu/uniflow/example/qa_generation/data/output/output_qa_text_data.csv'

In [9]:
# Report how many paragraphs failed, then display the error records themselves
error_list = output_dict[OUTPUT_NAME][0][ERROR_LIST]
print(f"Error generating QA for {len(error_list)} paragraphs.")
error_list

Error generating QA for 32 paragraphs.


[{'paragraph': 'search',
  'error': "AnswerNotFoundError('Model cannot find any answer candidates in `search`')"},
 {'paragraph': ' 中文版\n ',
  'error': "AnswerNotFoundError('Model cannot find any answer candidates in ` 中文版\\n `')"},
 {'paragraph': 'preface\ninstallation\nnotation',
  'error': "AnswerNotFoundError('Model cannot find any answer candidates in `preface\\ninstallation\\nnotation`')"},
 {'paragraph': 'preface\ninstallation\nnotation',
  'error': "AnswerNotFoundError('Model cannot find any answer candidates in `preface\\ninstallation\\nnotation`')"},
 {'paragraph': '22.11.1. information¶\nlet’s start with the “soul” of information theory: information.\ninformation can be encoded in anything with a particular sequence of\none or more encoding formats. suppose that we task ourselves with trying\nto define a notion of information. what could be our starting point?\nconsider the following thought experiment. we have a friend with a deck\nof cards. they will shuffle the deck, flip