# Example of generating QAs for an ML book (using self-instruct)
Source: https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/information-theory.html

## Load packages

In [1]:
# (Re)load IPython's autoreload extension and select mode 2, so edited modules
# are reloaded automatically before code is executed.
%reload_ext autoreload
%autoreload 2

import sys

# Add the current directory and its two ancestors to the import search path.
# NOTE(review): presumably so the in-repo `uniflow` package resolves without
# being installed — confirm. These entries are relative paths.
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")

In [2]:
import os
import pandas as pd
from uniflow.client import Client
from langchain.document_loaders import UnstructuredHTMLLoader


  from .autonotebook import tqdm as notebook_tqdm


## Prepare the input data

Uncomment one of the HTML files below to use it as the sample input for the self-instruct flow.

In [3]:
# Alternative sample inputs. The active assignment at the bottom comes last and
# therefore wins, so to use an alternative, comment out the active line as well.
# html_file = "do_things_that_dont_scale.html"  # from http://paulgraham.com/ds.html
# html_file = "makers_schedule_managers_schedule.html"  # from http://www.paulgraham.com/makersschedule.html
# html_file = "life_is_short.html"  # from http://www.paulgraham.com/vb.html
# Active input: the information-theory page cited as the source at the top of this notebook.
html_file = "22.11_information-theory.html"

Get the current working directory and build the full path to the input file.

In [4]:
# Resolve the input file relative to the current working directory.
dir_cur = os.getcwd()
# Pass each path component separately so os.path.join inserts the platform's
# separator, instead of relying on "/" hard-coded inside an f-string.
input_file = os.path.join(dir_cur, "data", "raw_input", html_file)

In [5]:
# Load the HTML file through UnstructuredHTMLLoader and split it into pieces.
# A later cell indexes `pages` positionally and reads `.page_content` from the entry.
# NOTE(review): no splitter is passed, so the loader's defaults decide how the
# document is split — confirm the resulting piece boundaries.
loader = UnstructuredHTMLLoader(input_file)
pages = loader.load_and_split()

## Prepare input dataset

In [6]:
# Index into `pages` of the entry whose text is turned into prompts.
# NOTE(review): the value 2 is carried over unchanged; confirm it still points
# at the intended part of the document if the input file or splitter changes.
PAGE_INDEX = 2

# A paragraph must be strictly longer than this many characters to be used
# as a context; shorter paragraphs are skipped.
MIN_CONTEXT_CHARS = 200

# Instruction text sent with every prompt.
INSTRUCTION = "Generate one question and its corresponding answer based on the context. Following the format of the examples below to include context, question, and answer in the response."


def build_prompt(context):
    """Return one self-instruct prompt dict for ``context``.

    The prompt contains the shared instruction, one fully worked example, and
    a final example holding only ``context`` with an empty question and
    answer. A fresh dict (and fresh example dicts) is created on every call,
    so prompts never share mutable state.
    """
    return {
        "instruction": INSTRUCTION,
        "examples": [
            {
                "context": "The quick brown fox jumps over the lazy dog.",
                "question": "What is the color of the fox?",
                "answer": "brown.",
            },
            {
                "context": context,
                "question": "",
                "answer": "",
            },
        ],
    }


# One prompt per sufficiently long paragraph (paragraphs are separated by a
# blank line in the page text).
data = [
    build_prompt(paragraph)
    for paragraph in pages[PAGE_INDEX].page_content.split("\n\n")
    if len(paragraph) > MIN_CONTEXT_CHARS
]


In [7]:
# Keep only the last three prompts. This rebinds `data`, so the full prompt
# list is no longer reachable under that name after this cell runs.
data = data[-3:]
# Bare expression so the notebook displays the selected prompts.
data


[{'instruction': 'Generate one question and its corresponding answer based on the context. Following the format of the examples below to include context, question, and answer in the response.',
  'examples': [{'context': 'The quick brown fox jumps over the lazy dog.',
    'question': 'What is the color of the fox?',
    'answer': 'brown.'},
   {'context': 'Any notion of information we develop must conform to this intuition.\nIndeed, in the next sections we will learn how to compute that these\nevents have \\(0\\textrm{ bits}\\), \\(2\\textrm{ bits}\\),\n\\(~5.7\\textrm{ bits}\\), and \\(~225.6\\textrm{ bits}\\) of\ninformation respectively.',
    'question': '',
    'answer': ''}]},
 {'instruction': 'Generate one question and its corresponding answer based on the context. Following the format of the examples below to include context, question, and answer in the response.',
  'examples': [{'context': 'The quick brown fox jumps over the lazy dog.',
    'question': 'What is the color of t

## Run ModelFlow

In [8]:
# Create the uniflow client, passing "ModelFlow" as its constructor argument.
# NOTE(review): presumably this names the flow to run — confirm against the uniflow docs.
client = Client("ModelFlow")

In [9]:
# Run the client over the prompts. The displayed result is a list with one dict
# per prompt, each carrying 'output' and 'root' keys.
output = client.run(data)

100%|██████████| 3/3 [00:04<00:00,  1.61s/it]


In [10]:
# Bare expression so the notebook displays the raw results.
output

[{'output': [{'response': [{'context': 'Any notion of information we develop must conform to this intuition. Indeed, in the next sections we will learn how to compute that these events have (0 bits), (2 bits), (~5.7 bits), and (~225.6 bits) of information respectively.',
      'question': 'How much information do these events have?',
      'answer': '(0 bits), (2 bits), (~5.7 bits), and (~225.6 bits) respectively.'}]}],
  'root': <uniflow.node.node.Node at 0x2a8bea860>},
 {'output': [{'response': [{'context': 'If we read through these thought experiments, we see a natural idea. As\na starting point, rather than caring about the knowledge, we may build\noff the idea that information represents the degree of surprise or the\nabstract possibility of the event. For example, if we want to describe\nan unusual event, we need a lot information. For a common event, we may\nnot need much information.',
      'question': 'What does the amount of information represent in describing an event?',
  

## Format result into pandas table

In [11]:
# Flatten the nested results: every entry of every 'response' list, in order.
qa_records = [
    qa
    for result in output
    for generation in result['output']
    for qa in generation['response']
]

# One list per DataFrame column, all aligned by position.
contexts = [qa['context'] for qa in qa_records]
questions = [qa['question'] for qa in qa_records]
answers = [qa['answer'] for qa in qa_records]

# Tabulate the generated question/answer pairs alongside their contexts.
df = pd.DataFrame({
    'context': contexts,
    'question': questions,
    'answer': answers
})

df.head()

Unnamed: 0,context,question,answer
0,Any notion of information we develop must conf...,How much information do these events have?,"(0 bits), (2 bits), (~5.7 bits), and (~225.6 b..."
1,"If we read through these thought experiments, ...",What does the amount of information represent ...,The amount of information represents the degre...
2,"In 1948, Claude E. Shannon published A Mathem...",Who published A Mathematical Theory of Communi...,Claude E. Shannon.
