# Example of generating QAs for an ML book (using self-instruct)
Source: https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/information-theory.html

## Load packages

In [1]:
%reload_ext autoreload
%autoreload 2

import sys

# Make the current directory and its two parent directories importable
# (appended in this order, after any existing entries).
sys.path.extend([".", "..", "../.."])

In [2]:
import os
import pandas as pd
from uniflow.client import Client
from langchain.document_loaders import UnstructuredHTMLLoader


  from .autonotebook import tqdm as notebook_tqdm


## Prepare the input data

Uncomment one of the HTML files below to use it as the sample input for the self-instruct flow (keep exactly one `html_file` assignment active).

In [3]:
# Candidate sample inputs. Exactly one `html_file` assignment should be active;
# the name is joined onto the data/raw_input/ directory in a later cell.
# html_file = "do_things_that_dont_scale.html"  # from http://paulgraham.com/ds.html
# html_file = "makers_schedule_managers_schedule.html"  # from http://www.paulgraham.com/makersschedule.html
# html_file = "life_is_short.html"  # from http://www.paulgraham.com/vb.html
html_file = "22.11_information-theory.html"  # the d2l.ai information-theory page linked at the top

Get the current working directory and build the full path to the input file under `data/raw_input/`.

In [4]:
# Resolve the sample file relative to the current working directory, i.e.
# <cwd>/data/raw_input/<html_file>. Each path component is passed to
# os.path.join separately so the platform's separator is used throughout,
# instead of hard-coding "/" inside an f-string.
dir_cur = os.getcwd()
input_file = os.path.join(dir_cur, "data", "raw_input", html_file)

In [5]:
# Load the HTML file and split it into a sequence of documents. A later cell
# indexes `pages` and reads each item's `page_content` text.
loader = UnstructuredHTMLLoader(input_file)
pages = loader.load_and_split()

## Prepare input dataset

In [7]:
# Build one input record per paragraph of the third split (`pages[2]`).
# Paragraphs are separated by a blank line, and only those longer than
# 200 characters are kept. Each record holds a single example that carries
# just the paragraph text as its "context".
# NOTE(review): the index 2 and the 200-character threshold are hard-coded;
# presumably tuned for this particular HTML file -- confirm before reusing.
# NOTE(review): the saved output of `data` also shows 'instruction',
# 'question' and 'answer' keys that this cell does not create -- confirm
# where they are added (a cell may be missing from the notebook).
data = [
    {"examples": [{"context": paragraph}]}
    for paragraph in pages[2].page_content.split("\n\n")
    if len(paragraph) > 200
]


In [8]:
# Keep only the last three records (presumably to keep the demo run small),
# then display them. Note that this rebinds `data`, so the full list built in
# the previous cell is no longer available afterwards.
data = data[-3:]
data


[{'instruction': 'Generate one question and its corresponding answer based on the context. Following the format of the examples below to include the same context, question, and answer in the response.',
  'examples': [{'context': 'In 1948, Claude E. Shannon published A Mathematical Theory of\nCommunication (Shannon, 1948) establishing the theory of\ninformation. In his article, Shannon introduced the concept of\ninformation entropy for the first time. We will begin our journey here.',
    'question': 'Who published A Mathematical Theory of Communication in 1948?',
    'answer': 'Claude E. Shannon.'},
   {'context': 'Any notion of information we develop must conform to this intuition.\nIndeed, in the next sections we will learn how to compute that these\nevents have \\(0\\textrm{ bits}\\), \\(2\\textrm{ bits}\\),\n\\(~5.7\\textrm{ bits}\\), and \\(~225.6\\textrm{ bits}\\) of\ninformation respectively.',
    'question': '',
    'answer': ''}]},
 {'instruction': 'Generate one question and

## Run ModelFlow

In [9]:
# Create a uniflow client, passing "ModelFlow" as the flow to use.
client = Client("ModelFlow")

In [10]:
# Run the flow over the prepared records. The formatting cell further down
# reads the result as a list of items, each with an 'output' key.
output = client.run(data)

100%|██████████| 3/3 [00:05<00:00,  1.92s/it]


In [11]:
# Display the raw result returned by client.run for inspection.
output

[{'output': [{'response': [{'context': 'Any notion of information we develop must conform to this intuition.\nIndeed, in the next sections we will learn how to compute that these\nevents have \\(0\\textrm{ bits}\\), \\(2\\textrm{ bits}\\),\n\\(~5.7\\textrm{ bits}\\), and \\(~225.6\\textrm{ bits}\\) of\ninformation respectively.',
      'question': 'What is the amount of information associated with the events mentioned in the next sections?',
      'answer': 'The events have 0 bits, 2 bits, 5.7 bits, and 225.6 bits of information respectively.'},
     {'context': 'Any notion of information we develop must conform to this intuition. Indeed, in the next sections we will learn how to compute that these events have \\(0\\textrm{ bits}\\), \\(2\\textrm{ bits}\\), \\(~5.7\\textrm{ bits}\\), and \\(~225.6\\textrm{ bits}\\) of information respectively.',
      'question': 'What is the amount of information associated with each of the events in the next sections?',
      'answer': 'The events ha

## Format result into pandas table

In [13]:
# Flatten the nested flow output into one list of QA dicts. The nesting read
# here is: output -> item["output"] -> entry["response"] -> QA dict.
qa_records = [
    qa
    for item in output
    for entry in item["output"]
    for qa in entry["response"]
]

# Pull each field into its own column list, preserving record order.
contexts = [qa["context"] for qa in qa_records]
questions = [qa["question"] for qa in qa_records]
answers = [qa["answer"] for qa in qa_records]

# Assemble the table with columns in the order context, question, answer.
df = pd.DataFrame(
    {
        "context": contexts,
        "question": questions,
        "answer": answers,
    }
)

df

Unnamed: 0,context,question,answer
0,Any notion of information we develop must conf...,What is the amount of information associated w...,"The events have 0 bits, 2 bits, 5.7 bits, and ..."
1,Any notion of information we develop must conf...,What is the amount of information associated w...,"The events have 0 bits, 2 bits, ~5.7 bits, and..."
2,Any notion of information we develop must conf...,How much information does each of the events h...,"The events have 0 bits, 2 bits, 5.7 bits, and ..."
3,"If we read through these thought experiments, ...",What does information represent in the context...,The degree of surprise or the abstract possibi...
4,"If we read through these thought experiments, ...",What does information represent in terms of th...,Information represents the degree of surprise ...
5,"If we read through these thought experiments, ...","According to the text, what does information r...",Information represents the degree of surprise ...
6,"In 1948, Claude E. Shannon published A Mathema...",What concept did Claude E. Shannon introduce f...,Information entropy.
7,"In 1948, Claude E. Shannon published A Mathema...",What concept did Claude E. Shannon introduce f...,Claude E. Shannon introduced the concept of in...
