# Example of generating QAs for an ML book (using self-instruct)
Source: https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/information-theory.html

## Load packages

In [1]:
# (Re)load IPython's autoreload extension and select mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import sys

# Extend the module search path with the current directory and its two
# ancestors, in that order.
# NOTE(review): presumably this lets the notebook import a local `uniflow`
# checkout when run from inside the repository — confirm.
sys.path.extend([".", "..", "../.."])

In [2]:
import os
import pandas as pd
from uniflow.client import Client
from langchain.document_loaders import UnstructuredHTMLLoader


  from .autonotebook import tqdm as notebook_tqdm


## Prepare the input data

Uncomment one of the HTML files below to use it as the sample input for the self-instruct flow.

In [3]:
# Candidate sample files; keep exactly one `html_file` assignment active.
# html_file = "do_things_that_dont_scale.html"  # from http://paulgraham.com/ds.html
# html_file = "makers_schedule_managers_schedule.html"  # from http://www.paulgraham.com/makersschedule.html
# html_file = "life_is_short.html"  # from http://www.paulgraham.com/vb.html
html_file = "22.11_information-theory.html"  # from https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/information-theory.html

Get the current working directory and build the full path to the input file under `data/raw_input/`.

In [4]:
# Resolve the input file relative to the directory the notebook is run from.
dir_cur = os.getcwd()
# Pass each path component separately so os.path.join inserts the platform's
# separator, instead of relying on hard-coded "/" characters in an f-string.
input_file = os.path.join(dir_cur, "data", "raw_input", html_file)

In [5]:
# Load the HTML file through LangChain's UnstructuredHTMLLoader and split it
# into chunks; later cells read each chunk's text via `.page_content`.
# NOTE(review): load_and_split() is called without arguments, so the splitting
# strategy is whatever the library defaults to — confirm it suits the input.
loader = UnstructuredHTMLLoader(input_file)
pages = loader.load_and_split()

## Prepare input dataset

In [12]:
# Which chunk of `pages` to draw paragraphs from, and the minimum paragraph
# length (in characters) for a paragraph to be kept as a context.
PAGE_INDEX = 2
MIN_CONTEXT_CHARS = 200

# Split the chosen chunk on blank lines and keep only paragraphs longer than
# MIN_CONTEXT_CHARS, wrapping each one as {"context": <paragraph>}.
# NOTE(review): assumes `pages` has more than PAGE_INDEX chunks — confirm
# when switching to a different input file.
data = [
    {"context": paragraph}
    for paragraph in pages[PAGE_INDEX].page_content.split("\n\n")
    if len(paragraph) > MIN_CONTEXT_CHARS
]


In [13]:
# Keep only the last three contexts. This rebinds `data`, so the full list
# built in the previous cell is no longer available once this cell has run.
data = data[-3:]
# Bare expression: show the selected contexts as the cell output.
data


[{'context': 'Any notion of information we develop must conform to this intuition.\nIndeed, in the next sections we will learn how to compute that these\nevents have \\(0\\textrm{ bits}\\), \\(2\\textrm{ bits}\\),\n\\(~5.7\\textrm{ bits}\\), and \\(~225.6\\textrm{ bits}\\) of\ninformation respectively.'},
 {'context': 'If we read through these thought experiments, we see a natural idea. As\na starting point, rather than caring about the knowledge, we may build\noff the idea that information represents the degree of surprise or the\nabstract possibility of the event. For example, if we want to describe\nan unusual event, we need a lot information. For a common event, we may\nnot need much information.'},
 {'context': 'In 1948, Claude E. Shannon published A Mathematical Theory of\nCommunication (Shannon, 1948) establishing the theory of\ninformation. In his article, Shannon introduced the concept of\ninformation entropy for the first time. We will begin our journey here.'}]

## Run ModelFlow

In [14]:
# Create the uniflow client for "FewShotModelFlow".
# NOTE(review): the string is presumably a flow name that Client resolves
# internally — confirm against uniflow.client.Client.
client = Client("FewShotModelFlow")

In [15]:
# Run the flow over the prepared contexts. Judging by the result displayed
# below, each returned item carries an 'output' list whose entries hold a
# 'response' list of {'context', 'question', 'answer'} dicts.
output = client.run(data)

100%|██████████| 3/3 [00:03<00:00,  1.09s/it]


In [16]:
# Bare expression: display the raw flow result to inspect its nested structure.
output

[{'output': [{'response': [{'context': 'Any notion of information we develop must conform to this intuition.\nIndeed, in the next sections we will learn how to compute that these\nevents have \\(0\\textrm{ bits}\\), \\(2\\textrm{ bits}\\),\n\\(~5.7\\textrm{ bits}\\), and \\(~225.6\\textrm{ bits}\\) of\ninformation respectively.',
      'question': 'how much information do the events have in terms of bits?',
      'answer': 'the events have 0 bits, 2 bits, 5.7 bits, and 225.6 bits of information respectively.'},
     {'context': 'Any notion of information we develop must conform to this intuition.\nIndeed, in the next sections we will learn how to compute that these\nevents have \\(0\\textrm{ bits}\\), \\(2\\textrm{ bits}\\),\n\\(~5.7\\textrm{ bits}\\), and \\(~225.6\\textrm{ bits}\\) of\ninformation respectively.',
      'question': 'the surrounding',
      'answer': 'indicates that one event has approximately 5.7 bits of information.'},
     {'context': 'Any notion of information we d

## Format result into pandas table

In [17]:
# Flatten the nested flow result (item -> 'output' -> 'response') into a
# single list of QA dicts, preserving the original traversal order.
qa_records = [
    record
    for flow_item in output
    for result in flow_item["output"]
    for record in result["response"]
]

# Build the table column by column so the column order is fixed as
# context, question, answer.
qa_columns = ("context", "question", "answer")
df = pd.DataFrame(
    {column: [record[column] for record in qa_records] for column in qa_columns}
)

df

Unnamed: 0,context,question,answer
0,Any notion of information we develop must conf...,how much information do the events have in ter...,"the events have 0 bits, 2 bits, 5.7 bits, and ..."
1,Any notion of information we develop must conf...,the surrounding,indicates that one event has approximately 5.7...
2,Any notion of information we develop must conf...,what is the amount of information for the even...,"the events have \(0\textrm{ bits}\), \(2\textr..."
3,"If we read through these thought experiments, ...",what represents the degree of surprise or the ...,information.
4,"If we read through these thought experiments, ...",what concept do thought experiments propose in...,the concept proposes that information represen...
5,"If we read through these thought experiments, ...",what is the relationship between information a...,information represents the degree of surprise ...
6,"In 1948, Claude E. Shannon published A Mathema...",who introduced the concept of information entr...,claude e. shannon.
7,"In 1948, Claude E. Shannon published A Mathema...",who introduced the concept of information entr...,claude e. shannon.
8,"In 1948, Claude E. Shannon published A Mathema...",who published a mathematical theory of communi...,claude e. shannon.
