# Example of generating QAs from a PDF document (using self-instruct)
Source of the few-shot example: https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/information-theory.html. The input document is the PDF named in `pdf_file` below.

### Load packages

In [1]:
%reload_ext autoreload
%autoreload 2

import sys

# Add the working directory and its two parent directories to the module
# search path (presumably so the `uniflow` package imported below can be found
# without installing it -- confirm against the repository layout).
# Only entries that are not already present are added, so re-running this cell
# does not keep growing sys.path with duplicates.
sys.path.extend(
    [entry for entry in (".", "..", "../..") if entry not in sys.path]
)

In [2]:
import os
import pandas as pd
from uniflow.client import Client
from uniflow.config import Config
from uniflow.model.config import OpenAIModelConfig
from langchain.document_loaders import PyPDFLoader


  from .autonotebook import tqdm as notebook_tqdm


### Prepare the input data

Set `pdf_file` below to the name of the PDF file to use as the sample input for building the self-instruct flow.

In [3]:
# File name (not a full path) of the PDF to process; it is joined onto the
# raw-input directory when `input_file` is built.
pdf_file = "makers_schedule_managers_schedule.pdf"

Set current directory and input data directory.

In [4]:
# Resolve the input PDF relative to the current working directory.
# Each path component is passed to os.path.join separately so the platform's
# own separator is used instead of "/" characters baked into a string.
dir_cur = os.getcwd()
input_file = os.path.join(dir_cur, "data", "raw_input", pdf_file)

In [5]:
# Open the PDF at `input_file` and split it into document chunks; the text of
# each chunk is read later through its `.page_content` attribute.
loader = PyPDFLoader(file_path=input_file)
pages = loader.load_and_split()

In [6]:
def _few_shot_prompt(context):
    """Return one prompt dict: the instruction, a solved example, and an unsolved slot for `context`."""
    solved_example = {
        "context": """In 1948, Claude E. Shannon published A Mathematical Theory of\nCommunication (Shannon, 1948) establishing the theory of\ninformation. In his article, Shannon introduced the concept of\ninformation entropy for the first time. We will begin our journey here.""",
        "question": """Who published A Mathematical Theory of Communication in 1948?""",
        "answer": """Claude E. Shannon.""",
    }
    # Question and answer are deliberately empty here (presumably for the
    # model to fill in -- confirm against the flow's prompt handling).
    unsolved_example = {
        "context": context,
        "question": "",
        "answer": "",
    }
    return {
        "instruction": """Generate one question and its corresponding answer based on the context. Following the format of the examples below to include the same context, question, and answer in the response.""",
        "examples": [solved_example, unsolved_example],
    }


# Only the first page is used. Its text is split on blank lines and pieces of
# 200 characters or fewer are dropped before a prompt is built for each one.
data = [
    _few_shot_prompt(chunk)
    for chunk in pages[0].page_content.split("\n\n")
    if len(chunk) > 200
]


In [7]:
# Display the prompts that were built, to check them before they are run through the client.
data

[{'instruction': 'Generate one question and its corresponding answer based on the context. Following the format of the examples below to include the same context, question, and answer in the response.',
  'examples': [{'context': 'In 1948, Claude E. Shannon published A Mathematical Theory of\nCommunication (Shannon, 1948) establishing the theory of\ninformation. In his article, Shannon introduced the concept of\ninformation entropy for the first time. We will begin our journey here.',
    'question': 'Who published A Mathematical Theory of Communication in 1948?',
    'answer': 'Claude E. Shannon.'},
   {'context': '11/6/23, 11:40 AM Maker\'s Schedule, Manager\'s Schedule\nhttps://www.paulgraham.com/makersschedule.html 1/3\n"...the mere consciousness of an engagement will sometimes\nworry a whole da y."\n– Charles Dick ens\nJuly 2009\nOne reason progr ammers dislik e meetings so much is that they\'re\non a different t ype of schedule from other people. Meetings cost\nthem more.\nThere 

In [8]:
# The model config is constructed with no arguments, so whatever defaults
# OpenAIModelConfig defines are what the client will use.
openai_model_config = OpenAIModelConfig()
config = Config(model_config=openai_model_config)
client = Client(config)

In [9]:
# Run every prompt in `data` through the client and keep the result for inspection.
output = client.run(data)

100%|██████████| 1/1 [00:13<00:00, 13.16s/it]


In [10]:
# Display the raw result returned by the client run.
output


[{'output': [{'response': [{'context': '11/6/23, 11:40 AM Maker\'s Schedule, Manager\'s Schedule\nhttps://www.paulgraham.com/makersschedule.html 1/3\n"...the mere consciousness of an engagement will sometimes\nworry a whole day."\n– Charles Dickens\nJuly 2009\nOne reason programmers dislike meetings so much is that they\'re\non a different type of schedule from other people. Meetings cost\nthem more.\nThere are two types of schedule, which I\'ll call the manager\'s\nschedule and the maker\'s schedule. The manager\'s schedule is for\nbosses. It\'s embodied in the traditional appointment book, with\neach day cut into one hour intervals. You can block off several\nhours for a single task if you need to, but by default you change\nwhat you\'re doing every hour.\nWhen you use time that way, it\'s merely a practical problem to\nmeet with someone. Find an open slot in your schedule, book\nthem, and you\'re done.\nMost powerful people are on the manager\'s schedule. It\'s the\nschedule of comm

In [12]:
# Flatten the nested client output (item -> 'output' -> 'response') into a
# single list of response dicts, then pull each field into its own list.
responses = [
    response
    for item in output
    for result in item['output']
    for response in result['response']
]

contexts = [response['context'] for response in responses]
questions = [response['question'] for response in responses]
answers = [response['answer'] for response in responses]

# One row per generated response, with columns in context/question/answer order.
df = pd.DataFrame({
    'context': contexts,
    'question': questions,
    'answer': answers
})

df.head()

Unnamed: 0,context,question,answer
0,"11/6/23, 11:40 AM Maker's Schedule, Manager's ...","According to the article, what are the two typ...",The manager's schedule and the maker's schedule.
