In [1]:
from prompt_autotune import TunePrompt
from llama_index.llms.openai import OpenAI
import logging

# Configure the root logger for INFO and above so library log records appear in
# the cell output.
logging.basicConfig(level=logging.INFO)

# Plain-language description of the overall job; passed to TunePrompt as `task`.
# NOTE(review): "hirearchical" is misspelled and "section of document" lacks an
# article. The text is sent verbatim, so it is left untouched here -- confirm
# whether it should be corrected.
task = """As an advanced language model, your job is to extract every single entry in the table of contents section of document. You will be given a chunk of the Table of Contents at a time, and you will need to extract the title of the contents, the start page, and the end page of the contents.
You shouldn't miss anything. Get all the entries at all the hirearchical levels."""

# Seed prompt handed to TunePrompt. It ends with "Excerpt:", so the PDF chunk is
# presumably appended directly after it -- TODO confirm against TunePrompt.
prompt = """Given an excerpt from a PDF, if you see contents listed with the page number in a table format, output the title of the contents and the start page and the end page of the contents.
Example:
1.1.5 Heading 1, 5, 5
1.1.6 Heading 2, 6, 6
1.1.7 Heading 3, 7, 8
where the first column is the title of the contents along with the unique identifier, the second column is the start page of the contents, and the third column is the end page of the contents.
Excerpt:"""

# Build the tuner for the table-of-contents extraction prompt.
# NOTE(review): `llm` and `powerllm` are two separate clients for the same model
# ("gpt-3.5-turbo"); confirm whether `powerllm` was meant to be a stronger model.
tune = TunePrompt(
    task=task,
    prompt=prompt,
    llm=OpenAI(model="gpt-3.5-turbo"),
    powerllm=OpenAI(model="gpt-3.5-turbo"),
    verbose=True,
)

INFO:root:Generating 10 examples for task: As an advanced ...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Completed


In [5]:
from prompt_autotune.GenerateExamples import Example

# Hand-written input/expected-output pairs covering different table-of-contents
# layouts. Every expected output uses the "title, start page, end page" shape.
# NOTE(review): the expected outputs drop the leading numbering ("1.1.5", "I.",
# "A.", ...) while the seed prompt describes the first column as the title
# "along with the unique identifier" -- confirm which behaviour is wanted.
examples = [
    # Dotted section numbers; start and end page already comma-separated.
    Example(
        input="1.1.5 Heading 1, 5, 5\n1.1.6 Heading 2, 6, 6\n1.1.7 Heading 3, 7, 8",
        output="Heading 1, 5, 5\nHeading 2, 6, 6\nHeading 3, 7, 8",
    ),
    # Dotted section numbers; pages written as a "start-end" range.
    Example(
        input="1.1.5 Heading 1, 5-5\n1.1.6 Heading 2, 6-6\n1.1.7 Heading 3, 7-8",
        output="Heading 1, 5, 5\nHeading 2, 6, 6\nHeading 3, 7, 8",
    ),
    # "Section N" titles with a page range; the title is kept as-is.
    Example(
        input="Section 1, 5-5\nSection 2, 6-6\nSection 3, 7-8",
        output="Section 1, 5, 5\nSection 2, 6, 6\nSection 3, 7, 8",
    ),
    # Hierarchical outline markers (Roman numeral, letter, digit) with page ranges.
    Example(
        input="I. Heading 1, 5-5\nA. Heading 2, 6-6\n1. Heading 3, 7-8",
        output="Heading 1, 5, 5\nHeading 2, 6, 6\nHeading 3, 7, 8",
    ),
    # No numbering at all: bare headings followed by a page range.
    Example(
        input="Heading 1, 5-5\nHeading 2, 6-6\nHeading 3, 7-8",
        output="Heading 1, 5, 5\nHeading 2, 6, 6\nHeading 3, 7, 8",
    ),
    # NOTE(review): this entry is identical to the second example above (same
    # input and output); kept to preserve the original example count -- confirm
    # whether a different edge case was intended.
    Example(
        input="1.1.5 Heading 1, 5-5\n1.1.6 Heading 2, 6-6\n1.1.7 Heading 3, 7-8",
        output="Heading 1, 5, 5\nHeading 2, 6, 6\nHeading 3, 7, 8",
    ),
]

# Replace the tuner's example set with the hand-written cases.
tune.examples = examples


In [6]:
# Invoke the tuner with the examples assigned above. The recorded output echoes
# each example, then logs "Generating responses" / "Evaluating responses" per
# cycle; each response generation issues one chat-completion request per example.
tune()

Example: None
Input: 1.1.5 Heading 1, 5, 5
1.1.6 Heading 2, 6, 6
1.1.7 Heading 3, 7, 8
Output: Heading 1, 5, 5
Heading 2, 6, 6
Heading 3, 7, 8
Example: None
Input: 1.1.5 Heading 1, 5-5
1.1.6 Heading 2, 6-6
1.1.7 Heading 3, 7-8
Output: Heading 1, 5, 5
Heading 2, 6, 6
Heading 3, 7, 8
Example: None
Input: Section 1, 5-5
Section 2, 6-6
Section 3, 7-8
Output: Section 1, 5, 5
Section 2, 6, 6
Section 3, 7, 8
Example: None
Input: I. Heading 1, 5-5
A. Heading 2, 6-6
1. Heading 3, 7-8
Output: Heading 1, 5, 5
Heading 2, 6, 6
Heading 3, 7, 8
Example: None
Input: Heading 1, 5-5
Heading 2, 6-6
Heading 3, 7-8
Output: Heading 1, 5, 5
Heading 2, 6, 6
Heading 3, 7, 8
Example: None
Input: 1.1.5 Heading 1, 5-5
1.1.6 Heading 2, 6-6
1.1.7 Heading 3, 7-8
Output: Heading 1, 5, 5
Heading 2, 6, 6
Heading 3, 7, 8


Generating responses for cycle 0
INFO:prompt_autotune.TunePrompt:Generating responses for cycle 0
  0%|          | 0/6 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 17%|█▋        | 1/6 [00:01<00:07,  1.47s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 33%|███▎      | 2/6 [00:02<00:04,  1.15s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 50%|█████     | 3/6 [00:04<00:05,  1.80s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 67%|██████▋   | 4/6 [00:06<00:02,  1.50s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 83%|████████▎ | 5/6 [00:07<00:01,  1.43s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
100%|██████████| 6/6 [00:09<00:00,  1.66s/it]
Evaluating responses for cycle 0
INFO:prompt_aut

In [9]:
from IPython.display import Markdown, display
# Show the prompt currently held by the tuner, rendered as Markdown rather than
# as a raw string repr.
display(Markdown(tune.prompt))

Given a chunk of the Table of Contents from a PDF, extract the title of the contents, the start page, and the end page of each entry in a consistent format. Ensure that the page ranges are standardized with hyphens. Differentiate between hierarchical and non-hierarchical entries. Support alphanumeric characters and Roman numerals in entries. Skip or flag missing/incomplete entries for review. Handle nested levels by indenting or numbering sub-level entries for clarity.

In [3]:
from llama_index.llms.openai import OpenAI
from prompt_autotune import TunePrompt

# Second experiment: a yes/no classifier that decides whether a PDF excerpt
# contains a list of contents. This rebinds `prompt`, `task` and `tune`, which
# the extraction experiment earlier in the notebook also uses, so from here on
# `tune` refers to the classifier tuner.

# Plain-language description of the classification job; passed as `task`.
task = """As an advanced language model, your job is to determine if the given excerpt from a PDF contains a list of contents. You will be given a chunk of text from a PDF, and you will need to determine if the text contains a list of contents. If it does, you should reply with 'yes', and if it doesn't, you should reply with 'no'."""

# Seed prompt; it ends with "Excerpt:", so the excerpt is presumably appended
# right after it -- TODO confirm against TunePrompt.
prompt = """Reply with 'yes' if the attached excerpt from a PDF contains list of contents, and 'no' otherwise.
Excerpt:"""

# Build the tuner for the classifier prompt.
# NOTE(review): `llm` and `powerllm` are two separate clients for the same model
# ("gpt-3.5-turbo"); confirm whether `powerllm` was meant to be a stronger model.
tune = TunePrompt(
    task=task,
    prompt=prompt,
    llm=OpenAI(model="gpt-3.5-turbo"),
    powerllm=OpenAI(model="gpt-3.5-turbo"),
    verbose=True,
)

from prompt_autotune.GenerateExamples import Example

In [5]:
# Labelled excerpts for the yes/no "contains a list of contents" classifier.
examples = [
    # Dotted section numbers with page ranges.
    Example(input="1.1.5 Heading 1, 5-5\n1.1.6 Heading 2, 6-6\n1.1.7 Heading 3, 7-8", output="yes"),
    # "Section N" titles with page ranges.
    Example(input="Section 1, 5-5\nSection 2, 6-6\nSection 3, 7-8", output="yes"),
    # Hierarchical outline markers (Roman numeral, letter, digit).
    Example(input="I. Heading 1, 5-5\nA. Heading 2, 6-6\n1. Heading 3, 7-8", output="yes"),
    # NOTE(review): the only "no" label. The same excerpt is treated as a table
    # of contents by the extraction examples earlier in the notebook -- confirm
    # this is an intentional negative example and not a mislabel.
    Example(input="Heading 1, 5-5\nHeading 2, 6-6\nHeading 3, 7-8", output="no"),
    # NOTE(review): identical to the first example in this list; kept to
    # preserve the original example count -- confirm whether another case was
    # intended.
    Example(input="1.1.5 Heading 1, 5-5\n1.1.6 Heading 2, 6-6\n1.1.7 Heading 3, 7-8", output="yes"),
]

# Replace the tuner's example set with the labelled excerpts.
tune.examples = examples

# Run the tuner; the recorded output shows successive generate/evaluate cycles.
tune()

Example: None
Input: 1.1.5 Heading 1, 5-5
1.1.6 Heading 2, 6-6
1.1.7 Heading 3, 7-8
Output: yes
Example: None
Input: Section 1, 5-5
Section 2, 6-6
Section 3, 7-8
Output: yes
Example: None
Input: I. Heading 1, 5-5
A. Heading 2, 6-6
1. Heading 3, 7-8
Output: yes
Example: None
Input: Heading 1, 5-5
Heading 2, 6-6
Heading 3, 7-8
Output: no
Example: None
Input: 1.1.5 Heading 1, 5-5
1.1.6 Heading 2, 6-6
1.1.7 Heading 3, 7-8
Output: yes


Generating responses for cycle 0
INFO:prompt_autotune.TunePrompt:Generating responses for cycle 0
100%|██████████| 5/5 [00:04<00:00,  1.15it/s]
Evaluating responses for cycle 0
INFO:prompt_autotune.TunePrompt:Evaluating responses for cycle 0
Old prompt: Reply with 'yes...
INFO:prompt_autotune.TunePrompt:Old prompt: Reply with 'yes...
New prompt: Reply with 'yes...
INFO:prompt_autotune.TunePrompt:New prompt: Reply with 'yes...
Generating responses for cycle 1
INFO:prompt_autotune.TunePrompt:Generating responses for cycle 1
100%|██████████| 5/5 [00:01<00:00,  2.89it/s]
Evaluating responses for cycle 1
INFO:prompt_autotune.TunePrompt:Evaluating responses for cycle 1
Old prompt: Reply with 'yes...
INFO:prompt_autotune.TunePrompt:Old prompt: Reply with 'yes...
New prompt: Reply with 'yes...
INFO:prompt_autotune.TunePrompt:New prompt: Reply with 'yes...
Generating responses for cycle 2
INFO:prompt_autotune.TunePrompt:Generating responses for cycle 2
100%|██████████| 5/5 [00:03<00:00,  1.36it

In [7]:
from IPython.display import Markdown, display
# Show the classifier prompt currently held by the tuner, rendered as Markdown
# rather than as a raw string repr.
display(Markdown(tune.prompt))

Reply with 'yes' if the provided text exhibits characteristics commonly found in lists of contents, such as variations in numbering or lettering formats like numbers, letters, Roman numerals, etc., or includes keywords like "Table of Contents" or "Contents". Consider hierarchical structures, patterns, and mixed formats of list items to distinguish between regular headings and a list of contents. If unsure, provide feedback on why the text may or may not be a list of contents.