In [1]:
from pdfparser import PDFParser
import asyncio
import json
from ollama import Client
from dotenv import load_dotenv
load_dotenv()
from pathlib import Path
import os
import re

In [2]:
from testiii import metadata

In [3]:
a = PDFParser("test.pdf")

In [6]:
b = asyncio.run(a.parse_job())

Started parsing the file under job_id e1c8429d-7ced-4686-a650-da23aabcf82a
....

In [71]:
for i in b:
    print(i.text)

# Edexcel International GCSE (9-1) Science Double Award

## Student Book

### Authors:
- Brian Arnold
- Phil Bradfield
- Jim Clark
- Penny Johnson
- Steve Owen
- Steve Potter
- Steve Woolley
- Rachel Yu

----

This textbook is designed for students preparing for the Edexcel International GCSE (9-1) in Science Double Award. It provides a comprehensive overview of the key concepts and topics required for the examination, ensuring that students are well-equipped for their studies.

### Key Features:
- **Engaging Content**: The book includes a variety of text blocks, images, and highlighted terms to enhance understanding.
- **Structured Learning**: Each chapter is organized to facilitate easy navigation through the material.
- **Visual Aids**: Diagrams and illustrations are used throughout to support learning and retention of information.

### Study Tips:
- **Review Regularly**: Make a habit of revisiting key concepts to reinforce your understanding.
- **Practice Questions**: Utilize the e

In [73]:
client = Client(host="http://localhost:11434")

In [74]:
classifier_front_content_agent_prompt = '''You are a document classifier for a GCSE-I level {subject} textbook. Your task is to categorize text into exactly one of these two categories:

FRONT_MATTER: Book-level introductory content including:
- Title page, copyright information, author lists
- Table of contents and ALL chapter listings
- Course structure and chapter outlines
- Book introductions, assessment information
- ANY page that ONLY lists topics or chapters, even if they include scientific terms
Key indicator: Page ONLY lists or organizes content, does NOT explain concepts
- "Book usage guides, experiment methodology overviews"
- "General skill development sections"
- "Cross-chapter reference systems (icons, progression scales)"

CHAPTER_CONTENT: Subject-matter teaching material including:
- Pages that EXPLAIN scientific concepts
- Pages with actual scientific definitions and descriptions
- Pages containing practice questions or exercises
- Pages with detailed diagrams and their explanations
Key indicator: Page must TEACH or EXPLAIN science, not just list topics

CRITICAL DISTINCTION:
- If the page explains HOW TO USE THE BOOK → FRONT_MATTER
- If the page explains SCIENCE CONCEPTS → CHAPTER_CONTENT

Remember: A table of contents or chapter outline is ALWAYS FRONT_MATTER, even if it contains scientific terms in the titles.

RESPOND WITH EXACTLY ONE WORD: FRONT_MATTER or CHAPTER_CONTENT'''

classifier_content_chapter_agent_prompt = '''You are a {subject} textbook content verifier. Determine if this text belongs to {chapterName}.

CONTEXT:
This is a GCSE-I level {subject} textbook chapter {chapterNumber} about {chapterName}.

CONTENT MATCHING RULES:
1. Content must actively teach material from {chapterName}, including:
   - Direct explanations of {chapterName} concepts
   - Examples and applications of {chapterName} topics
   - Practice questions/activities about {chapterName}
   - Relevant diagrams, figures, or experiments (e.g., “FIGURE {chapterNumber}.6”)

2. Reject content if it:
   - Teaches concepts from other chapters
   - Contains general book information
   - Is not directly related to {chapterName}
   - Refers to diagrams or figures that are irrelevant to {chapterName}
   - Describes general book features/experiment frameworks
   - Contains cross-chapter reference systems
   - Discusses exam skills without teaching specific {chapterName} content

3. Check for specific {chapterName} indicators:
   - Related concepts and explanations
   - Relevant examples and applications
   - Topic-specific diagrams, figures, or activities

4. Verify references to figures or graphs:
   - Ensure the figure or graph (e.g., “FIGURE {chapterNumber}.6”) clearly relates to {chapterName}
   - Any figure or graph should illustrate or support concepts from {chapterName}

RESPOND WITH EXACTLY ONE WORD: TRUE or FALSE
'''


classifier_content_back_matter_agent_prompt = '''You are a document classifier for a GCSE-I level {subject} textbook. Your task is to categorize text into exactly one of these two categories:

CHAPTER_CONTENT: Subject-matter teaching material including:
- Main educational content explaining scientific concepts
- Detailed explanations and descriptions of topics
- Practice questions within chapters
- Step-by-step procedures or experiments
- In-chapter examples and solutions
- Diagrams with explanatory text
Key indicator: Text TEACHES or EXPLAINS core subject concepts

BACK_MATTER: End-of-book reference material including:
- Answer keys to chapter questions
- Glossary of terms
- Index pages
- References and bibliography
- Appendices with supplementary data
- End-of-book summary tables
Key indicator: Text REFERENCES or INDEXES content rather than teaching it

CRITICAL DISTINCTION:
- If the page TEACHES or EXPLAINS concepts → CHAPTER_CONTENT
- If the page only LISTS, REFERENCES, or PROVIDES ANSWERS → BACK_MATTER

Remember: Answer keys and reference materials are ALWAYS BACK_MATTER, even if they contain detailed solutions.

RESPOND WITH EXACTLY ONE WORD: CHAPTER_CONTENT or BACK_MATTER'''


content_reformatter_prompt = '''You are a textbook content formatter. Your task is to reformat educational text to be clear, structured, and suitable for vector storage and retrieval.

FORMATTING RULES:
1. Create clear, complete sentences and paragraphs:
   - Combine related fragments into full sentences
   - Group related sentences into coherent paragraphs
   - Remove redundant whitespace and formatting
   - Remove all meta-commentary (like "In this chapter", "You will learn")
   - Remove concluding summary statements
   - Remove introductory phrases

2. Maintain hierarchical structure:
   - Keep section titles on separate lines
   - Preserve subsection headers
   - Remove learning objectives lists
   - Maintain only direct educational content flow

3. Clean up text by:
   - Converting bullet points into complete sentences
   - Converting tables into paragraph form
   - Making implicit relationships explicit
   - Removing page numbers and irrelevant markers
   - Removing any transitional phrases between sections

4. Ensure each paragraph:
   - Contains only factual content
   - Removes any reader-directed language
   - Maintains scientific/educational tone without commentary
   - Avoids summarizing or concluding statements

IMPORTANT: Output ONLY the reformatted content. Do not include any notes about what changes were made or processing steps taken. Return ONLY the cleaned, reformatted educational text.'''

In [75]:
def llm_agent(prompt, text):
    response = client.chat(model='pdfParser', messages=[
        {
            'role': 'system',
            'content': prompt
        },
        {
            'role': 'user',
            'content': text
        }
    ])
    output = response['message']['content']

    cleaned_content = re.sub(r"<think>.*?</think>\n+", "", output, flags=re.DOTALL)
    return cleaned_content.strip().upper()

In [76]:
def classify_front_matter_or_chapter(text: str, subject: str) -> str:
    """
    Determines if the text is FRONT_MATTER or CHAPTER_CONTENT.
    Returns one of: "FRONT_MATTER" or "CHAPTER_CONTENT".
    """
    print(classifier_front_content_agent_prompt.format(subject=subject))
    classified_output = llm_agent(
        classifier_front_content_agent_prompt.format(subject=subject),
        text
    )
    
    # Validate the output to ensure it's one word
    if len(classified_output.split()) != 1:
        classified_output = "CHAPTER_CONTENT"
    
    return classified_output

In [63]:
def is_content_about_chapter(text: str, subject: str, chapter_name: str, chapter_number: int) -> bool:
    """
    Checks if 'text' is about the given 'chapter_name'.
    Returns True/False, corresponding to "TRUE"/"FALSE" from LLM.
    """
    print(classifier_content_chapter_agent_prompt.format(subject=subject,chapterName=chapter_name,chapterNumber=chapter_number))
    bool_flag = llm_agent(
        classifier_content_chapter_agent_prompt.format(
            subject=subject,
            chapterName=chapter_name,
            chapterNumber=chapter_number
        ),
        text
    )
    
    # Normalize the response
    if bool_flag not in ["TRUE", "FALSE"]:
        bool_flag = "TRUE"
    
    return (bool_flag == "TRUE")

In [65]:
def classify_back_matter_or_chapter(text: str, subject: str) -> str:
    """
    Determines if the text is BACK_MATTER or CHAPTER_CONTENT.
    Returns one of: "BACK_MATTER" or "CHAPTER_CONTENT".
    """
    print(classifier_content_back_matter_agent_prompt.format(subject=subject))
    classified_output = llm_agent(
        classifier_content_back_matter_agent_prompt.format(subject=subject),
        text
    )
    
    # Validate the output to ensure it's one word
    if len(classified_output.split()) != 1:
        classified_output = "BACK_MATTER"
    
    return classified_output

In [77]:
def reformat_content(text: str) -> str:
    """
    Reformats the content to satisfy the rules in 'content_reformatter_prompt'.
    """
    return llm_agent(content_reformatter_prompt, text)

In [79]:
def write_content_to_chapter(chapter_idx: int, chapters: list, content_reformatted: str, book_dir: Path) -> None:
    """
    Writes the reformatted content to the appropriate chapter file. If the file
    already exists, it appends; otherwise, it creates a new file.
    """
    chapter_file = book_dir / f"chapter_{chapters[chapter_idx]['chapterNumber']}.txt"
    if chapter_file.exists():
        mode = 'a'
    else:
        mode = 'w'
    
    with open(chapter_file, mode) as f:
        f.write(content_reformatted + '\n')
        f.write("SKIP" + '\n')

In [80]:
def process_textbook_content(content_list, metadata):
    """
    Iterates over each page's text in content_list and classifies it
    into front matter, chapter content, or back matter. Writes
    reformatted text to chapter files accordingly.
    """
    current_chapter_idx = -1  # Use -1 to indicate front matter state
    chapters = metadata["textbook"]["chapters"]
    
    # But for script we use the below:
    # book_dir = Path(__file__).parent.parent / "book"
    # book_dir.mkdir(exist_ok=True)
    
    # Create the book directory (Jupyter Version)
    current_dir = Path.cwd()
    parent_dir = current_dir.parent
    book_dir = parent_dir / "book"
    book_dir.mkdir(exist_ok=True)
    
    for page, content in enumerate(content_list):
        # 1) Skip empty pages
        raw_text = content.text.strip()
        if not raw_text:
            continue
        reformatted_text = reformat_content(raw_text)
        # 2) Now classify the text
        if current_chapter_idx == -1:
            # Still in front matter territory: front or first chapter
            front_or_chapter = classify_front_matter_or_chapter(
                raw_text,
                metadata['textbook']['subject']
            )
            if front_or_chapter == "CHAPTER_CONTENT":
                # Check if it's about the first chapter (index 0)
                if is_content_about_chapter(
                    reformatted_text,
                    metadata['textbook']['subject'],
                    chapters[0]["title"],
                    chapters[0]["chapterNumber"]
                ):
                    current_chapter_idx = 0
                    write_content_to_chapter(
                        current_chapter_idx,
                        chapters,
                        reformatted_text,
                        book_dir
                    )
                    
        elif current_chapter_idx < len(chapters):
            # We are inside a known chapter. Check if it is about the current chapter.
            if is_content_about_chapter(
                reformatted_text,
                metadata['textbook']['subject'],
                chapters[current_chapter_idx]["title"],
                chapters[current_chapter_idx]["chapterNumber"]
            ):
                # Still the same chapter
                write_content_to_chapter(
                    current_chapter_idx,
                    chapters,
                    reformatted_text,
                    book_dir
                )
            else:
                # Possibly next chapter or back matter
                # Try the next chapter if there is one
                if current_chapter_idx + 1 < len(chapters):
                    if is_content_about_chapter(
                        reformatted_text,
                        metadata['textbook']['subject'],
                        chapters[current_chapter_idx + 1]["title"],
                        chapters[current_chapter_idx + 1]["chapterNumber"]
                    ):
                        # Move on to the next chapter
                        current_chapter_idx += 1
                        write_content_to_chapter(
                            current_chapter_idx,
                            chapters,
                            reformatted_text,
                            book_dir
                        )
                    else:
                        # Not about the next chapter → default to current
                        write_content_to_chapter(
                            current_chapter_idx,
                            chapters,
                            reformatted_text,
                            book_dir
                        )
                else:
                    # If we're at the last chapter, check if it's back matter
                    back_or_chapter = classify_back_matter_or_chapter(
                        reformatted_text,
                        metadata['textbook']['subject']
                    )
                    if back_or_chapter == "BACK_MATTER":
                        break
                    else:
                        # If it's "CHAPTER_CONTENT", we append it to the current (last) chapter
                        write_content_to_chapter(
                            current_chapter_idx,
                            chapters,
                            reformatted_text,
                            book_dir
                        )

        else:
            # We've run out of chapters (current_chapter_idx is the last)
            # Let's see if it's back matter or still chapter content.
            back_or_chapter = classify_back_matter_or_chapter(
                reformatted_text,
                metadata['textbook']['subject']
            )
            if back_or_chapter == "CHAPTER_CONTENT":
                # Check if it still relates to the final chapter
                if is_content_about_chapter(
                    raw_text,
                    metadata['textbook']['subject'],
                    chapters[current_chapter_idx]["title"],
                    chapters[current_chapter_idx]["chapterNumber"]
                ):
                    write_content_to_chapter(
                        current_chapter_idx,
                        chapters,
                        reformatted_text,
                        book_dir
                    )
            else:
                # If it's definitely back matter, we stop or break out
                break

In [None]:
process_textbook_content(b, metadata)

You are a document classifier for a GCSE-I level Biology textbook. Your task is to categorize text into exactly one of these two categories:

FRONT_MATTER: Book-level introductory content including:
- Title page, copyright information, author lists
- Table of contents and ALL chapter listings
- Course structure and chapter outlines
- Book introductions, assessment information
- ANY page that ONLY lists topics or chapters, even if they include scientific terms
Key indicator: Page ONLY lists or organizes content, does NOT explain concepts
- "Book usage guides, experiment methodology overviews"
- "General skill development sections"
- "Cross-chapter reference systems (icons, progression scales)"

CHAPTER_CONTENT: Subject-matter teaching material including:
- Pages that EXPLAIN scientific concepts
- Pages with actual scientific definitions and descriptions
- Pages containing practice questions or exercises
- Pages with detailed diagrams and their explanations
Key indicator: Page must TEACH

In [415]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="bge-m3")



In [416]:
embeddings

OllamaEmbeddings(model='bge-m3', base_url=None, client_kwargs={})

In [417]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="textbook_docs",
    embedding_function=embeddings,
)

In [422]:
vector_store

<langchain_chroma.vectorstores.Chroma at 0x16a2e6270>