## Installs and Imports

In [None]:
!pip install PyPDF2
!pip install openai
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain
!pip install scikit-learn

In [None]:
import os
from PyPDF2 import PdfReader, PdfWriter
from openai import OpenAI
from openai.types.beta.threads.message_create_params import Attachment, AttachmentToolFileSearch
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from sklearn.metrics.pairwise import cosine_similarity
import math
from langchain.load import dumps, loads
from operator import itemgetter
from dataclasses import dataclass
from typing import List, Optional, Dict

## Configuration

In [None]:
# LangChain tracing settings and API credentials, exported as environment variables.
# The two *_API_KEY values are placeholders and must be replaced with real keys;
# later cells read OPENAI_API_KEY back from os.environ when building OpenAI clients.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = 'your_langchain_api_key'
os.environ['OPENAI_API_KEY'] = 'your_openai_api_key'

## Important Note

## LLM-Based PDF-to-Text Conversion (Quality > Traditional Approaches)

#### Split PDF Into Individual Pages (To Overcome Limited Context Windows)

In [None]:
# Split rag.pdf into one single-page PDF per page: split_pages/rag-<n>.pdf (n is 1-based).
output_folder = "split_pages"
# exist_ok=True replaces the check-then-create idiom and keeps the cell re-runnable.
os.makedirs(output_folder, exist_ok=True)

rag_pdf = PdfReader('rag.pdf')

for page_number, page in enumerate(rag_pdf.pages, start=1):
    # A fresh writer per page so every output file holds exactly one page.
    pdf_writer = PdfWriter()
    pdf_writer.add_page(page)

    output_filename = os.path.join(output_folder, f"rag-{page_number}.pdf")

    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)

# At this point, look at the split pages and manually delete the last one, as it is blank. Blank pages behave weirdly during conversion for some reason - I will fix this afterwards with an extra prompt that tests for blankness (a Boolean test).

#### Checks If the LLM Response Indicates an Error

In [None]:
def is_error_message_with_chatgpt(response):
    """Ask the chat model whether ``response`` reads like an error message.

    Args:
        response: Text to classify (interpolated into the prompt as-is).

    Returns:
        True when the model's reply starts with "yes" (case-insensitive),
        otherwise False.
    """
    prompt = (
        f"Does the following text read like an error or technical issue? "
        f"Reply with 'Yes' or 'No' only. (do not reply with anything else at all)\n\n{response}"
    )

    client = OpenAI(
        api_key = os.environ['OPENAI_API_KEY']
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4-1106-preview",
    )

    # The model does not always obey "Yes or No only": replies such as "Yes." or
    # "Yes, it does" must still count as a yes, and a missing content must not crash.
    verdict = (chat_completion.choices[0].message.content or "").strip().lower()
    return verdict.startswith('yes')

#### Converts a Single Page into Text, Appending This to out_2.txt

In [None]:
def fileToText(file_path, max_attempts=5):
    """Convert one single-page PDF to text with an OpenAI assistant and append it to out_2.txt.

    The page is uploaded, read by a file_search assistant, and the reply is
    checked with ``is_error_message_with_chatgpt``. Failed attempts are retried
    up to ``max_attempts`` times in a loop (the original recursed without limit
    and discarded nothing it had uploaded).

    Args:
        file_path: Path of the PDF page to convert.
        max_attempts: Maximum number of conversion attempts before giving up.

    Raises:
        RuntimeError: If no attempt produced a usable text reply.
    """
    client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    prompt = "This is a mathematical document that may contain diagrams, LaTeX, and complex formatting. Please: 1. Extract ALL readable text content (Your goal is to convert the entire PDF to a text output that is as close a copy as possible) 2. Preserve mathematical notation where possible (convert LaTeX to an appropriate format) 3. Ignore diagrams and figures 4. Maintain section numbering and structure, EXCEPT the running headers and page numbers. Please ignore the running headers (which are in the top corners) and page numbers (also at the top, but the other corner). Remember, just read the file, don't add any extra comments."

    def read_once(assistant_id):
        """Run one upload + assistant run; return the reply text, or None on failure."""
        # Close the local handle as soon as the upload call returns.
        with open(file_path, 'rb') as pdf_handle:
            file = client.files.create(
                file=pdf_handle,
                purpose='assistants'
            )
        try:
            thread = client.beta.threads.create()
            client.beta.threads.messages.create(
                thread_id = thread.id,
                role='user',
                content=prompt,
                attachments=[Attachment(file_id=file.id, tools=[AttachmentToolFileSearch(type='file_search')])]
            )

            run = client.beta.threads.runs.create_and_poll(
                thread_id=thread.id,
                assistant_id=assistant_id,
                timeout=300,
                tools=[{"type": "file_search"}],
            )
            if run.status != "completed":
                return None

            messages = list(client.beta.threads.messages.list(thread_id=thread.id))
            # The first listed message is taken as the assistant's reply (as in the original).
            if not messages or not messages[0].content:
                return None
            first_part = messages[0].content[0]
            if first_part.type != "text":
                # Previously an assert; a non-text reply is simply a failed attempt.
                return None
            return first_part.text.value + '\n'
        finally:
            # Remove the upload on every path, not only after a successful write.
            client.files.delete(file.id)

    # One assistant per call (not per attempt), deleted when the call finishes.
    assistant = client.beta.assistants.create(
        model='gpt-4-1106-preview',
        description='You are a data retrieval assistant.',
        instructions="Return the COMPLETE content from the provided file, preserving all mathematical notation and formatting.",
        tools=[{"type": "file_search"}],
        name='Maths PDF Reader',
    )
    try:
        for attempt in range(1, max_attempts + 1):
            res_txt = read_once(assistant.id)
            if res_txt is not None and not is_error_message_with_chatgpt(res_txt):
                with open('out_2.txt', 'a', encoding='utf-8') as test_file:
                    test_file.write(res_txt)
                return
            if attempt < max_attempts:
                print('Error Upon Reading... Retrying')
        raise RuntimeError(
            f"Could not convert {file_path} to text after {max_attempts} attempts"
        )
    finally:
        client.beta.assistants.delete(assistant.id)

#### Iterates Through All Pages for Conversion (Slow, But This is One-Time)

In [None]:
def _page_number(filename):
    """Return the first run of digits in ``filename`` as an int, or 0 if there is none."""
    found = re.search(r'(\d+)', filename)
    return int(found.group(0)) if found else 0


# Only the per-page PDFs are converted; stray directory entries (e.g. .DS_Store,
# checkpoint folders) would otherwise be uploaded and retried as if they were pages.
files = [name for name in os.listdir('split_pages') if name.lower().endswith('.pdf')]
# Numeric sort so that rag-10.pdf follows rag-9.pdf rather than rag-1.pdf.
sorted_files = sorted(files, key=_page_number)

for file in sorted_files:
    file_path = os.path.join('split_pages', file)
    fileToText(file_path)
    print(f'{file_path} Processed')

# Final RAG Model (Not General, But There Are Some General Models Below and in the Ideas for the Future Section)

### Uses Multi-Query + RAG Tree-Based Parser + Routing Example + > Near-Maximum Cosine Similarity + Keeps History of Last 2 Messages

### Go to the Final ChatBot Part to Interact With It (Just Before Ideas for the Future). Until That ChatBot Cell, Run Every Previous Cell In Order From the Beginning of the Whole Notebook.

#### Pre-Processing to Remove Running Headers + Page Numbers

In [None]:
def remove_running_headers(input_file, output_file):
    """Copy ``input_file`` to ``output_file``, dropping header-like lines.

    A line is dropped when it starts (after optional whitespace) with
    ``CHAPTER <digits>``, when its stripped form consists only of capital
    letters, digits, whitespace, dots and hyphens, or when its stripped form
    is a bare number. Every other line is written through unchanged.
    """
    chapter_banner = re.compile(r'^\s*CHAPTER\s+\d+')
    capitals_only = re.compile(r'^\s*[A-Z0-9\s\.\-]+$')
    bare_number = re.compile(r'^\s*\d+\s*$')

    def is_noise(raw):
        # The chapter test sees the raw line; the other two see the stripped one.
        if chapter_banner.match(raw):
            return True
        trimmed = raw.strip()
        if capitals_only.match(trimmed):
            return True
        return bare_number.match(trimmed) is not None

    with open(input_file, "r", encoding = 'utf-8') as source:
        with open(output_file, "w", encoding = 'utf-8') as sink:
            sink.writelines(row for row in source if not is_noise(row))

# Clean the raw conversion output; the tree-parsing cell below reads cleaned_output.txt.
remove_running_headers("out_2.txt", "cleaned_output.txt")

#### Creates a Tree Based on the Structure of Maths Notes

In [None]:
class MathNode:
    """One node of the document tree built by ``MathDocumentParser``.

    Args:
        content: Text body of the node.
        node_type: Kind of node, e.g. "root", "chapter", "section", "theorem".
        identifier: Label such as "Chapter 1", "1.2" or "Theorem 3".
        parent: Owning node, or None.
        children: Initial child list; a new empty list is used when falsy.
    """

    def __init__(self, content, node_type, identifier, parent=None, children=None):
        self.content = content
        self.node_type = node_type
        self.identifier = identifier
        self.parent = parent
        self.children = children or []


class MathDocumentParser:
    """Line-based parser that turns maths notes into a tree of ``MathNode`` objects.

    Structure: root -> chapter -> section -> subsection, with statement nodes
    (theorem, lemma, proposition, definition, example, proof) attached to the
    current subsection, or to the current section when there is no subsection.
    """

    # Pattern keys that describe headings rather than statement-like nodes.
    _HEADING_TYPES = ('chapter', 'section', 'subsection')

    def __init__(self):
        # All patterns are applied with re.match, i.e. anchored at the line start.
        self.patterns = {
            'chapter': r'Chapter\s+(\d+)',
            'section': r'(\d+\.\d+)\s+([^\n]+)',
            'subsection': r'(\d+\.\d+\.\d+)\s+([^\n]+)',
            'theorem': r'Theorem\s+(\d+)',
            'lemma': r'Lemma\s+(\d+)',
            'proposition': r'Proposition\s+(\d+)',
            'definition': r'Definition\s+(\d+)',
            'example': r'Example\s+(\d+)',
            'proof': r'Proof(?:\s+of\s+(?:Theorem|Lemma|Proposition)\s+\d+)?'
        }
        self.root = None

    @staticmethod
    def _close(node, lines):
        """Store the accumulated ``lines`` as the stripped content of ``node`` (if any)."""
        if node is not None:
            node.content = '\n'.join(lines).strip()

    @staticmethod
    def _attach(parent, node_type, identifier):
        """Create an empty node under ``parent`` and return it."""
        node = MathNode(content="", node_type=node_type, identifier=identifier, parent=parent)
        parent.children.append(node)
        return node

    def _match_statement(self, line):
        """Return ``(content_type, match)`` for the first statement pattern matching ``line``, else None."""
        for content_type, pattern in self.patterns.items():
            if content_type in self._HEADING_TYPES:
                continue
            match = re.match(pattern, line)
            if match:
                return content_type, match
        return None

    def parse(self, text: str):
        """Parse ``text`` and store the resulting tree in ``self.root``.

        Heading lines (chapter/section/subsection) open a new node and are not
        kept as content. Lines that open a statement node ARE kept as the first
        line of that node's content: notes usually put the statement on the same
        line as its label ("Definition 1. A field is ..."), and dropping the line
        lost that text. A statement line seen before any section exists is
        treated as ordinary text of the current node.
        """
        self.root = MathNode("Document Root", "root", "root")
        current_chapter = None
        current_section = None
        current_subsection = None
        current_node = None
        current_content = []

        for line in text.split('\n'):
            chapter_match = re.match(self.patterns['chapter'], line)
            if chapter_match:
                self._close(current_node, current_content)
                current_chapter = self._attach(
                    self.root, "chapter", f"Chapter {chapter_match.group(1)}"
                )
                current_node = current_chapter
                current_section = None
                current_subsection = None
                current_content = []
                continue

            # Sections only exist inside a chapter.
            section_match = re.match(self.patterns['section'], line)
            if section_match and current_chapter is not None:
                self._close(current_node, current_content)
                current_section = self._attach(
                    current_chapter, "section", section_match.group(1)
                )
                current_node = current_section
                current_subsection = None
                current_content = []
                continue

            # Subsections only exist inside a section.
            subsection_match = re.match(self.patterns['subsection'], line)
            if subsection_match and current_section is not None:
                self._close(current_node, current_content)
                current_subsection = self._attach(
                    current_section, "subsection", subsection_match.group(1)
                )
                current_node = current_subsection
                current_content = []
                continue

            statement = self._match_statement(line)
            parent = current_subsection if current_subsection is not None else current_section
            if statement is not None and parent is not None:
                content_type, match = statement
                self._close(current_node, current_content)
                # The proof pattern has no capture group, so it gets a fixed label.
                if content_type == 'proof':
                    identifier = "Proof"
                else:
                    identifier = f"{content_type.capitalize()} {match.group(1)}"
                current_node = self._attach(parent, content_type, identifier)
                # Keep the opening line so same-line statement text is preserved.
                current_content = [line]
                continue

            # Ordinary text belongs to whichever node is currently open.
            if current_node is not None:
                current_content.append(line)

        self._close(current_node, current_content)

#### Both Prints a Summary of the Tree AND Creates the Tree Instance

In [None]:
def print_tree(node, level = 0):
    """Print ``node`` and all of its descendants as an indented outline.

    Each node prints a ``type: identifier`` line; nodes with content also
    print a one-line preview (newlines replaced by spaces, first 50
    characters, followed by "..." when the content is longer than 50).
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped (and printed) in their original order.
    pending = [(node, level)]
    while pending:
        current, depth = pending.pop()
        pad = "  " * depth
        print(f"{pad}├── {current.node_type}: {current.identifier}")
        body = current.content
        if body:
            preview = body.replace('\n', ' ')[:50]
            if len(body) > 50:
                preview = preview + "..."
            print(f"{pad}│   Content: {preview}")
        pending.extend((child, depth + 1) for child in reversed(current.children))

# Build the document tree from the cleaned text. `parser` is a notebook-level
# global: the leaf-collection cell below reads parser.root.
parser = MathDocumentParser()

with open('cleaned_output.txt', 'r', encoding='utf-8') as f:
    content = f.read()

parser.parse(content)
# Print an outline of the parsed tree for a quick visual check.
print("\nDocument Structure:")
print_tree(parser.root)

#### Only the Leaves of the Tree are Document Nodes

In [None]:
def get_leaf_contents(node):
    """Return a Document for every childless node under ``node`` that has content.

    Leaves are visited depth-first in child order; nodes with children
    contribute nothing themselves, and leaves with empty content are skipped.
    """
    def iter_leaves(current):
        # A node with children is never a leaf: descend instead of yielding it.
        if current.children:
            for child in current.children:
                yield from iter_leaves(child)
        else:
            yield current

    return [
        Document(page_content = leaf.content)
        for leaf in iter_leaves(node)
        if leaf.content
    ]

# Retrieval units used by the final chatbot: one Document per non-empty leaf.
splits_leaves = get_leaf_contents(parser.root)

#### This is Multi-Query

In [None]:
def generate_sub_queries(query):
    """Ask the chat model for sub-queries of ``query`` (multi-query retrieval).

    Args:
        query: The user query, or a query-plus-history string, interpolated
            into the prompt.

    Returns:
        A list of sub-query strings, one per non-empty line of the model's
        reply, with any leading list marker ("1. ", "2) ", "- ", "* ") removed.
        The prompt asks for 3, but the length is whatever the model produced.
    """
    sub_query_prompt = f"""Break down this mathematical query into smaller, related queries that would help build a complete answer.
Query: {query}

Let's generate specific sub-queries to find:
1. The direct definition/axioms
2. Related properties mentioned elsewhere
3. Any additional conditions or properties that combine with these
4. Any special cases or extensions

I need specific sub-queries. For example, given a Query: List the axioms of the reals, you should return something similar to:
What are the field axioms that the reals satisfy?,
What are the ordering properties of the reals?,
What additional properties are mentioned about the real numbers?,
How are the reals defined in terms of fields and ordering?

Do not try to answer the Query at all. Just create sub-queries (so questions that help to build up a larger picture. I'm using you for a RAG tool and this is multi-query)
Also, when you generate sub-queries, do not use your own knowledge AT ALL. Just stick to sub-queries that generalize the query based on knowledge of English and NOT of maths. Remember, this is a RAG tool.
To re-iterate, don't use your maths knowledge AT ALL. 
Just don't use your own knowledge of the Query AT ALL.

Importantly, you are guiding this strategy. So as the master of this RAG tool, you should 3 choose sub-queries that will be able to gather information from disparate parts of the input file.

Just don't use your maths knowledge AT ALL. 
Just don't use your own knowledge of the Query AT ALL.


Generate 3 sub-queries:"""

    # Can probably optimise the prompt naturally

    client = OpenAI(
        api_key = os.environ['OPENAI_API_KEY']
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": sub_query_prompt,
            }
        ],
        model="gpt-4-1106-preview",
    )

    qs = chat_completion.choices[0].message.content or ""

    # Strip only a *leading* list marker. Splitting on the first ". " anywhere
    # (as before) cut the front off unnumbered questions containing ". ", and
    # blank separator lines were returned as empty sub-queries.
    list_marker = re.compile(r'^\s*(?:\d+[.)]|[-*•])\s+')
    sub_queries = []
    for raw_line in qs.splitlines():
        cleaned = list_marker.sub('', raw_line).strip()
        if cleaned:
            sub_queries.append(cleaned)
    return sub_queries


#### Example of Routing (This is a Proof-of-Concept More Than Anything). Just Determines Whether a Query Asks for a Proof or Not.

In [None]:
def classify_math_query(query):
    """Ask the chat model whether ``query`` requests a proof.

    Args:
        query: The user's question, interpolated into the prompt.

    Returns:
        The model's reply, lower-cased and with surrounding whitespace, quotes
        and trailing punctuation removed. The prompt requests 'proof' or
        'other', but the value is not otherwise validated.
    """
    
    classification_prompt = f"""Determine if this mathematical query is asking for a proof or something else.

    Think step by step:
    1. Is this asking for a proof/demonstration of why something is true?
    2. Or is it asking for something else (definition, explanation, etc.)?
    
    Query: {query}
    
    Return ONLY 'proof' or 'other'."""

    client = OpenAI(
        api_key = os.environ['OPENAI_API_KEY']
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": classification_prompt,
            }
        ],
        model="gpt-4-1106-preview"
    )

    answer = chat_completion.choices[0].message.content or ""
    # The prompt itself shows the labels in quotes, so the model sometimes echoes
    # them as "'other'" or "Other."; the caller compares against the bare word.
    return answer.strip().strip("'\"`.!").strip().lower()

#### Retrieval Algorithm for Non-Proofs (Again, Arbitrary)

In [2]:
def get_sorted_documents_without_neighbors(documents, query):
    """Return the documents whose similarity to ``query`` is near the best score.

    Uses the notebook-level ``embd`` and ``doc_embeddings``; ``documents`` is
    paired with ``doc_embeddings`` by position. Documents are returned in
    descending similarity order, stopping at the first one that does not
    exceed the cut-off.
    """
    similarities = cosine_similarity([embd.embed_query(query)], doc_embeddings)[0]
    ranked = sorted(
        ((candidate, similarities[position]) for position, candidate in enumerate(documents)),
        key=itemgetter(1),
        reverse=True,
    )
    # Cut-off: (best score - 0.01) rounded down to a multiple of 0.05.
    # This works, but can vary it depending on the query,
    # or do some routing based on the type of query (but this is decent).
    best_score = ranked[0][1]
    cutoff = math.floor((best_score - 0.01) / 0.05) * 0.05
    kept = []
    for candidate, score in ranked:
        if not score > cutoff:
            break
        kept.append(candidate)
    return kept

#### Retrieval Algorithm for Proofs

In [None]:
def get_top_n_distinct(documents, query, n):
    """Return the ``n`` documents most similar to ``query``, best first.

    Uses the notebook-level ``embd`` and ``doc_embeddings``; ``documents`` is
    paired with ``doc_embeddings`` by position.
    """
    query_vector = embd.embed_query(query)
    scores = cosine_similarity([query_vector], doc_embeddings)[0]
    ranked = sorted(
        ((candidate, scores[position]) for position, candidate in enumerate(documents)),
        key=itemgetter(1),
        reverse=True,
    )
    return [candidate for candidate, _score in ranked[:n]]

#### Doing This Here Makes Some Other Functions More Efficient

In [None]:
# Shared embedding client and per-leaf embeddings, computed once and reused by
# the retrieval functions above (index i corresponds to splits_leaves[i]).
embd = OpenAIEmbeddings()
# embed_documents embeds the whole list in batched requests instead of issuing
# one embed_query API call per leaf.
doc_embeddings = embd.embed_documents([doc.page_content for doc in splits_leaves])

#### Final ChatBot (Uses Multi-Query + RAG Tree-Based Parser + Routing Example + > Near-Maximum Cosine Similarity + Keeps History of Last 2 Messages)

In [None]:
template = """Given the following mathematical context from the notes:
    {context}
    
    Question To Answer Now: {question}

    Previous Question:Response Pairs in Order from Least Recent to Most Recent (You are a ChatBot, so these are the previous Questions asked of you, and the Responses you gave to each):{q_a}
    
    Mathematical Response:
    [Response here, using formal mathematical notation where appropriate]
    
    Important Notes:
    - All statements are made strictly based on the provided context
    - Any mathematical notation used follows directly from the source material
    - No additional mathematical facts or properties are assumed beyond what's given
    - Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
    - Do not miss out on critical information. Make sure to include this.
    - You may or may not need to incorporate the previous Questions and Responses. Do this when you think it is suitable.
    
    [If any crucial context appears to be missing for a complete answer, note that explicitly]
    """

# The prompt, model and chain do not depend on the query, so they are built
# once instead of on every turn.
prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)
rag_chain = prompt | llm | StrOutputParser()

# Upper bound on regeneration attempts when the model does not return exactly
# three sub-queries (the original retried without limit).
MAX_SUB_QUERY_ATTEMPTS = 5

# Rolling history: at most the last two "Question/Response" strings.
q_a_list = []

while True:
    query = input('Query: ')
    # Typing "exit" or "quit" ends the chat so later cells can run; previously
    # the loop could only be stopped by interrupting the kernel.
    if query.strip().lower() in ('exit', 'quit'):
        break

    q_a = 'No Previous Questions and Responses'
    if q_a_list:
        q_a = '\n' + ''.join(q_a_list)
    query_template = f"""The Current Query is: {query}.
    The previous Questions and Responses in order from Least Recent to Most Recent (You are a helper tool to a Chatbot, so these are the previous Questions asked of it, and the Responses it gave to each):{q_a}
    """

    # Routing: proof requests use only the single best leaf; everything else
    # goes through multi-query retrieval.
    query_type = classify_math_query(query)
    if query_type == 'other':
        sub_queries = generate_sub_queries(query_template)
        attempts = 1
        while len(sub_queries) != 3 and attempts < MAX_SUB_QUERY_ATTEMPTS:
            sub_queries = generate_sub_queries(query_template)
            attempts += 1
        # Fall back to whatever usable sub-queries came back (at most three).
        sub_queries = [sq for sq in sub_queries if sq.strip()][:3]

        # Retrieve for the original query and each sub-query, keeping the first
        # occurrence of every distinct piece of content.
        seen_content = set()
        unique_docs = []
        for sq in [query] + sub_queries:
            for doc in get_sorted_documents_without_neighbors(splits_leaves, sq):
                if doc.page_content not in seen_content:
                    seen_content.add(doc.page_content)
                    unique_docs.append(doc)
    else:
        unique_docs = get_top_n_distinct(splits_leaves, query, 1)

    # Call the model before opening final.txt so a failed call does not wipe
    # the previous answer.
    response = rag_chain.invoke({"context": unique_docs,"question": query, "q_a": q_a})
    with open('final.txt', 'w', encoding = 'utf-8') as file:
        file.write(response)

    if len(q_a_list) >= 2:
        q_a_list.pop(0)
    q_a_list.append('Question: ' + query + ', Response: ' + response + '\n')

# If you go to final.txt, your output is there

## Ideas for the Future

# What I Did to Get to The Above Point

## Simple RAG Model

In [None]:
# Load the whole converted text as one Document and split it into overlapping
# chunks (chunk_size=1000, chunk_overlap=200, measured via the tiktoken encoder).
with open('out.txt', 'r', encoding = 'utf-8') as file:
    text = file.read()
doc = Document(page_content = text)

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, 
    chunk_overlap=200)

# `splits` is consumed by the vector-store cell that follows.
splits = text_splitter.split_documents([doc])

In [None]:
# Index the chunks in a Chroma vector store (collection metadata requests
# cosine space) and expose it as a retriever with default settings.
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings(),
                                    collection_metadata={"hnsw:space": "cosine"})

retriever = vectorstore.as_retriever()

In [None]:
# Prompt for the simple RAG baseline: retrieved context plus the question.
template = """Given the following mathematical context from the notes:
{context}

Question: {question}

Mathematical Response:
[Response here, using formal mathematical notation where appropriate]

Important Notes:
- All statements are made strictly based on the provided context
- Any mathematical notation used follows directly from the source material
- No additional mathematical facts or properties are assumed beyond what's given
- Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
- Do not miss out on critical information. Make sure to include this.

[If any crucial context appears to be missing for a complete answer, note that explicitly]
"""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

# The query string is fed to the retriever (as "context") and passed through
# unchanged (as "question") before reaching the prompt.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = 'What are the axioms of a field?'
# The answer overwrites final.txt.
with open('final.txt', 'w', encoding = 'utf-8') as file:
    file.write(rag_chain.invoke(query))

## Simple RAG Model + Math Delimiters

In [None]:
# Same loading/splitting as the simple model, but with custom separators.
with open('out.txt', 'r', encoding = 'utf-8') as file:
    text = file.read()
doc = Document(page_content = text)

# Separator list passed to the splitter: maths keywords at the start of a
# line come first, followed by generic fallbacks.
math_delimiters = [
    "\nTheorem", "\nLemma", "\nDefinition", "\nProof",
    "\nExample", "\nProposition", "\nCorollary",
    "\n\n", "\n", " ", ""
]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, 
    chunk_overlap=200,
    separators=math_delimiters
)

# `splits` is consumed by the vector-store cell that follows.
splits = text_splitter.split_documents([doc])

In [None]:
# Re-index the delimiter-based chunks; this rebinds `vectorstore` and `retriever`.
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings(),
                                    collection_metadata={"hnsw:space": "cosine"})

retriever = vectorstore.as_retriever()

In [None]:
# Same prompt and chain as the simple RAG baseline, run against the
# delimiter-based retriever with a different test query.
template = """Given the following mathematical context from the notes:
{context}

Question: {question}

Mathematical Response:
[Response here, using formal mathematical notation where appropriate]

Important Notes:
- All statements are made strictly based on the provided context
- Any mathematical notation used follows directly from the source material
- No additional mathematical facts or properties are assumed beyond what's given
- Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
- Do not miss out on critical information. Make sure to include this.

[If any crucial context appears to be missing for a complete answer, note that explicitly]
"""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = 'Can you define the reals using the decimal expansion definition?'
# The answer overwrites final.txt.
with open('final.txt', 'w', encoding = 'utf-8') as file:
    file.write(rag_chain.invoke(query))

## Removing Math Delimiters & Forcing Uniqueness (Custom Retrieval)

In [None]:
# Back to the default separators: load the text and split it into overlapping
# chunks for the custom (non-vector-store) retrieval below.
with open('out.txt', 'r', encoding = 'utf-8') as file:
    text = file.read()
doc = Document(page_content = text)


text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, 
    chunk_overlap=200
)

splits = text_splitter.split_documents([doc])

In [None]:
def get_sorted_documents_with_neighbors(documents, query, num_results=3):
    """Return the top ``num_results`` documents by cosine similarity, each with its neighbours.

    For every top hit (best first) the hit itself, then the preceding document,
    then the following document are added, skipping positions that are out of
    range or already included. Every document is embedded on each call.
    """
    embedder = OpenAIEmbeddings()
    vectors = [embedder.embed_query(item.page_content) for item in documents]
    scores = cosine_similarity([embedder.embed_query(query)], vectors)[0]

    ranked = sorted(
        ((item, scores[position], position) for position, item in enumerate(documents)),
        key=itemgetter(1),
        reverse=True,
    )

    total = len(documents)
    picked = []
    taken = set()
    for _item, _score, hit in ranked[:num_results]:
        # Order matters: the hit first, then its left and right neighbours.
        for position in (hit, hit - 1, hit + 1):
            if 0 <= position < total and position not in taken:
                taken.add(position)
                picked.append(documents[position])
    return picked

query = 'Can you list all definitions in the context?'
distinct_docs = get_sorted_documents_with_neighbors(splits, query, num_results=3)

In [None]:
# Same prompt as before, but the context is supplied directly from
# `distinct_docs` (custom retrieval) instead of through a retriever.
template = """Given the following mathematical context from the notes:
{context}

Question: {question}

Mathematical Response:
[Response here, using formal mathematical notation where appropriate]

Important Notes:
- All statements are made strictly based on the provided context
- Any mathematical notation used follows directly from the source material
- No additional mathematical facts or properties are assumed beyond what's given
- Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
- Do not miss out on critical information. Make sure to include this.

[If any crucial context appears to be missing for a complete answer, note that explicitly]
"""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

rag_chain = prompt | llm | StrOutputParser()

# The answer overwrites final.txt.
with open('final.txt', 'w', encoding = 'utf-8') as file:
    response = rag_chain.invoke({"context": distinct_docs,"question": query})
    file.write(response)

## Math Delimiters + >80% Cosine Similarity (Does Not Work, Chunks Too Large -> Dilution)

In [None]:
# Load the text and split it with the maths-aware separators again, this time
# for the fixed-threshold retrieval experiment below.
with open('out.txt', 'r', encoding = 'utf-8') as file:
    text = file.read()
doc = Document(page_content = text)

math_delimiters = [
    "\nTheorem", "\nLemma", "\nDefinition", "\nProof",
    "\nExample", "\nProposition", "\nCorollary",
    "\n\n", "\n", " ", ""
]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, 
    chunk_overlap=200,
    separators=math_delimiters
)

splits = text_splitter.split_documents([doc])

In [None]:
def get_sorted_documents_without_neighbors(documents, query):
    """Return every document whose cosine similarity to ``query`` exceeds 0.8, best first.

    Every document is embedded on each call.
    """
    embedder = OpenAIEmbeddings()
    vectors = [embedder.embed_query(item.page_content) for item in documents]
    scores = cosine_similarity([embedder.embed_query(query)], vectors)[0]
    ranked = sorted(
        ((item, scores[position]) for position, item in enumerate(documents)),
        key=itemgetter(1),
        reverse=True,
    )
    return [item for item, score in ranked if score > 0.8]

query = 'Can you list all definitions in the context?'
distinct_docs = get_sorted_documents_without_neighbors(splits, query)

In [None]:
# Same prompt and chain as the custom-retrieval experiment, fed with the
# documents selected by the 0.8 similarity cut-off.
template = """Given the following mathematical context from the notes:
{context}

Question: {question}

Mathematical Response:
[Response here, using formal mathematical notation where appropriate]

Important Notes:
- All statements are made strictly based on the provided context
- Any mathematical notation used follows directly from the source material
- No additional mathematical facts or properties are assumed beyond what's given
- Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
- Do not miss out on critical information. Make sure to include this.

[If any crucial context appears to be missing for a complete answer, note that explicitly]
"""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

rag_chain = prompt | llm | StrOutputParser()

# The answer overwrites final.txt.
with open('final.txt', 'w', encoding = 'utf-8') as file:
    response = rag_chain.invoke({"context": distinct_docs,"question": query})
    file.write(response)

## RAG Tree-Based Parser + > Near-Maximum Cosine Similarity

In [None]:
from dataclasses import dataclass
from typing import List, Optional, Dict

class MathNode:
    """One node of the document tree built by ``MathDocumentParser``.

    Args:
        content: Text body of the node.
        node_type: Kind of node, e.g. "root", "chapter", "section", "theorem".
        identifier: Label such as "Chapter 1", "1.2" or "Theorem 3".
        parent: Owning node, or None.
        children: Initial child list; a new empty list is used when falsy.
    """

    def __init__(self, content, node_type, identifier, parent=None, children=None):
        self.content = content
        self.node_type = node_type
        self.identifier = identifier
        self.parent = parent
        self.children = children or []


class MathDocumentParser:
    """Line-based parser that turns maths notes into a tree of ``MathNode`` objects.

    Structure: root -> chapter -> section -> subsection, with statement nodes
    (theorem, lemma, proposition, definition, example, proof) attached to the
    current subsection, or to the current section when there is no subsection.
    """

    # Pattern keys that describe headings rather than statement-like nodes.
    _HEADING_TYPES = ('chapter', 'section', 'subsection')

    def __init__(self):
        # All patterns are applied with re.match, i.e. anchored at the line start.
        self.patterns = {
            'chapter': r'Chapter\s+(\d+)',
            'section': r'(\d+\.\d+)\s+([^\n]+)',
            'subsection': r'(\d+\.\d+\.\d+)\s+([^\n]+)',
            'theorem': r'Theorem\s+(\d+)',
            'lemma': r'Lemma\s+(\d+)',
            'proposition': r'Proposition\s+(\d+)',
            'definition': r'Definition\s+(\d+)',
            'example': r'Example\s+(\d+)',
            'proof': r'Proof(?:\s+of\s+(?:Theorem|Lemma|Proposition)\s+\d+)?'
        }
        self.root = None

    @staticmethod
    def _close(node, lines):
        """Store the accumulated ``lines`` as the stripped content of ``node`` (if any)."""
        if node is not None:
            node.content = '\n'.join(lines).strip()

    @staticmethod
    def _attach(parent, node_type, identifier):
        """Create an empty node under ``parent`` and return it."""
        node = MathNode(content="", node_type=node_type, identifier=identifier, parent=parent)
        parent.children.append(node)
        return node

    def _match_statement(self, line):
        """Return ``(content_type, match)`` for the first statement pattern matching ``line``, else None."""
        for content_type, pattern in self.patterns.items():
            if content_type in self._HEADING_TYPES:
                continue
            match = re.match(pattern, line)
            if match:
                return content_type, match
        return None

    def parse(self, text: str):
        """Parse ``text`` and store the resulting tree in ``self.root``.

        Heading lines (chapter/section/subsection) open a new node and are not
        kept as content. Lines that open a statement node ARE kept as the first
        line of that node's content: notes usually put the statement on the same
        line as its label ("Definition 1. A field is ..."), and dropping the line
        lost that text. A statement line seen before any section exists is
        treated as ordinary text of the current node.
        """
        self.root = MathNode("Document Root", "root", "root")
        current_chapter = None
        current_section = None
        current_subsection = None
        current_node = None
        current_content = []

        for line in text.split('\n'):
            chapter_match = re.match(self.patterns['chapter'], line)
            if chapter_match:
                self._close(current_node, current_content)
                current_chapter = self._attach(
                    self.root, "chapter", f"Chapter {chapter_match.group(1)}"
                )
                current_node = current_chapter
                current_section = None
                current_subsection = None
                current_content = []
                continue

            # Sections only exist inside a chapter.
            section_match = re.match(self.patterns['section'], line)
            if section_match and current_chapter is not None:
                self._close(current_node, current_content)
                current_section = self._attach(
                    current_chapter, "section", section_match.group(1)
                )
                current_node = current_section
                current_subsection = None
                current_content = []
                continue

            # Subsections only exist inside a section.
            subsection_match = re.match(self.patterns['subsection'], line)
            if subsection_match and current_section is not None:
                self._close(current_node, current_content)
                current_subsection = self._attach(
                    current_section, "subsection", subsection_match.group(1)
                )
                current_node = current_subsection
                current_content = []
                continue

            statement = self._match_statement(line)
            parent = current_subsection if current_subsection is not None else current_section
            if statement is not None and parent is not None:
                content_type, match = statement
                self._close(current_node, current_content)
                # The proof pattern has no capture group, so it gets a fixed label.
                if content_type == 'proof':
                    identifier = "Proof"
                else:
                    identifier = f"{content_type.capitalize()} {match.group(1)}"
                current_node = self._attach(parent, content_type, identifier)
                # Keep the opening line so same-line statement text is preserved.
                current_content = [line]
                continue

            # Ordinary text belongs to whichever node is currently open.
            if current_node is not None:
                current_content.append(line)

        self._close(current_node, current_content)

def print_tree(node: MathNode, level: int = 0):
    """Print *node* and all of its descendants as an indented outline.

    Each node is printed as ``├── <node_type>: <identifier>``, indented by two
    spaces per depth level (starting at *level*). When a node has non-empty
    content, a second line shows the first 50 characters of that content with
    newlines replaced by spaces, followed by ``...`` if the content is longer
    than 50 characters. Children are printed in list order, depth first.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (and printed) in their original order.
    pending = [(node, level)]
    while pending:
        current, depth = pending.pop()
        pad = "  " * depth
        print(f"{pad}├── {current.node_type}: {current.identifier}")

        body = current.content
        if body:
            preview = body.replace('\n', ' ')[:50]
            suffix = "..." if len(body) > 50 else ""
            print(f"{pad}│   Content: {preview + suffix}")

        pending.extend((child, depth + 1) for child in reversed(current.children))

# Parse the cleaned text into a tree of nodes and print the tree for inspection.
parser = MathDocumentParser()

# 'cleaned_output.txt' is the file written by the remove_running_headers(...) call
# elsewhere in this notebook, so that cell must have been run before this one.
with open('cleaned_output.txt', 'r', encoding='utf-8') as f:
    content = f.read()

parser.parse(content)
print("\nDocument Structure:")
print_tree(parser.root)

In [None]:
def get_leaf_contents(node):
    """Collect the content of every leaf below (and including) *node*.

    A leaf is a node whose ``children`` is empty. Each leaf with non-empty
    ``content`` becomes one ``Document`` whose ``page_content`` is that
    content. Content stored on nodes that have children is not included.

    Returns the documents in depth-first, left-to-right order.
    """
    gathered = []
    # Depth-first walk with an explicit stack; children are pushed reversed so
    # they are visited in their original order.
    stack = [node]
    while stack:
        current = stack.pop()
        if current.children:
            stack.extend(reversed(current.children))
        elif current.content:
            gathered.append(Document(page_content=current.content))
    return gathered

# One Document per non-empty leaf of the parsed tree; these are the units that get embedded and retrieved.
splits_leaves = get_leaf_contents(parser.root)

In [None]:
def get_sorted_documents_without_neighbors(documents, query):
    """Return the documents whose similarity to *query* is close to the best match.

    Every document and the query are embedded with ``OpenAIEmbeddings`` and
    ranked by cosine similarity. The cutoff is the best score rounded down to
    a multiple of 0.05; documents scoring strictly above it are returned, best
    first.

    Args:
        documents: Sequence of objects exposing ``page_content``.
        query: The query text.

    Returns:
        A list of the selected documents, best match first. Empty when
        *documents* is empty; otherwise it always contains the best match.
    """
    # Nothing to rank: avoid the embedding calls and the IndexError on ranked[0].
    if not documents:
        return []

    embd = OpenAIEmbeddings()
    # embed_documents sends the texts in batches instead of one API round-trip
    # per document.
    doc_embeddings = embd.embed_documents([doc.page_content for doc in documents])
    query_embedding = embd.embed_query(query)
    cos_sim = cosine_similarity([query_embedding], doc_embeddings)[0]

    ranked = sorted(zip(documents, cos_sim), key=lambda pair: pair[1], reverse=True)
    best_doc, best_score = ranked[0]
    threshold = math.floor(best_score / 0.05) * 0.05 # Might be able to answer more complex queries with > 0.05 at a greater cost
                                                     # Or do some routing based on the type of query (but this is decent)

    # The best match is kept unconditionally: if its score falls exactly on a
    # multiple of 0.05 the strict comparison below would otherwise drop it and
    # leave the caller with no context at all.
    selected_docs = [best_doc]
    for doc, score in ranked[1:]:
        if not score > threshold:
            break  # ranked is sorted, so nothing further can pass the cutoff
        selected_docs.append(doc)
    return selected_docs

# Example query: select the leaf documents whose similarity is near the best match.
query = 'list the axioms of the reals'
distinct_docs = get_sorted_documents_without_neighbors(splits_leaves, query)

In [None]:
# Prompt for the final answer: {context} receives the retrieved text, {question} the user query.
template = """Given the following mathematical context from the notes:
{context}

Question: {question}

Mathematical Response:
[Response here, using formal mathematical notation where appropriate]

Important Notes:
- All statements are made strictly based on the provided context
- Any mathematical notation used follows directly from the source material
- No additional mathematical facts or properties are assumed beyond what's given
- Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
- Do not miss out on critical information. Make sure to include this.

[If any crucial context appears to be missing for a complete answer, note that explicitly]
"""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

rag_chain = prompt | llm | StrOutputParser()

# Pass the plain text of the selected documents. Formatting the list of
# Document objects directly would insert their repr into the prompt, with
# newlines and backslashes escaped.
context_text = "\n\n".join(doc.page_content for doc in distinct_docs)

# Call the model before opening the file: opening with 'w' truncates
# final.txt, so a failed request would otherwise leave it empty.
response = rag_chain.invoke({"context": context_text, "question": query})

with open('final.txt', 'w', encoding = 'utf-8') as file:
    file.write(response)

In [None]:
# Just doing it manually for now
def remove_running_headers(input_file, output_file):
    """Copy *input_file* to *output_file*, leaving out running-header lines.

    A line is dropped when any of the following holds:
      * it starts (after optional whitespace) with ``CHAPTER`` followed by a number;
      * once stripped, it consists only of upper-case letters, digits,
        whitespace, dots and hyphens;
      * once stripped, it consists only of digits.
    All other lines, including blank ones, are written through unchanged.
    Both files are handled as UTF-8.
    """
    chapter_banner = re.compile(r'^\s*CHAPTER\s+\d+')
    # These two are tested against the stripped line.
    stripped_line_filters = (
        re.compile(r'^\s*[A-Z0-9\s\.\-]+$'),
        re.compile(r'^\s*\d+\s*$'),
    )

    def is_running_header(raw_line):
        if chapter_banner.match(raw_line):
            return True
        bare = raw_line.strip()
        return any(rx.match(bare) for rx in stripped_line_filters)

    with open(input_file, "r", encoding = 'utf-8') as source, open(output_file, "w", encoding = 'utf-8') as sink:
        sink.writelines(raw_line for raw_line in source if not is_running_header(raw_line))

# Writes 'cleaned_output.txt', the file that the parsing cell opens and feeds to parser.parse(...).
remove_running_headers("out_2.txt", "cleaned_output.txt")

## Multi-Query + RAG Tree-Based Parser + > Near-Maximum Cosine Similarity

In [None]:
def generate_sub_queries(query):
    """Ask the chat model to break *query* into sub-queries for multi-query retrieval.

    Args:
        query: The user query (or a larger text containing it) to decompose.

    Returns:
        A list of sub-query strings, one per non-blank line of the model's
        reply, with any leading list marker (``1.``, ``2)``, ``-``, ``*``,
        ``•``) removed. The prompt asks for 3 sub-queries but the model is not
        guaranteed to comply, so callers should check the length.
    """
    sub_query_prompt = f"""Break down this mathematical query into smaller, related queries that would help build a complete answer.
Query: {query}

Let's generate specific sub-queries to find:
1. The direct definition/axioms
2. Related properties mentioned elsewhere
3. Any additional conditions or properties that combine with these
4. Any special cases or extensions

I need specific sub-queries. For example, given a Query: List the axioms of the reals, you should return something similar to:
What are the field axioms that the reals satisfy?,
What are the ordering properties of the reals?,
What additional properties are mentioned about the real numbers?,
How are the reals defined in terms of fields and ordering?

Do not try to answer the Query at all. Just create sub-queries (so questions that help to build up a larger picture. I'm using you for a RAG tool and this is multi-query)
Also, when you generate sub-queries, do not use your own knowledge AT ALL. Just stick to sub-queries that generalize the query based on knowledge of English and NOT of maths. Remember, this is a RAG tool.
To re-iterate, don't use your maths knowledge AT ALL. 
Just don't use your own knowledge of the Query AT ALL.

Importantly, you are guiding this strategy. So as the master of this RAG tool, you should 3 choose sub-queries that will be able to gather information from disparate parts of the input file.

Just don't use your maths knowledge AT ALL. 
Just don't use your own knowledge of the Query AT ALL.


Generate 3 sub-queries:"""

    # Can probably optimise the prompt naturally

    client = OpenAI(
        api_key = os.environ['OPENAI_API_KEY']
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": sub_query_prompt,
            }
        ],
        model="gpt-4-1106-preview",
    )

    # message.content is optional in the API response; treat a missing reply as empty.
    qs = chat_completion.choices[0].message.content or ""

    sub_queries = []
    for line in qs.splitlines():
        # Remove only a leading list marker. Splitting on the first '. ' would
        # also cut the first sentence off an unnumbered multi-sentence query.
        cleaned = re.sub(r'^\s*(?:\d+[.)]|[-*•])\s+', '', line).strip()
        # Skip blank separator lines so they do not count as sub-queries.
        if cleaned:
            sub_queries.append(cleaned)
    return sub_queries


In [None]:
# Embed every leaf document once so that later cells can reuse the vectors
# instead of re-embedding the corpus for each query.
embd = OpenAIEmbeddings()
# embed_documents sends the texts in batches rather than issuing one API
# request per document; doc_embeddings[i] still corresponds to splits_leaves[i].
doc_embeddings = embd.embed_documents([doc.page_content for doc in splits_leaves])

In [None]:
# One list of selected documents per (sub-)query; filled by the loop further down in this cell.
distinct_docs = []

def get_sorted_documents_without_neighbors(documents, query):
    """Return the documents whose similarity to *query* is close to the best match.

    The query is embedded with the module-level ``embd`` and compared against
    the module-level ``doc_embeddings``; ``documents[i]`` is paired with the
    i-th similarity score, so *documents* has to line up with
    ``doc_embeddings``. The cutoff is ``(best score - 0.01)`` rounded down to a
    multiple of 0.05, and documents scoring strictly above it are returned,
    best first.
    """
    scores = cosine_similarity([embd.embed_query(query)], doc_embeddings)[0]
    ranked = sorted(
        ((doc, scores[position]) for position, doc in enumerate(documents)),
        key=lambda pair: pair[1],
        reverse=True,
    )

    top_score = ranked[0][1]
    # This works, but can vary it depending on the query
    # Or do some routing based on the type of query (but this is decent)
    cutoff = math.floor((top_score - 0.01) / 0.05) * 0.05

    kept = []
    for doc, score in ranked:
        if not score > cutoff:
            # Scores are in descending order, so no later document can pass.
            break
        kept.append(doc)
    return kept

query = 'prove the square root of 2 is irrational'

# Ask for sub-queries until exactly 3 usable ones come back, but give up after
# a fixed number of attempts: every attempt is a paid API call and the model
# is not guaranteed to ever return exactly 3 lines.
MAX_SUB_QUERY_ATTEMPTS = 5
for attempt in range(MAX_SUB_QUERY_ATTEMPTS):
    # Blank entries are dropped before counting so that empty separator lines
    # in the reply neither force a retry nor get embedded as queries.
    sub_queries = [sq for sq in generate_sub_queries(query) if sq.strip()]
    if len(sub_queries) == 3:
        break

# Retrieve for the original query and for each sub-query.
for sq in [query] + sub_queries:
    distinct_docs.append(get_sorted_documents_without_neighbors(splits_leaves, sq))
    print("Processed a sub-query")

# Merge the per-query results, keeping the first occurrence of each distinct text.
seen_content = set()
unique_docs = []
for sublist in distinct_docs:
    for doc in sublist:
        if doc.page_content not in seen_content:
            seen_content.add(doc.page_content)
            unique_docs.append(doc)

In [None]:
# Prompt for the final answer: {context} receives the retrieved text, {question} the user query.
template = """Given the following mathematical context from the notes:
{context}

Question: {question}

Mathematical Response:
[Response here, using formal mathematical notation where appropriate]

Important Notes:
- All statements are made strictly based on the provided context
- Any mathematical notation used follows directly from the source material
- No additional mathematical facts or properties are assumed beyond what's given
- Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
- Do not miss out on critical information. Make sure to include this.

[If any crucial context appears to be missing for a complete answer, note that explicitly]
"""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

rag_chain = prompt | llm | StrOutputParser()

# Pass the plain text of the de-duplicated documents. Formatting the list of
# Document objects directly would insert their repr into the prompt, with
# newlines and backslashes escaped.
context_text = "\n\n".join(doc.page_content for doc in unique_docs)

# Call the model before opening the file: opening with 'w' truncates
# final.txt, so a failed request would otherwise leave it empty.
response = rag_chain.invoke({"context": context_text, "question": query})

with open('final.txt', 'w', encoding = 'utf-8') as file:
    file.write(response)

## Multi-Query + RAG Tree-Based Parser + Normal Retriever (Trying to Get Speed-Ups - Never Mind)

In [None]:
# Index the leaf documents in a Chroma vector store, embedded with OpenAIEmbeddings.
vectorstore = Chroma.from_documents(documents=splits_leaves, 
                                    embedding=OpenAIEmbeddings())

# Retriever over that store, configured with k=7 in search_kwargs.
retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

In [None]:
# Prompt that asks the model to decompose {question} into sub-queries for multi-query retrieval.
sub_query_prompt = """Break down this mathematical query into smaller, related queries that would help build a complete answer.
Question: {question}

Let's generate specific sub-queries to find:
1. The direct definition/axioms
2. Related properties mentioned elsewhere
3. Any additional conditions or properties that combine with these
4. Any special cases or extensions

I need specific sub-queries. For example, given a Query: List the axioms of the reals, you should return something similar to:
What are the field axioms that the reals satisfy?,
What are the ordering properties of the reals?,
What additional properties are mentioned about the real numbers?,
How are the reals defined in terms of fields and ordering?

Do not try to answer the Question at all. Just create sub-queries (so questions that help to build up a larger picture. I'm using you for a RAG tool and this is multi-query)
Also, when you generate sub-queries, do not use your own knowledge AT ALL. Just stick to sub-queries that generalize the query based on knowledge of English and NOT of maths. Remember, this is a RAG tool.
To re-iterate, don't use your maths knowledge AT ALL. 
Make sure to not use your maths knowledge AT ALL. 
Please don't use your maths knowledge AT ALL. 
Just don't use your maths knowledge AT ALL. 
Just don't use your own knowledge of the Question AT ALL.

Importantly, you are guiding this strategy. So as the master of this RAG tool, you should 3 choose sub-queries that will be able to gather information from disparate parts of the input file.

Just don't use your maths knowledge AT ALL. 
Just don't use your own knowledge of the Question AT ALL.


Generate 3 sub-queries:"""

prompt_perspectives = ChatPromptTemplate.from_template(sub_query_prompt)

# Chain: prompt -> chat model -> plain string -> list of sub-queries (one per line).
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)
    | StrOutputParser()
    # Drop blank lines: the model often separates items with empty lines, and
    # each entry of this list is sent to the retriever as its own query.
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [None]:
def get_unique_union(documents: list[list]):
    """Flatten per-query retrieval results into one list without duplicates.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        The distinct documents in first-seen order. Two documents count as the
        same when their ``dumps`` serializations are equal.
    """
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while preserving insertion order. A set would
    # hand the documents back in an arbitrary order that can differ between
    # runs, which makes the prompt context (and so the answer) irreproducible.
    return [loads(blob) for blob in dict.fromkeys(serialized)]

question = "list every single definition in the context comprehensively"
# Pipeline: generate sub-queries -> run the retriever on each one -> de-duplicated union of the results.
retrieval_chain = generate_queries | retriever.map() | get_unique_union

In [None]:
# Prompt for the final answer: {context} receives the retrieved text, {question} the user query.
template = """Given the following mathematical context from the notes:
{context}

Question: {question}

Mathematical Response:
[Response here, using formal mathematical notation where appropriate]

Important Notes:
- All statements are made strictly based on the provided context
- Any mathematical notation used follows directly from the source material
- No additional mathematical facts or properties are assumed beyond what's given
- Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
- Do not miss out on critical information. Make sure to include this.
- If something is obviously not relevant to the Question, don't include it - if it is OBVIOUSLY not relevant and you can intuitively do without it

[If any crucial context appears to be missing for a complete answer, note that explicitly]
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

final_rag_chain = (
    # Join the retrieved documents' text before it reaches the prompt.
    # Formatting the list of Document objects directly would insert their repr
    # (escaped newlines and backslashes) into the prompt.
    {"context": retrieval_chain | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

# Run the chain before opening the file: opening with 'w' truncates final.txt,
# so a failed request would otherwise leave it empty.
response = final_rag_chain.invoke({"question": question})

with open('final.txt', 'w', encoding = 'utf-8') as file:
    file.write(response)

## Helper Functions For Example Routing

In [None]:
def classify_math_query(query: str) -> str:
    """Classify *query* as a request for a proof or as anything else.

    The chat model is asked to answer with a single label. Its reply is
    normalised so that callers can compare the result with ``==``.

    Args:
        query: The user's query text.

    Returns:
        Exactly ``'proof'`` or ``'other'``. When the reply is not a bare label,
        the last label word it mentions is used; when it mentions neither,
        ``'other'`` is returned.
    """
    
    classification_prompt = f"""Determine if this mathematical query is asking for a proof or something else.

    Think step by step:
    1. Is this asking for a proof/demonstration of why something is true?
    2. Or is it asking for something else (definition, explanation, etc.)?
    
    Query: {query}
    
    Return ONLY 'proof' or 'other'."""

    client = OpenAI(
        api_key = os.environ['OPENAI_API_KEY']
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": classification_prompt,
            }
        ],
        model="gpt-4-1106-preview",
        # Same setting as the other model calls in this notebook, so the
        # routing decision is as repeatable as possible.
        temperature=0,
    )

    # message.content is optional in the API response; treat a missing reply as empty.
    raw_answer = (chat_completion.choices[0].message.content or "").strip().lower()

    # The model sometimes wraps the label in quotes or adds a full stop, which
    # would make an exact comparison in the caller fail.
    bare_label = raw_answer.strip(" \t\r\n'\"`.")
    if bare_label in ("proof", "other"):
        return bare_label

    # Reply contains reasoning despite the instruction: use the final verdict.
    mentioned = re.findall(r"\b(proof|other)\b", raw_answer)
    return mentioned[-1] if mentioned else "other"

def get_top_n_distinct(documents, query, n):
    """Return the *n* documents most similar to *query*, best first.

    The query is embedded with the module-level ``embd`` and compared against
    the module-level ``doc_embeddings`` by cosine similarity; ``documents[i]``
    is paired with the i-th score, so *documents* has to line up with
    ``doc_embeddings``. No de-duplication is performed.
    """
    scores = cosine_similarity([embd.embed_query(query)], doc_embeddings)[0]
    scored = [(scores[position], doc) for position, doc in enumerate(documents)]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in scored[:n]]

## Simple ChatBot (Uses Multi-Query + RAG Tree-Based Parser + Routing Example + > Near-Maximum Cosine Similarity + Keeps History of Last 2 Messages)

In [None]:
# Prompt for the chatbot answer: {context} is the retrieved text, {question} the
# current query and {q_a} the recent question/response history.
template = """Given the following mathematical context from the notes:
    {context}
    
    Question To Answer Now: {question}

    Previous Question:Response Pairs in Order from Least Recent to Most Recent (You are a ChatBot, so these are the previous Questions asked of you, and the Responses you gave to each):{q_a}
    
    Mathematical Response:
    [Response here, using formal mathematical notation where appropriate]
    
    Important Notes:
    - All statements are made strictly based on the provided context
    - Any mathematical notation used follows directly from the source material
    - No additional mathematical facts or properties are assumed beyond what's given
    - Where multiple pieces of context overlap, they may be unified into a coherent response while preserving mathematical rigor. Only do this if it is logical and is appropriate/makes sense.
    - Do not miss out on critical information. Make sure to include this.
    - You may or may not need to incorporate the previous Questions and Responses. Do this when you think it is suitable.
    
    [If any crucial context appears to be missing for a complete answer, note that explicitly]
    """

# The prompt, model and chain do not depend on the query, so build them once
# instead of on every turn.
prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)
rag_chain = prompt | llm | StrOutputParser()

# Upper bound on sub-query generation attempts per turn: each attempt is a paid
# API call and the model is not guaranteed to ever return exactly 3 lines.
MAX_SUB_QUERY_ATTEMPTS = 5

# History of the last two turns, oldest first.
q_a_list = []

while True:
    # End the session on a blank query, end-of-input or an interrupt instead of
    # looping forever.
    try:
        query = input('Query: ')
    except (EOFError, KeyboardInterrupt):
        break
    if not query.strip():
        break

    distinct_docs = []
    q_a = 'No Previous Questions and Responses'
    if q_a_list:
        q_a = '\n' + ''.join(q_a_list)
    query_template = f"""The Current Query is: {query}.
    The previous Questions and Responses in order from Least Recent to Most Recent (You are a helper tool to a Chatbot, so these are the previous Questions asked of it, and the Responses it gave to each):{q_a}
    """
    query_type = classify_math_query(query)
    if query_type == 'other':
        # Multi-query route: retrieve for the query and for each sub-query.
        for attempt in range(MAX_SUB_QUERY_ATTEMPTS):
            # Blank entries are dropped before counting so that empty lines in
            # the reply neither force a retry nor get embedded as queries.
            sub_queries = [sq for sq in generate_sub_queries(query_template) if sq.strip()]
            if len(sub_queries) == 3:
                break

        for sq in [query] + sub_queries:
            distinct_docs.append(get_sorted_documents_without_neighbors(splits_leaves, sq))

        # Merge the per-query results, keeping the first occurrence of each distinct text.
        seen_content = set()
        unique_docs = []
        for sublist in distinct_docs:
            for doc in sublist:
                if doc.page_content not in seen_content:
                    seen_content.add(doc.page_content)
                    unique_docs.append(doc)
    else:
        # Any other classification is treated as a proof request: use only the single best match.
        unique_docs = get_top_n_distinct(splits_leaves, query, 1)

    # Pass plain text; formatting the Document list directly would insert its
    # repr (escaped newlines and backslashes) into the prompt.
    context_text = "\n\n".join(doc.page_content for doc in unique_docs)

    # Call the model before opening the file: opening with 'w' truncates
    # final.txt, so a failed request would otherwise leave it empty.
    response = rag_chain.invoke({"context": context_text, "question": query, "q_a": q_a})
    with open('final.txt', 'w', encoding = 'utf-8') as file:
        file.write(response)

    # Record this turn and keep only the two most recent ones.
    q_a_list.append('Question: ' + query + ', Response: ' + response + '\n')
    del q_a_list[:-2]