In [None]:
import os
from dotenv import load_dotenv
import pickle
from typing import Optional, Union, List, Tuple
from pypdf import PdfReader
import textwrap
import requests
import io

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import SystemMessage 
from langchain.embeddings.base import Embeddings
from langchain.schema import Document
from langchain import PromptTemplate

load_dotenv()

In [None]:
def download_pdf(url: Optional[str]) -> Optional[io.BytesIO]:
    '''
    Download a file into memory.

    The response body is streamed in 8192-byte chunks into an in-memory
    buffer; the request is issued with ``timeout=5``.

    Args:
        url: Address of the file to fetch. A missing or empty value (for
            example an unset environment variable) counts as a failed download.

    Returns:
        A BytesIO rewound to the start of the downloaded content, or None
        when no URL was given or the request failed.
    '''
    if not url:
        print("Error downloading file: no URL provided")
        return None

    try:
        with requests.get(url, stream=True, timeout=5) as r:
            r.raise_for_status()
            file = io.BytesIO()
            for chunk in r.iter_content(chunk_size=8192):
                file.write(chunk)
    except requests.RequestException as exc:
        # Only network/HTTP failures are expected here. Anything else
        # (KeyboardInterrupt, programming errors) should surface normally
        # instead of being hidden behind a generic message.
        print(f"Error downloading file: {exc}")
        return None

    # Rewind so the caller can read the content from the beginning.
    file.seek(0)
    return file

# Download the PDF file if it doesn't exist
url = os.getenv("PDF_URL")
file_name = os.getcwd() + "/pdfs/" + str(os.getenv("PDF_FILE_NAME"))
if not os.path.exists(file_name):
    print("Downloading PDF")
    file = download_pdf(url)
    if file is not None:
        # The target folder may not exist yet (e.g. on a fresh checkout);
        # open() would raise FileNotFoundError without it.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "wb") as f:
            f.write(file.read())
else:
    print("PDF already downloaded")


In [None]:
def doc_splitter_recursive(full_path: str, chunk_size:int = 1000, chunk_overlap: int = 100) -> List[Document]:
    '''
    Load the PDF at ``full_path`` and break it into chunked documents.

    The file is read with PyPDFLoader, and the loaded documents are handed to a
    RecursiveCharacterTextSplitter built with ``chunk_size`` and ``chunk_overlap``.
    The splitter's output is returned unchanged.
    '''
    loaded = PyPDFLoader(full_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(loaded)

In [None]:
def compute_embeddings(docs: List[Document], embeddings: Embeddings, save_location: str) -> FAISS:
    '''
    Build a FAISS index from documents and pickle it to disk.

    Args:
        docs: Documents to index.
        embeddings: Embeddings implementation passed to ``FAISS.from_documents``.
        save_location: Path of the pickle file to write. Its parent directory
            is created if it does not exist.

    Returns:
        The FAISS object that was built (the same object that was pickled).
    '''
    # Create the target folder *before* embedding: if the path is unusable we
    # want to find out up front, not after the embeddings have been computed
    # and then lost because open() failed.
    target_dir = os.path.dirname(save_location)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Compute embeddings
    docsearch = FAISS.from_documents(docs, embeddings)
    # Save embeddings to file
    with open(save_location, 'wb') as f:
        pickle.dump(docsearch, f)
    print("Embeddings computed and saved to file")
    return docsearch

In [None]:
async def answer_question(question: str, chat: ChatOpenAI, 
                          prompt_template: PromptTemplate,
                          relevant_information: Optional[str] = None, 
                          docsearch: Optional[FAISS] = None, 
                          similar_doc_count: int = 10,
                          verbose: bool = False) -> Tuple[str, List[Document]]:
    '''
    Answer a question with a chat model, grounded on supporting text.

    The supporting text is either passed in directly or retrieved from a
    vector store. When both are given, ``relevant_information`` wins and the
    vector store is not queried.

    Args:
        question: The question to answer.
        chat: Chat model that receives the formatted prompt as a single system message.
        prompt_template: Template with ``question`` and ``relevant_information`` placeholders.
        relevant_information: Supporting text to use as-is.
        docsearch: Vector store searched for supporting text when
            ``relevant_information`` is not provided.
        similar_doc_count: Number of documents to request from ``docsearch``.
        verbose: Print which source of information is used and its length.

    Returns:
        A tuple of the model's answer text and the documents retrieved from
        ``docsearch`` (an empty list when ``relevant_information`` was supplied).

    Raises:
        ValueError: If neither ``relevant_information`` nor ``docsearch`` is given.
    '''
    similar_docs = []
    # Notifying user that given relevant information will be used
    if relevant_information is not None:
        if verbose:
            print("Using provided relevant information instead of embeddings")

    # Use embeddings to find relevant information if relevant information is not provided
    elif docsearch is not None:
        if verbose:
            print("Using provided embeddings to find relevant information")
        similar_docs = await docsearch.asimilarity_search(question, k=similar_doc_count)
        relevant_information = "\n".join([doc.page_content for doc in similar_docs])

    # Relevant information is not provided, then raise error
    if relevant_information is None:
        raise ValueError("No relevant information generated. Either provide relevant information or docsearch (FAISS object)")

    if verbose:
        print(f"Relevant information length: {len(relevant_information)}")

    formatted_prompt = prompt_template.format(question=question, relevant_information=relevant_information)

    # Use the model's async entry point: a plain chat(...) call would block the
    # event loop for the whole request even though this is a coroutine.
    # agenerate takes a batch of message lists; we send one and read its first generation.
    result = await chat.agenerate([[SystemMessage(content=formatted_prompt)]])
    answer = result.generations[0][0].message
    return answer.content, similar_docs


In [None]:
# Model selection: swap in the cheaper alternative below when needed.
# MODEL = "gpt-3.5-turbo"
MODEL = "gpt-4"
chat = ChatOpenAI(temperature=0, model_name=MODEL) # type: ignore
embeddings = OpenAIEmbeddings() # type: ignore
pdf_path_page = os.getcwd() + "/pdfs/" + os.getenv("PDF_FILE_NAME") # type: ignore

# Vector store built from the recursively split chunks: computed once, then
# re-used from its pickle file on later runs.
embeddings_recursive = os.getcwd() + "/embeddings/" + os.getenv("EMBEDDINGS_BY_CHUNK_FILE_NAME") # type: ignore
if not os.path.exists(embeddings_recursive):
    print("Embeddings file does not exist for recursive splitter")
    # split the PDF and embed the chunks (this also writes the pickle file)
    docs = doc_splitter_recursive(pdf_path_page, chunk_size=1000, chunk_overlap=100)
    docsearch_recursive = compute_embeddings(docs, embeddings, embeddings_recursive)
else:
    print("Embeddings file already exists for recursive splitter")
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this notebook produced itself.
    with open(embeddings_recursive, 'rb') as f:
        docsearch_recursive = pickle.load(f)

In [None]:
# Permissive prompt: the model is given the information but is not told to
# restrict itself to it.
simple_answer_prompt = PromptTemplate.from_template("""
Answer the question below. Some information is provided. 

Question: {question}

Relevant Information: {relevant_information}
""")

# Restrictive prompt: the model is told to rely on the given information and
# to reply "Not sure" when that information does not answer the question.
restrictive_answer_prompt = PromptTemplate.from_template("""
You will answer a question based on the given information below. If the relevant information
does not answer the question, then just say "Not sure".

Question: {question}

Relevant Information: {relevant_information} 
""")

### Simple example

In this example the simple and restrictive prompts should produce similar answers, because the provided information is very specific.

In [None]:
QUESTION = "What is the name of the main character in Harry Potter?"
relevant_text = "Harry Potter is the main character of the Harry Potter series."
answer, _ = await answer_question(QUESTION, chat, simple_answer_prompt, relevant_information=relevant_text)
print(f"Answer: {textwrap.fill(answer, 50)}")
answer, _ = await answer_question(QUESTION, chat, restrictive_answer_prompt, relevant_information=relevant_text)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")

### Restrictive vs Simple Prompt

Here we should see that the restrictive prompt is unable to answer the question, because the provided information says nothing about it. The simple prompt, in contrast, will still give an answer, but one drawn from the model's training data rather than from the provided information.

In [None]:
QUESTION = "What is the name of the main character in Harry Potter?"
irrelevant_text = "This is a test."
answer, _ = await answer_question(QUESTION, chat, simple_answer_prompt, relevant_information=irrelevant_text)
print(f"Answer: {textwrap.fill(answer, 50)}")
answer, _ = await answer_question(QUESTION, chat, restrictive_answer_prompt, relevant_information=irrelevant_text)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")

### Example using Restrictive Prompt and Irrelevant or Relevant Information

**Question**: Were the house points used much?

This question refers to a difference between the books and the movies. The movies do not use the house point system much, whereas the books rely on it far more. More details can be found [here](https://harrypotter.fandom.com/wiki/List_of_differences_between_the_Harry_Potter_books_and_films#1._House_points).

**Non-Restrictive Prompt with Irrelevant Information:** 
The answer, in this case, may not be reliable as it lacks evidence or related context.

**Restrictive Prompt with Irrelevant Information:** 
The system should not attempt to answer the question due to the absence of pertinent data.

**Restrictive Prompt with Relevant Information:** 
The system is expected to answer the question accurately due to the provision of context-specific data from a relevant source.



In [None]:
QUESTION = "Were the house points used much?"
irrelevant_text = "This is a test."
print("Non restrictive prompt with irrelevant information")
answer, _ = await answer_question(QUESTION, chat, simple_answer_prompt, relevant_information=irrelevant_text)
print(f"Answer: {textwrap.fill(answer, 50)}")
print("\n\nRestrictive prompt with irrelevant information")
answer, _ = await answer_question(QUESTION, chat, restrictive_answer_prompt, relevant_information=irrelevant_text)
print(f"Answer: {textwrap.fill(answer, 50)}")
print("\n\nNon restrictive prompt with relevant information")
answer, _ = await answer_question(QUESTION, chat, restrictive_answer_prompt, docsearch=docsearch_recursive, similar_doc_count=12, verbose=True)
print(f"Answer: {textwrap.fill(answer, 50)}")

### All vs Some

Here is a cool example of how much the wording of a question matters. The first question asks for *all* of the horcruxes, whereas the second asks for *some*. The answers are very different: the model should recognize that it cannot reliably list all of them, but with some relevant information it can definitely describe some.

In [None]:
QUESTION = "Can you tell me all the horcruxes from Harry Potter?"
answer, _ = await answer_question(QUESTION, chat, restrictive_answer_prompt, docsearch=docsearch_recursive, similar_doc_count=10)
print(f"Answer: {textwrap.fill(answer, 50)}")
QUESTION = "Can you tell about some of the horcruxes?"
answer, _ = await answer_question(QUESTION, chat, restrictive_answer_prompt, docsearch=docsearch_recursive, similar_doc_count=10)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")

# Importance of Metadata in Document Processing with Generative AI

In the era of generative AI models, the significance of metadata in document processing has been further amplified. Metadata encapsulates essential information about data, making it an invaluable asset for maintaining the integrity and transparency of AI operations. 

## Traceability and Verification

Metadata serves as a roadmap, providing crucial details about the source of the information. For generative AI, which is capable of producing large volumes of text, it is vital to keep track of the original sources. The reasons are multifold:

1. **Avoiding Misinformation**: Metadata can help ensure the AI model doesn't "hallucinate," or generate information that isn't supported by the input data. By keeping track of the source information, we can verify the generated output against the original context.

2. **Double-checking Facts**: For sensitive applications such as scientific research or engineering projects, it's crucial to confirm the accuracy of generated information. Metadata allows users to trace back to the original sources, facilitating the verification process.

3. **Citation and Accountability**: Accurate metadata allows for proper citation of the information's source. This not only fulfills academic and professional standards but also promotes accountability and fairness in knowledge sharing.

## Metadata: The Silent Workhorse

The versatility of metadata allows it to store a wide range of information, from links and page numbers to author names and text sections. However, its significance transcends these particulars. 

In the vast, interconnected data ecosystems where generative AI operates, metadata functions as the connective tissue, bridging the gap between massive data volumes and precise, reliable outcomes. 

In summary, the use of metadata is instrumental in preserving the reliability, traceability, and accountability of generative AI systems. By maintaining a clear line of sight to source information, metadata acts as a robust tool for users to verify facts and ensure the integrity of their work.


In [None]:
def doc_splitter_by_page(file: io.BytesIO, metadata: dict) -> List[Document]:
    '''
    Turn each page of a PDF into its own Document.

    Every Document holds the text extracted from one page. Its metadata is a
    copy of ``metadata`` with a ``"page"`` entry set to the page's index as
    produced by ``enumerate`` (so the first page is 0).
    '''
    page_docs = []
    for index, pdf_page in enumerate(PdfReader(file).pages):
        # Copy so each Document gets its own dict; "page" overrides any
        # existing entry of the same name.
        page_metadata = dict(metadata)
        page_metadata["page"] = index
        page_docs.append(
            Document(page_content=pdf_page.extract_text(), metadata=page_metadata)
        )
    return page_docs


# Vector store built from whole pages: computed once, then re-used from its
# pickle file on later runs.
embeddings_path_page = os.getcwd() + "/embeddings/" + os.getenv("EMBEDDINGS_BY_PAGE_FILE_NAME") # type: ignore
if not os.path.exists(embeddings_path_page):
    print("Embeddings file does not exist for page splitter")
    # read the whole PDF into an in-memory buffer
    with open(pdf_path_page, 'rb') as f:
        file = io.BytesIO(f.read())
    # one document per page, each tagged with the PDF's source URL, then embed
    docs = doc_splitter_by_page(file, { "source": os.getenv("PDF_URL")})
    docsearch_page = compute_embeddings(docs, embeddings, embeddings_path_page)
else:
    print("Embeddings file already exists for page splitter")
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this notebook produced itself.
    with open(embeddings_path_page, 'rb') as f:
        docsearch_page = pickle.load(f)

In [None]:
QUESTION = "can you give me the things that happen to the flying car"
answer, _ = await answer_question(QUESTION, chat, restrictive_answer_prompt, docsearch=docsearch_recursive, similar_doc_count=10)
print(f"Answer: {textwrap.fill(answer, 50)}")
answer, similar_docs = await answer_question(QUESTION, chat, restrictive_answer_prompt, docsearch=docsearch_page, similar_doc_count=10)
print(f"\nAnswer: {textwrap.fill(answer, 50)}")

print("\nSources:")
for doc in similar_docs:
    print(f"Source: {doc.metadata['source']}")
    print(f"Page: {doc.metadata['page']}")