In [2]:
import os
from dotenv import load_dotenv
import pickle
from typing import Optional, Union
import textwrap
import requests
import io

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import SystemMessage 
from langchain.embeddings.base import Embeddings

# Read key=value pairs from a local .env file into the process environment.
# Later cells look up PDF_URL and PDF_FILE_NAME via os.getenv; the OpenAI
# clients presumably pick up their API key the same way — TODO confirm.
load_dotenv()

True

In [3]:
def download_pdf(url, timeout=5):
    '''
    Download the resource at `url` into an in-memory buffer.

    Args:
        url: Address of the file to fetch. A missing/empty value is treated
            as a failed download.
        timeout: Value forwarded to requests.get(timeout=...).

    Returns:
        An io.BytesIO positioned at offset 0 holding the response body,
        or None if the download failed.
    '''
    if not url:
        print("Error downloading file: no URL provided")
        return None

    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            file = io.BytesIO()
            # Stream in fixed-size chunks so the response is never held twice in memory.
            for chunk in r.iter_content(chunk_size=8192):
                file.write(chunk)
        file.seek(0)
        return file
    except requests.RequestException as exc:
        # Only network/HTTP failures are treated as "download failed";
        # KeyboardInterrupt and programming errors still propagate.
        print(f"Error downloading file: {exc}")
        return None

# Download the PDF file if it doesn't exist
url = os.getenv("PDF_URL")
pdf_dir = os.path.join(os.getcwd(), "pdfs")
# str(None) would produce a file literally called "None.pdf"; fall back to the
# document name the embeddings cell uses when PDF_FILE_NAME is unset.
file_name = os.path.join(pdf_dir, (os.getenv("PDF_FILE_NAME") or "harry_potter") + ".pdf")
if os.path.exists(file_name):
    print("PDF already downloaded")
elif not url:
    print("PDF_URL is not set; skipping download")
else:
    print("Downloading PDF")
    file = download_pdf(url)
    if file is not None:
        # The target directory may not exist on a fresh checkout.
        os.makedirs(pdf_dir, exist_ok=True)
        with open(file_name, "wb") as f:
            f.write(file.read())


PDF already downloaded


In [4]:
def compute_embeddings(file_name: str, embeddings: Embeddings, chunk_size:int = 1000, chunk_overlap: int = 100) -> FAISS:
    '''
    Split a PDF into chunks, build a FAISS index from them and cache it on disk.

    Args:
        file_name: Document stem (no directory, no extension). The PDF is read
            from <cwd>/pdfs/<file_name>.pdf and the index is pickled to
            <cwd>/embeddings/<file_name>.pkl.
        embeddings: Embedding model handed to FAISS.from_documents.
        chunk_size: chunk_size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: chunk_overlap passed to RecursiveCharacterTextSplitter.

    Returns:
        The newly built FAISS vector store.
    '''
    base_dir = os.getcwd()
    loader = PyPDFLoader(os.path.join(base_dir, "pdfs", file_name + ".pdf"))
    doc = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(doc)

    print(f"Number of documents: {len(docs)}")

    # Compute embeddings
    docsearch = FAISS.from_documents(docs, embeddings)

    # Save embeddings to file. Create the cache directory first: failing on
    # open() here would throw away the index that was just computed.
    embeddings_dir = os.path.join(base_dir, "embeddings")
    os.makedirs(embeddings_dir, exist_ok=True)
    full_path_embeddings = os.path.join(embeddings_dir, file_name + ".pkl")
    with open(full_path_embeddings, 'wb') as f:
        pickle.dump(docsearch, f)
    print("Embeddings computed and saved to file")
    return docsearch

def load_saved_embeddings(file_name: str) -> Optional["FAISS"]:
    '''
    Load a previously pickled vector store from <cwd>/embeddings/<file_name>.pkl.

    Args:
        file_name: Document stem (no directory, no extension).

    Returns:
        The unpickled object, or None when the file does not exist or
        cannot be unpickled.
    '''
    full_path_embeddings = os.path.join(os.getcwd(), "embeddings", file_name + ".pkl")
    if not os.path.exists(full_path_embeddings):
        print("No embeddings file found.")
        return None

    try:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that this notebook wrote itself.
        with open(full_path_embeddings, 'rb') as f:
            docsearch = pickle.load(f)
    except Exception as exc:
        # A corrupt or incompatible cache is not fatal: the caller recomputes.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        print(f"Failed to load embeddings from file: {exc!r}")
        return None

    print("Loaded embeddings from file")
    return docsearch


# Compute or load embeddings
embeddings = OpenAIEmbeddings() # type: ignore
# Use the same document name as the download cell (PDF_FILE_NAME), so the
# cached index always belongs to the PDF that was fetched; keep the previous
# hard-coded name as the fallback when the variable is unset.
doc_name = os.getenv("PDF_FILE_NAME") or "harry_potter"
docsearch = load_saved_embeddings(doc_name)
if docsearch is None:
    docsearch = compute_embeddings(doc_name, embeddings, chunk_size=1000, chunk_overlap=100)


Loaded embeddings from file


In [5]:
async def answer_question(question: str, chat: ChatOpenAI, 
                          relevant_information: Optional[str] = None, 
                          docsearch: Optional[FAISS] = None, 
                          similar_doc_count: int = 10) -> str:
    '''
    Ask the chat model to answer `question` using only the supplied context.

    The context is `relevant_information` when given; otherwise it is built by
    joining the page contents of the `similar_doc_count` documents returned by
    `docsearch.asimilarity_search(question)`.

    Args:
        question: The question to answer.
        chat: Chat model that receives the prompt.
        relevant_information: Context text. Takes precedence over `docsearch`.
        docsearch: Vector store searched for context when no text is given.
        similar_doc_count: Number of documents requested from the search (k).

    Returns:
        The text of the model's reply.

    Raises:
        ValueError: If neither `relevant_information` nor `docsearch` is given.
    '''
    # Notifying user that given relevant information will be used
    if relevant_information is not None:
        print("Using provided relevant information instead of embeddings")

    # Use embeddings to find relevant information if relevant information is not provided
    elif docsearch is not None:
        print("Using provided embeddings to find relevant information")
        similar_docs = await docsearch.asimilarity_search(question, k=similar_doc_count)
        relevant_information = "\n".join(doc.page_content for doc in similar_docs)

    # Relevant information is not provided, then raise error
    if relevant_information is None:
        raise ValueError("No relevant information generated. Either provide relevant information or docsearch (FAISS object)")

    print(f"Relevant information length: {len(relevant_information)}")

    PROMPT = f"""
You will answer a question based of the given information below. If the relevant information
does not answer the question, then answer only Can not answer from provided text.

Question: {question}

Relevant Information: {relevant_information} 
"""

    # Use the async API: calling chat([...]) directly would block the event
    # loop inside this coroutine. agenerate takes a batch of message lists.
    result = await chat.agenerate([[SystemMessage(content=PROMPT)]])
    return result.generations[0][0].text


In [6]:
# Chat model shared by the question cells; temperature is pinned to 0.
# Alternative model name: "gpt-3.5-turbo"
MODEL = "gpt-4"
chat = ChatOpenAI(model_name=MODEL, temperature=0) # type: ignore

In [39]:
QUESTION = "What is the name of the main character in Harry Potter?"
relevant_text = "Harry Potter is the main character of the Harry Potter series."
answer = await answer_question(QUESTION, chat, relevant_information=relevant_text, docsearch=docsearch, similar_doc_count=10)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")

Using provided relevant information instead of embeddings
Relevant information length: 62


Answer: The name of the main character in Harry Potter is
Harry Potter.


In [40]:
QUESTION = "What is the name of the main character in Harry Potter?"
irrelevant_text = "This is a test."
answer = await answer_question(QUESTION, chat, relevant_information=irrelevant_text, docsearch=docsearch, similar_doc_count=10)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")

Using provided relevant information instead of embeddings
Relevant information length: 15


Answer: Cannot answer from provided text.


In [43]:
QUESTION = "Can you tell me all the horcruxes from Harry Potter?"
answer = await answer_question(QUESTION, chat, docsearch=docsearch, similar_doc_count=10)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")


Using provided embeddings to find relevant information
Relevant information length: 8615


Answer: I cannot answer from the provided text.


In [44]:
QUESTION = "Can you tell about some of the horcruxes?"
answer = await answer_question(QUESTION, chat, docsearch=docsearch, similar_doc_count=10)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")

Using provided embeddings to find relevant information
Relevant information length: 9625


Answer: Some of the Horcruxes include Tom Riddle's diary,
which was proof that he was the Heir of Slytherin,
a locket that belonged to Hufflepuff, and a ring.
It is also believed that Voldemort may have used
the snake, Nagini, as a Horcrux. Dumbledore
destroyed the ring, and Harry destroyed the diary.
There are a total of six Horcruxes, with the
seventh part of Voldemort's soul residing in his
regenerated body.


In [45]:
QUESTION = "can you give me all the things that happen to the flying car"
answer = await answer_question(QUESTION, chat, docsearch=docsearch, similar_doc_count=10)
print(f"\n\nAnswer: {textwrap.fill(answer, 50)}")

Using provided embeddings to find relevant information
Relevant information length: 8451


Answer: Some things that happen to the flying car include:
skimming the sea of fluffy clouds, traveling past
swirls and turrets of snowy clouds, making regular
checks on the train as they flew farther north,
having a faulty Invisibility Booster, losing
altitude and nearly crashing into a castle wall,
and eventually stopping suddenly at the edge of a
forest.
