In [1]:
import json
import os
import shutil
import tempfile

#import packages needed for openai llm
# import openai
from langchain.llms import OpenAI
from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

In [2]:
#read the Hugging Face Hub credentials from the local JSON key file
with open('keys/hf.json', 'r') as key_file:
    data = json.load(key_file)

#publish the token as an environment variable for the rest of the notebook
os.environ.update(HUGGINGFACEHUB_API_TOKEN=data['HUGGINGFACEHUB_API_TOKEN'])

Load PDF into Pages

a) Read a single PDF file

In [3]:
#relative path of the single pdf used in this example
temp_file_path = "pdfs/asthma.pdf"

#wrap the file in a loader, then read it into a list of page documents
#(the text of a page is available as pages[i].page_content)
loader = PyPDFLoader(temp_file_path)
pages = loader.load()

print(f"Number of Pages Read: {len(pages)}")

Number of Pages Read: 2


In [4]:
#preview the first page: metadata, a divider, then the first 150 characters of text
print(f"Metadata: {pages[0].metadata}",
      '-'*50,
      f"Page Content: {pages[0].page_content[:150]}",
      sep='\n')

Metadata: {'source': 'pdfs/asthma.pdf', 'page': 0}
--------------------------------------------------
Page Content: www.thoracic.org
CLIP AND COPYATS Patient Education  | Information SeriesAsthma is a chronic disease that affects the airways of your lungs. 
Your air


b) Read multiple PDF files

In [5]:
#folder that holds the pdf files to read
pdf_fold_loc = "pdfs"

#build a loader for that folder (nothing is read until loader.load() is called)
loader = PyPDFDirectoryLoader(pdf_fold_loc)

In [6]:
#read every page of every pdf found by the loader
docs = loader.load()
print(f"Total pages of all pdfs: {len(docs)}")

#collect the distinct source file names recorded in the page metadata
docs_read = {page.metadata['source'] for page in docs}
print(f"Number of PDF files: {len(docs_read)}")
print(f"Files: {docs_read}")

Total pages of all pdfs: 2
Number of PDF files: 1
Files: {'pdfs\\asthma.pdf'}


In [7]:
print(f"Metadata: {docs[0].metadata}")
print('-'*50)
print(f"Page Content: {docs[0].page_content[:150]}")

Metadata: {'source': 'pdfs\\asthma.pdf', 'page': 0}
--------------------------------------------------
Page Content: www.thoracic.org
CLIP AND COPYATS Patient Education  | Information SeriesAsthma is a chronic disease that affects the airways of your lungs. 
Your air


Split Pages into Text Chunks

In [8]:
#set chunk and overlap size (counted in characters, because length_function=len)
chunk_size = 850
chunk_overlap = 150
#create a text splitter that tries each separator in order: double new lines,
#then new lines, then sentence ends (a period followed by a space), then spaces,
#and finally any character
#NOTE: the sentence separator is a regex lookbehind, so it is written as a raw
#string; in a normal string "\." is an invalid escape sequence that triggers a
#DeprecationWarning/SyntaxWarning on recent Python versions (the value is unchanged)
#NOTE(review): newer langchain releases treat separators as literal text unless
#is_separator_regex=True is passed - confirm against the installed version
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                               chunk_overlap=chunk_overlap,
                                               separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
                                               length_function=len)

#split pages into chunks using our text splitter
chunks = text_splitter.split_documents(docs)
print(f"Number of Text Chunks: {len(chunks)}")

Number of Text Chunks: 13


Add or update metadata on the text chunks (the same approach also works on the `pages` or `docs` variables)

In [9]:
#store each chunk's position in the split order under the 'chunk_num' metadata key
#(a plain loop is used because the update is a side effect; there is no result to collect)
for idx, chunk in enumerate(chunks):
    chunk.metadata['chunk_num'] = idx

In [10]:
#index of the chunk to inspect
chunk_num = 10

#show that chunk's metadata (now including 'chunk_num'), a divider, and the first 150 characters
print(f"Metadata: {chunks[chunk_num].metadata}",
      '-'*50,
      f"Page Content: {chunks[chunk_num].page_content[:150]}",
      sep='\n')

Metadata: {'source': 'pdfs\\asthma.pdf', 'page': 1, 'chunk_num': 10}
--------------------------------------------------
Page Content: and note how they may affect you: 
 aStress—Talk to your healthcare provider about things that cause you stress and learn stress relieving 
techniques


Load Model and Embeddings

In [11]:
#embedding model with the library defaults
embeddings = HuggingFaceEmbeddings()

#llm served from the Hugging Face Hub (flan-t5-large) with its generation settings
llm = HuggingFaceHub(
    repo_id='google/flan-t5-large',
    model_kwargs=dict(temperature=0.5, max_length=512),
)

Create Document Store in Memory

In [12]:
#build a chroma document store from the text chunks, embedded with our embeddings
#(pass persist_directory to save the store locally instead of keeping it in memory)
document_store = Chroma.from_documents(embedding=embeddings, documents=chunks)

Search Document Store for Relevant Documents

In [13]:
#return the 2 stored chunks closest to the query, each paired with its score
document_store.similarity_search_with_score("What is asthma?", k=2)

[(Document(page_content='www.thoracic.org\nATS Patient Education  | Information SeriesAllergies \nAllergies are reactions of your immune system as it \nresponds to things in the environment (allergens) that \noften do not cause most people harm. Having allergies \ncan run in families (be inherited). You may have any or \nall of these reactions with exposure to allergens: \n askin rashes (eczema or hives) \n anose and sinus problems (rhinitis) \n aeye irritation (conjunctivitis) \n aasthma symptoms \n asevere reaction (anaphylaxis) \nSymptoms of nasal allergies include sneezing, itching, \nrunny nose, postnasal drip into the back of your \nthroat, or nasal congestion. If you have nasal allergy \nproblems that are not well controlled, this can also \nworsen asthma control. If you have allergies, you may \nbe more likely to have asthma. Common allergens \ninclude:', metadata={'source': 'pdfs\\asthma.pdf', 'page': 1, 'chunk_num': 7}),
  0.782264232635498),
 (Document(page_content='part 2, 

Search Using Maximal Marginal Relevance (MMR)

In [14]:
#use standard similarity search and show only the metadata of the top 2 matches
#(fetch_k is an option of the MMR search, not of similarity_search, so it is not passed here)
[doc.metadata for doc in document_store.similarity_search('What is asthma?', k=2)]

[{'source': 'pdfs\\asthma.pdf', 'page': 1, 'chunk_num': 7},
 {'source': 'pdfs\\asthma.pdf', 'page': 0, 'chunk_num': 1}]

In [15]:
#use maximal marginal relevance search: fetch 4 candidates, keep 2
#(notice how the results are a little different from the standard search)
mmr_hits = document_store.max_marginal_relevance_search('What is asthma?', k=2, fetch_k=4)
[hit.metadata for hit in mmr_hits]

[{'source': 'pdfs\\asthma.pdf', 'page': 1, 'chunk_num': 7},
 {'source': 'pdfs\\asthma.pdf', 'page': 0, 'chunk_num': 0}]

Create Chain for Question-Answer System with Memory

In [16]:
#expose the document store as a retriever that uses mmr search
#(k and fetch_k are handed to the search through search_kwargs)
retriever = document_store.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=5, fetch_k=10),
)

In [17]:
#conversation memory, stored under the 'chat_history' key as message objects
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

#conversational question-answer chain built from the llm, the mmr retriever and the memory
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [18]:
#send the first question through the conversational chain and show the reply
results = qa_chain(dict(question='What is asthma?'))
print(f"Question: {results['question']}\nAnswer: {results['answer']}")

Question: What is asthma?
Answer: a chronic disease that affects the airways of your lungs


In [19]:
#follow-up question on the same chain, so the same memory object is used
results = qa_chain(dict(question='What causes it?'))
print(f"Question: {results['question']}\nAnswer: {results['answer']}")

Question: What causes it?
Answer: Your airways can get irritated easily when exposed to a variety of things, called “triggers.”


Chat History

In [20]:
#the chat history returned with the last result holds both questions and both answers
results["chat_history"]

[HumanMessage(content='What is asthma?', additional_kwargs={}, example=False),
 AIMessage(content='a chronic disease that affects the airways of your lungs', additional_kwargs={}, example=False),
 HumanMessage(content='What causes it?', additional_kwargs={}, example=False),
 AIMessage(content='Your airways can get irritated easily when exposed to a variety of things, called “triggers.”', additional_kwargs={}, example=False)]

Question-Answering with Document Retrieval

In [23]:
#create the llm again, with the same hub model and settings as before
llm = HuggingFaceHub(
    repo_id='google/flan-t5-large',
    model_kwargs=dict(temperature=0.5, max_length=512),
)

In [25]:
#create a prompt template for the model to follow
#(PromptTemplate is already imported in the import cell at the top of the notebook)
#{context} is filled with the retrieved text and {question} with the user query
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say you don't know, don't try to make up an answer. Respond in complete sentences and act a form of chatbot that is answering the users questions.

{context}

Question: {question}
Answer:"""
#put prompt into template object for model toolkit
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=['context', 'question'])
#chain kwargs input
chain_type_kwargs = {"prompt": PROMPT}

In [26]:
#build a "stuff" retrieval qa chain that uses our prompt (via chain_type_kwargs)
#and returns the source documents alongside each answer
qa_system = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [27]:
#retrieval qa takes its input under the "query" key (not "question")
results = qa_system({"query": "What is asthma?"})

#print the answer, then every source document returned with it
print(f"Answer: {results['result']}")
print('Source Documents:\n')
for doc in results['source_documents']:
    #page number, page text, a divider and a blank gap, one item per line
    print(f"Page: {doc.metadata['page']}",
          doc.page_content,
          '*'*50,
          '\n',
          sep='\n')

Answer: a chronic disease that affects the airways of your lungs
Source Documents:

Page: 1
www.thoracic.org
ATS Patient Education  | Information SeriesAllergies 
Allergies are reactions of your immune system as it 
responds to things in the environment (allergens) that 
often do not cause most people harm. Having allergies 
can run in families (be inherited). You may have any or 
all of these reactions with exposure to allergens: 
 askin rashes (eczema or hives) 
 anose and sinus problems (rhinitis) 
 aeye irritation (conjunctivitis) 
 aasthma symptoms 
 asevere reaction (anaphylaxis) 
Symptoms of nasal allergies include sneezing, itching, 
runny nose, postnasal drip into the back of your 
throat, or nasal congestion. If you have nasal allergy 
problems that are not well controlled, this can also 
worsen asthma control. If you have allergies, you may 
be more likely to have asthma. Common allergens 
include:
**************************************************


Page: 0
What triggers as