Download PDF

In [None]:
import os
import shutil
import urllib
import urllib.request

import requests

#download pdf from url to local drive
def download_file(download_url, file_name="LordofTheRings.pdf"):
    """Download the resource at ``download_url`` and save it to ``file_name``.

    Args:
        download_url: URL to fetch; passed to ``urllib.request.urlopen``.
        file_name: Path of the local file to write. Defaults to
            ``"LordofTheRings.pdf"`` (the name the rest of the notebook reads).

    Returns:
        None. Prints ``Completed`` once the file has been written.
    """
    # Context managers close both the response and the file even if the
    # transfer fails; copyfileobj streams in chunks so the whole PDF is never
    # held in memory at once.
    with urllib.request.urlopen(download_url) as response, open(file_name, 'wb') as file:
        shutil.copyfileobj(response, file)
    print("Completed")

# location of the source PDF; kept in a named constant so it is easy to spot and change
LOTR_PDF_URL = "https://gosafir.com/mag/wp-content/uploads/2019/12/Tolkien-J.-The-lord-of-the-rings-HarperCollins-ebooks-2010.pdf"
download_file(LOTR_PDF_URL)

Load text from pdf

In [None]:
import PyPDF2
from cleantext import clean
import re

def read_pdf(pdf_location):
    """Extract and clean the text of every non-empty page of a PDF.

    Args:
        pdf_location: Location of the PDF, passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        A list of ``(page_index, text)`` tuples, where ``page_index`` is the
        0-based position of the page in the PDF. Pages without extractable
        text are skipped, so indices may have gaps.
    """
    # creating a pdf reader object
    reader = PyPDF2.PdfReader(pdf_location)

    # print the number of pages in pdf file
    print(f"Total number of pages: {len(reader.pages)}")

    pages = []
    for i, pdf_page in enumerate(reader.pages):
        page = pdf_page.extract_text()
        # skip pages that yield no text (empty string or None)
        if not page:
            continue
        #replace two or more new lines with just one
        page = re.sub(r"\n{2,}", "\n", page)
        #replace two or more spaces with just one
        # (the previous pattern "\{2,}" matched the literal text "{2,}" and
        # inserted a newline, so runs of spaces were never collapsed)
        page = re.sub(r" {2,}", " ", page)
        #clean ascii text and remove any special characters
        page = clean(page, fix_unicode=True, to_ascii=True, lower=False, no_line_breaks=False,
                     no_urls=False, no_emails=False, no_phone_numbers=False, no_numbers=False,
                     no_digits=False, no_currency_symbols=False, no_punct=False, lang="en")
        #append page and page number to a list of tuples
        pages.append((i, page))
    return pages

In [None]:
# extract the text of the downloaded PDF; `pages` is a list of (page index, cleaned text) tuples
pages = read_pdf('LordofTheRings.pdf')

Save each page's text to its own file

In [None]:
# create the output folder; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run
os.makedirs("faiss_documents", exist_ok=True)

# write one file per extracted page, named after the page index
for page, text in pages:
    try:
        with open(f"./faiss_documents/Page{page}", 'w', encoding="utf-8") as wt_f:
            wt_f.write(text)
    except (OSError, UnicodeError) as err:
        # best effort: report the failing page together with the cause and
        # continue with the remaining pages (a bare except would also swallow
        # KeyboardInterrupt and hide unrelated bugs)
        print(f"Cannot write page: {page} ({err})")

Download models from Hugging Face

In [None]:
# !git clone https://huggingface.co/sentence-transformers/all-mpnet-base-v2
# !git clone https://huggingface.co/VMware/tinyroberta-mrqa

Create Faiss Document Store

In [1]:
from haystack.document_stores import FAISSDocumentStore
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, EmbeddingRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader
import torch

# check once whether a CUDA device is available; query_faiss_index passes this flag to FARMReader
gpu_use = torch.cuda.is_available()
print(f"GPU Available: {gpu_use}")

GPU Available: False


In [2]:
def add_to_faiss_index(index_location, file_location):
    """Index the text files found in ``file_location`` into a FAISS document store.

    If nothing exists at ``index_location`` a new ``FAISSDocumentStore`` is
    created and every document is embedded; otherwise the existing store is
    loaded and only documents without embeddings are embedded. In both cases
    the store is saved back to ``index_location``.

    Args:
        index_location: Path the FAISS index is loaded from and saved to.
        file_location: Directory whose files are converted, split and indexed.

    Returns:
        None.
    """
    #load components of pipeline, text_converter, preprocessor, and document_store
    indexing_pipeline = Pipeline()
    text_converter = TextConverter()
    preprocessor = PreProcessor(clean_whitespace=True,
                                clean_header_footer=True,
                                clean_empty_lines=True,
                                split_by="word",
                                split_length=200,
                                split_overlap=20,
                                split_respect_sentence_boundary=True)

    #if document store exists load and append, if it doesn't exist create a new one
    exists = os.path.exists(index_location)
    if exists:
        document_store = FAISSDocumentStore.load(index_path=index_location)
    else:
        document_store = FAISSDocumentStore()

    #add nodes to pipeline
    indexing_pipeline.add_node(component=text_converter, name='TextConverter', inputs=['File'])
    indexing_pipeline.add_node(component=preprocessor, name='PreProcessor', inputs=['TextConverter'])
    indexing_pipeline.add_node(component=document_store, name='DocumentStore', inputs=['PreProcessor'])

    # only regular files are indexed: os.listdir also returns sub-directories
    # (e.g. Jupyter's .ipynb_checkpoints), which the text converter cannot read;
    # sorting makes the indexing order deterministic
    candidate_paths = (os.path.join(file_location, fil) for fil in sorted(os.listdir(file_location)))
    upload_files = [path for path in candidate_paths if os.path.isfile(path)]
    #run pages through pipeline
    indexing_pipeline.run_batch(file_paths=upload_files)

    #add a retriever to transform sentence to vectors
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model='all-mpnet-base-v2',
                                   model_format='sentence_transformers')

    #new index: embed everything; existing index: embed only the documents that have no embedding yet
    document_store.update_embeddings(retriever=retriever, update_existing_embeddings=not exists)
    document_store.save(index_location)

In [None]:
# build the index (or extend it if it already exists) from the page files in ./faiss_documents
add_to_faiss_index('LoTR_Faiss_Index', './faiss_documents')

Convert Hugging Face model to FARM format

In [None]:
from haystack.modeling.model.adaptive_model import AdaptiveModel
from haystack.modeling.data_handler.processor import Processor

# source (transformers format) and target (FARM format) model folders
HF_READER_DIR = "tinyroberta-mrqa"
FARM_READER_DIR = "tinyroberta_local_farm"
QA_TASK = "question_answering"

# one-time conversion: skipped as soon as the FARM folder is present
farm_reader_missing = not os.path.exists(FARM_READER_DIR)
if farm_reader_missing:
    # load both pieces from the transformers checkpoint first ...
    model = AdaptiveModel.convert_from_transformers(
        HF_READER_DIR,
        device="cpu",
        task_type=QA_TASK,
    )
    processor = Processor.convert_from_transformers(
        HF_READER_DIR,
        task_type=QA_TASK,
        max_seq_len=384,
        doc_stride=128,
    )
    # ... then write them side by side into the FARM folder
    model.save(FARM_READER_DIR)
    processor.save(FARM_READER_DIR)

Answer Questions from Index

In [3]:
def query_faiss_index(index_location, user_query):
    """Answer ``user_query`` against the FAISS index stored at ``index_location``.

    The document store, reader and retriever are loaded on every call, wired
    into an ``ExtractiveQAPipeline`` and run once.

    Args:
        index_location: Path of the saved FAISS index to load.
        user_query: Question text to send through the pipeline.

    Returns:
        A list of ``(answer, context, score)`` tuples, one per answer in the
        pipeline's ``'answers'`` output, in the order the pipeline returned them.
    """
    store = FAISSDocumentStore.load(index_path=index_location)

    # extractive reader backed by the locally converted tinyroberta model
    qa_reader = FARMReader(
        model_name_or_path='tinyroberta_local_farm',
        use_gpu=gpu_use,
        context_window_size=300,
    )

    # must be the same embedding model that was used to build the index
    dense_retriever = EmbeddingRetriever(
        document_store=store,
        embedding_model='all-mpnet-base-v2',
        model_format='sentence_transformers',
    )

    qa_pipeline = ExtractiveQAPipeline(qa_reader, dense_retriever)

    # retrieve 10 candidate passages, keep the reader's 5 best answers
    run_params = {
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    }
    prediction = qa_pipeline.run(query=user_query, params=run_params)

    return [(hit.answer, hit.context, hit.score) for hit in prediction['answers']]

In [4]:
def print_answer(results):
    """Print the top-ranked answer, its score and its context.

    Args:
        results: Sequence of ``(answer, context, score)`` tuples; only the
            first entry is printed. When the sequence is empty a
            "No answer found." line is printed instead of raising IndexError.

    Returns:
        None.
    """
    if results:
        top = results[0]
        print(f"Answer: {top[0]}")
        print(f"Score: {top[2]}")
        print(f"Context: {top[1]}")
    else:
        print("No answer found.")
    # separator line followed by a blank line between consecutive answers
    print("-" * 56 + "\n")

In [5]:
#send user query to question-answer pipeline and print only the top-ranked answer
results = query_faiss_index('LoTR_Faiss_Index', """What was Samwise Gamgee's favorite vegetable?""")
print_answer(results)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Answer: potatoes,
Score: 0.8338145017623901
Context: 
polite to him, calling him 'Master Hamfast', and consulting him
constantly upon the growing of vegetables - in the matter of 'roots',
especially potatoes, the Gaffer was recognized as the leading authority
by all in the neighbourhood (including himself ).
'But what about this Frodo that lives with 
--------------------------------------------------------



In [8]:
#send user query to question-answer pipeline and print only the top-ranked answer
results = query_faiss_index('LoTR_Faiss_Index', """How many rings were there?""")
print_answer(results)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]

Answer: Seven
Score: 0.30906951427459717
Context: llowers.
Years afterwards Thro' r, now old, poor, and desperate, gave to his son Thra'in
the one great treasure he still possessed, the last of the Seven Rings, and
then he went away with one old companion only, called Na'r. Of the Ring
he said to Thra'in at their parting:
'This may prove the founda
--------------------------------------------------------

