In [1]:
from pprint import pprint
from os import listdir
from os.path import isfile, join
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.routers import FileTypeRouter

from haystack.components.routers import ConditionalRouter
from haystack.components.websearch.serper_dev import SerperDevWebSearch

# from haystack.components.embedders import SentenceTransformersTextEmbedder
# from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
# from haystack.components.builders.answer_builder import AnswerBuilder


# load env vars
from dotenv import load_dotenv
assert load_dotenv() == True 

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [34]:
class Retrieval_System():
    def __init__(self, doc_dir):
        self.doc_dir = doc_dir
        self.__setup()
        
    def __setup(self):
        self.__retrieve_doc_paths()
        self.__set_prompts()
        self.__set_document_store()
        self.__set_router()
        self.__set_preprocess_pipeline()
        self.__set_rag_pipeline()
        
    def __retrieve_doc_paths(self):
        self.doc_paths = [self.doc_dir+f for f in listdir(self.doc_dir) if isfile(join(self.doc_dir, f))]
        
    def __set_prompts(self):
        self.prompt = """
        Answer the following query given the documents.
        Your answer should explicitly state that your answer was generated from database documents.
        If the answer is not contained within the documents reply exclusively with 'no_answer'
        
        Query: {{query}}
        Documents:
        {% for document in documents %}
        {{document.content}}
        {% endfor %}
        """
        # self.prompt = """
        # Answer the following query given the documents.
        # Your answer should explicitly state that your answer was generated from database documents.
        # If the answer is not contained within the documents reply exclusively with 'no_answer'
        
        # Query: {{query}}
        # Documents:
        # {% for document in documents %}
        # {{document.content}}
        # {% endfor %}
        
        # \nQuestion: {{query}}
        # \nAnswer:
        # """
            
        self.prompt_builder = PromptBuilder(template=self.prompt)
        self.llm = OpenAIGenerator(model="gpt-3.5-turbo")
        
        self.prompt_for_websearch = """
        Answer the following query given the documents retrieved from the web.
        Explicitly state in your answer that your answer was was generated using online sources.

        Query: {{query}}
        Documents:
        {% for document in documents %}
        {{document.content}}
        {% endfor %}
    
        """
        
        
        # self.prompt_for_websearch = """
        # Answer the following query given the documents retrieved from the web.
        # Explicitly state in your answer that your answer was was generated using online sources.

        # Query: {{query}}
        # Documents:
        # {% for document in documents %}
        # {{document.content}}
        # {% endfor %}
        
        # \nQuestion: {{query}}
        # \nAnswer:
        # """

        self.prompt_builder_for_websearch = PromptBuilder(template=self.prompt_for_websearch)
        
        self.websearch = SerperDevWebSearch()
        self.llm_for_websearch = OpenAIGenerator(model="gpt-3.5-turbo")
        
    def __set_router(self):
        routes = [
            {
                "condition": "{{'no_answer' in replies[0]}}",
                "output": "{{query}}",
                "output_name": "go_to_websearch",
                "output_type": str,
            },
            {
                "condition": "{{'no_answer' not in replies[0]}}",
                "output": "{{replies[0]}}",
                "output_name": "answer",
                "output_type": str,
            },
        ]
        self.router = ConditionalRouter(routes)

            
    def __set_document_store(self):
        self.document_store = InMemoryDocumentStore()
        self.file_type_router = FileTypeRouter(mime_types=["application/pdf"])

        self.pdf_converter = PyPDFToDocument()
        self.document_cleaner = DocumentCleaner()
        self.document_splitter = DocumentSplitter(
            split_by = "passage",
            split_length = 500,
            split_overlap = 50
        ) 
        self.document_writer = DocumentWriter(self.document_store)
    
    def __set_preprocess_pipeline(self):
        self.preprocessing_pipeline = Pipeline()
        self.preprocessing_pipeline.add_component(instance=self.file_type_router, name="file_type_router")
        self.preprocessing_pipeline.add_component(instance=self.pdf_converter, name="pypdf_converter")
        self.preprocessing_pipeline.add_component(instance=self.document_cleaner, name="document_cleaner")
        self.preprocessing_pipeline.add_component(instance=self.document_splitter, name="document_splitter")
        self.preprocessing_pipeline.add_component(instance=self.document_writer, name="document_writer")

        self.preprocessing_pipeline.connect("file_type_router.application/pdf", "pypdf_converter.sources")
        self.preprocessing_pipeline.connect("pypdf_converter", "document_cleaner")
        self.preprocessing_pipeline.connect("document_cleaner", "document_splitter")
        self.preprocessing_pipeline.connect("document_splitter", "document_writer")
        
        self.preprocessing_pipeline.run({"file_type_router": {"sources":self.doc_paths}})
        
    def __set_rag_pipeline(self):
        self.pipeline = Pipeline()
        
        self.pipeline.add_component(instance=InMemoryBM25Retriever(document_store=self.document_store), name="retriever")
        
        self.pipeline.add_component(instance=self.llm, name="llm")
        self.pipeline.add_component(instance=self.router, name="router")
        self.pipeline.add_component(instance=self.websearch, name="websearch")
        self.pipeline.add_component(instance=self.prompt_builder, name="prompt_builder")
        self.pipeline.add_component(instance=self.prompt_builder_for_websearch, name="prompt_builder_for_websearch")
        self.pipeline.add_component(instance=self.llm_for_websearch, name="llm_for_websearch")

        # datastore and retriever
        # llm and prompt for RAG
        self.pipeline.connect("prompt_builder", "llm")
        self.pipeline.connect("llm.replies", "router.replies")
        # llm and prompt for websearch
        self.pipeline.connect("router.go_to_websearch", "websearch.query")
        self.pipeline.connect("router.go_to_websearch", "prompt_builder_for_websearch.query")
        self.pipeline.connect("websearch.documents", "prompt_builder_for_websearch.documents")
        self.pipeline.connect("prompt_builder_for_websearch", "llm_for_websearch")
        
        # aggregate all responses for final output 
        # self.pipeline.connect("llm.replies", "answer_builder.replies")
        # self.pipeline.connect("llm_for_websearch", "answer_builder.replies")
        
        # self.pipeline.connect("llm.replies", "joiner")
        # self.pipeline.connect("llm_for_websearch.replies", "joiner")
        # self.pipeline.connect("joiner", "answer_builder.replies")
        # self.pipeline.connect("llm.meta", "answer_builder.meta")
        # self.pipeline.connect("retriever", "answer_builder.documents")

        
        
    def run(self, query):
        results = self.pipeline.run(
        {
        "retriever": {"query": query},
        "prompt_builder": {"query": query},
        #"prompt_builder_for_websearch": {"query": query}, # already sent by router
        # "answer_builder": {"query": query}, 
        "router": {"query": query}
        }
        )
        
        if 'router' in results.keys():
            #return results
            return results['router']['answer']
        #return results
        return results['llm_for_websearch']['replies']
        

In [35]:
pipe = Retrieval_System(doc_dir = "../PKD_PDFs/")

In [36]:
# response should be taken from RAG
query = "What is autosomal dominate PKD?"
response = pipe.run(query)
pprint(response)

['Based on the information retrieved from online sources, autosomal dominant '
 'PKD (polycystic kidney disease) is a genetic disorder where individuals can '
 'inherit the disease-causing genetic variation from only one parent. This '
 'means that if one parent has the disease, each child will have a 50 percent '
 'chance of inheriting it. ADPKD is characterized by the development of '
 'fluid-filled cysts within the kidneys and is considered one of the most '
 'common and life-threatening genetic diseases. It is a progressive disorder '
 'that can lead to various complications due to the cyst formation in multiple '
 'organs.']


In [37]:
# response should be taken from RAG
query = "What are the symptoms of autosomal dominant PKD?"
response = pipe.run(query)
pprint(response)

('From database documents, the symptoms of autosomal dominant PKD are:\n'
 '- High blood pressure\n'
 '- Back or side pain\n'
 '- Headache\n'
 '- Frequent urination\n'
 '- Blood in urine\n'
 '- Kidney stones\n'
 '- Pain or tenderness in the abdomen')


In [38]:
# response should be taken from RAG
query = "Is exercise recommended for people with PKD?"
response = pipe.run(query)
pprint(response)

('Answer: Yes, exercise is recommended for people with PKD according to '
 'document 3.')


In [39]:
# response should be web based
query = "What is the most famous building in the United States?"
response = pipe.run(query)
pprint(response)

['Based on the documents retrieved from online sources, the most famous '
 'building in the United States is the Empire State Building in New York City, '
 'NY. It is considered one of the most famous buildings in the world and is '
 'the flagship of the New York skyline. Other notable mentions include the '
 "White House in Washington, D.C., which is also considered one of America's "
 'favorites, and the Gateway Arch in St. Louis, Montana.']


In [40]:
# response should be web based
query = "What will the weather in Glendale, California be on July 4th, 2024?"
response = pipe.run(query)
pprint(response)

['Based on the information provided in the documents retrieved from the web, '
 'the weather in Glendale, California on July 4th, 2024 is expected to be '
 'mostly clear with a high of around 84-86°F. The average daily high '
 'temperature in Glendale during July is typically around 84-86°F, and it is '
 'mentioned that there is a low chance of overcast or mostly cloudy weather. \n'
 '\n'
 'This answer was generated using online sources.']
