In [None]:
import os
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
from unstructured.partition.pdf import partition_pdf


In [None]:
# Ask for the PDF location. Surrounding whitespace and the quotes that shells
# or file managers add to pasted paths are removed, and the path is checked
# up front so a typo fails here rather than inside the PDF partitioner.
filepath = input("Enter the path to the file:\n").strip().strip("\"'")
if not os.path.isfile(filepath):
    raise FileNotFoundError(f"No file found at: {filepath!r}")

In [None]:
# Partition the PDF at `filepath` into chunked elements.
# Chunks are grouped "by_title"; the three size settings below are the
# character thresholds handed to the chunker. Table structure inference is
# enabled and image extraction is disabled.
raw_pdf_elements = partition_pdf(
    filename=filepath,
    strategy="hi_res",
    chunking_strategy="by_title",
    max_characters=3000,
    new_after_n_chars=2500,
    combine_text_under_n_chars=1500,
    infer_table_structure=True,
    extract_images_in_pdf=False,
)

In [None]:
# Split the partitioned elements into tables and prose chunks in one pass.
#
# The element objects themselves are collected (not their string form) so the
# `.text` attribute read by the later summarisation cells is available.
#
# Matching is done on the fully-qualified class name, as before. Prose is
# accepted both as NarrativeText and as CompositeElement, because chunked
# output ("by_title") wraps text in CompositeElement objects.
_TABLE_MARKER = "unstructured.documents.elements.Table"
_TEXT_MARKERS = (
    "unstructured.documents.elements.NarrativeText",
    "unstructured.documents.elements.CompositeElement",
)

table = []
text = []
for element in raw_pdf_elements:
    type_name = str(type(element))
    if _TABLE_MARKER in type_name:
        table.append(element)
    elif any(marker in type_name for marker in _TEXT_MARKERS):
        text.append(element)


In [1]:
# Instruction used to condense one extracted element into a retrieval-friendly
# summary. `{element}` is the single template variable. The sentences are
# joined without newlines, separated by four spaces, and the text ends with a
# trailing space.
prompt_text = (
    "You are an AI assistant tasked with generating summaries of tables for retrieval."
    "    These summaries will be embedded and will be used to retrieve the table elements at a later time."
    "    Give a concise summary of the table that is well optimized for retrieval."
    "    Table:{element} "
)

prompt = ChatPromptTemplate.from_template(prompt_text)

In [None]:
# Load the generation model and expose it to LangChain.
# - "text-generation" is the task identifier the transformers pipeline factory
#   expects.
# - HuggingFacePipeline wraps the raw pipeline as a LangChain LLM, so it can
#   be piped between a prompt template and StrOutputParser in the chains below.
llm = HuggingFacePipeline(
    pipeline=pipeline(
        task="text-generation",
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    )
)

In [None]:
# Summarisation chain: the raw input is forwarded unchanged as `element`,
# rendered through `prompt`, sent to `llm`, and the reply parsed to a string.
summarize_chain = (
    {"element": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Plain-text form of every table. `str(...)` is used instead of `.text` so this
# works whether `table` holds element objects or already-stringified entries.
table_content = [str(element) for element in table]
# One summary per table, with at most 5 summarisation calls in flight at once.
table_summaries = summarize_chain.batch(table_content, {"max_concurrency": 5})

In [None]:
# Plain-text form of every prose chunk. `str(...)` is used instead of `.text`
# so this works whether `text` holds element objects or already-stringified
# entries.
text_content = [str(element) for element in text]
# One summary per chunk, with at most 5 summarisation calls in flight at once.
text_summaries = summarize_chain.batch(text_content, {"max_concurrency": 5})

In [None]:

# Summaries are embedded into the vector store; the original content lives in
# the in-memory store. The two are linked through the `doc_id` metadata key.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
vector_store = Chroma(collection_name="Summaries", embedding_function=embeddings)

store = InMemoryStore()
id_key = "doc_id"

# The retriever's fields are named `vectorstore` / `docstore` (the same
# attributes that are accessed on it below).
retriever = MultiVectorRetriever(
    vectorstore=vector_store,
    docstore=store,
    id_key=id_key,
)


def _index_summaries(summaries, originals):
    """Register summaries for search and their originals for lookup.

    Args:
        summaries: Summary strings, one per entry of ``originals`` and in the
            same order.
        originals: The full contents the summaries were generated from.

    Returns:
        A ``(ids, summary_docs)`` tuple: the freshly generated UUID strings
        (one per original) and the ``Document`` objects that were embedded.
        Both are empty lists when there is nothing to index.
    """
    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(ids, summaries)
    ]
    # Skip the store calls for an empty batch (e.g. a PDF without tables);
    # presumably some vector-store backends reject empty inserts.
    if summary_docs:
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


text_ids, summary_text = _index_summaries(text_summaries, text_content)
table_ids, summary_table = _index_summaries(table_summaries, table_content)

In [None]:
# Question-answering prompt. Only `context` and `question` are filled in by the
# chain below; no chat history is passed in, despite what the instruction text
# tells the model.
template = """Answer the question based on the following context. Also remember the history of previous answers, so if you are asked about anything regarding the previous chat, you can use that information to answer the question as well.
Context: {context}
Question: {question}"""

# NOTE: this rebinds `prompt`, which previously held the summarisation prompt.
prompt = ChatPromptTemplate.from_template(template=template)

# The incoming question is sent to the retriever to produce `context` and is
# also forwarded unchanged as `question`.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Interactive question loop. Typing "exit" (any case) ends it.
while True:
    query = input("\n\nAsk your question (Type 'exit' to exit...): \n").strip()
    if query.lower() == "exit":
        print("\n-------------------------\nThank you for using the service!!")
        # Leave the loop instead of calling exit(): in a notebook, exit() shuts
        # the kernel down and discards the models and index built above.
        break
    if not query:
        # Nothing was typed; ask again rather than sending a blank question.
        continue

    response = chain.invoke(query)
    print(f"\n\nResponse: {response}")