In [None]:
# Install the poppler-utils, tesseract-ocr and libmagic-dev system packages via apt-get.
# NOTE(review): presumably prerequisites for unstructured's hi_res PDF parsing — confirm.
!apt-get install -y poppler-utils tesseract-ocr libmagic-dev

In [None]:
%pip install -Uq "unstructured[all-docs]" pillow lxml pillow
%pip install -Uq chromadb tiktoken
%pip install -Uq langchain langchain-community langchain-openai langchain-groq
%pip install -Uq python_dotenv

In [4]:
import os
from getpass import getpass
from dotenv import load_dotenv

# Pull any keys defined in a local .env file into the process environment.
load_dotenv()

# Prompt only for keys that are still missing. Unconditionally assigning ""
# would wipe out values that load_dotenv() (or the shell) already provided,
# and typing keys into the notebook would leak them when it is shared.
for key_name in ("OPENAI_API_KEY", "GROQ_API_KEY", "LANGCHAIN_API_KEY"):
  if not os.environ.get(key_name):
    os.environ[key_name] = getpass(f"Enter {key_name} (leave blank to skip): ")

# Enable tracing only when a LangChain API key is actually available.
os.environ["LANGCHAIN_TRACING_V2"] = "true" if os.environ["LANGCHAIN_API_KEY"] else "false"

In [None]:
from unstructured.partition.pdf import partition_pdf
from google.colab import files

# Open Colab's upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when nothing was uploaded.
if not uploaded:
  raise ValueError("No file was uploaded; upload a PDF before running this cell.")

# Only the first uploaded file is processed.
file_path = next(iter(uploaded))

chunks=partition_pdf(
    filename=file_path,
    infer_table_structure=True,           # extract Tables
    strategy="hi_res",                    # mandatory to infer tables
    extract_image_block_types=["Image"],  # Add 'Table' to list to extract image of tables
    extract_image_block_to_payload=True,  # If 'True', will extract base64 for API usage
    chunking_strategy="by_title",         # or 'basic'
    max_characters=10000,                 # defaults to 500
    combine_text_under_n_chars=2000,      # defaults to 0
    new_after_n_chars=6000,)

In [None]:
# Distinct element types returned by partition_pdf. Printed explicitly because
# a notebook cell only renders its last bare expression.
print("Element types-->", set([str(type(el)) for el in chunks]))

# Length of chunk
print("Length of chunk-->", len(chunks))

# Each CompositeElement contains a bunch of related elements.
elements=(chunks[0].metadata.orig_elements or []) if chunks else []
print("Elements in first chunk-->", len(elements))

# Image elements nested in the first chunk (type name matched as a substring).
chunk_images=[el for el in elements if 'Image' in str(type(el))]

# Show the first image element as a dict, or a notice when the first chunk has none
# (indexing an empty list would raise IndexError).
chunk_images[0].to_dict() if chunk_images else "No image elements in the first chunk"

In [8]:
# Split the partitioned chunks into tables and text chunks. The class name is
# matched as a substring of the type's string form.
tables=[]
texts=[]

for chunk in chunks:
  if "Table" in str(type(chunk)):
    # Fixed typo: `apppend` raised AttributeError on the first table chunk.
    tables.append(chunk)

  if "CompositeElement" in str(type(chunk)):
    texts.append(chunk)

In [9]:
def get_images_base64(chunks):
  """Collect the base64 payloads of every image nested inside composite chunks.

  Args:
    chunks: Iterable of partitioned elements. Only elements whose type name
      contains "CompositeElement" are inspected; their
      ``metadata.orig_elements`` are scanned for elements whose type name
      contains "Image".

  Returns:
    List of ``metadata.image_base64`` values in document order. Images with a
    missing or empty payload are skipped so callers never receive ``None``.
  """
  images_b64=[]
  for chunk in chunks:
    if "CompositeElement" not in str(type(chunk)):
      continue
    # Tolerate a missing/None orig_elements by treating it as "no children".
    for el in getattr(chunk.metadata, "orig_elements", None) or []:
      if "Image" not in str(type(el)):
        continue
      payload=getattr(el.metadata, "image_base64", None)
      if payload:
        images_b64.append(payload)
  return images_b64

# Gather the base64 payloads of all images found inside the partitioned chunks.
images=get_images_base64(chunks)

In [None]:
import base64
from IPython.display import Image, display

def display_base64_image(base64_code):
  """Decode a base64-encoded image and render it in the notebook output."""
  raw_bytes=base64.b64decode(base64_code)
  rendered=Image(data=raw_bytes)
  display(rendered)

# Preview the first extracted image. Guarded because images[0] raises
# IndexError when the document contained no images.
if images:
  display_base64_image(images[0])
else:
  print("No images were extracted from the document.")

In [12]:
# Install/upgrade langchain-groq, the package that provides the ChatGroq client imported below.
%pip install -Uq langchain-groq

In [13]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [14]:
# Prompt: a single-variable template; {element} receives the table or text chunk.
prompt_text = """
You are an expert summarizer. Your task is to summarize tables and texts and provide a concise summary.
**You must only provide the summary and no additional comments such as "Here is your summary..", etc.**
The Table or Text chunk to be summarized is: {element}
"""

# Build the chat prompt from the template string.
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: the chain input is passed through unchanged as `element`,
# formatted into the prompt, sent to the Groq-hosted model, and the reply is
# run through StrOutputParser.
model = ChatGroq(temperature=0.5, model="llama-3.1-8b-instant")
summarize_chain = {"element": lambda x:x} | prompt | model | StrOutputParser()

In [15]:
# Summarize text chunks, with the batch limited to 3 concurrent calls.
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})

# Summarize tables from their HTML form (metadata.text_as_html), using the
# same concurrency limit.
tables_html= [table.metadata.text_as_html for table in tables]
table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 3})

In [None]:
# A cell renders only its last bare expression, so the first list was never
# shown; display both explicitly.
display(text_summaries)       #To check the text summaries
display(table_summaries)      #To check the table summaries

In [20]:
# Install/upgrade langchain_openai, the package that provides the ChatOpenAI client imported below.
%pip install -Uq langchain_openai

In [31]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Instruction sent alongside every image.
prompt_template=""" You must describe the image in great detail. For context (use this only if applicable), the image is part of a research paper explaining the implementation of AI and ML techniques for Hydroponics farming. Be specific about graphs and bar plots, if applicable."""

# One user message with two parts: the instruction text and the image itself,
# passed inline as a base64 data URL. `{image}` is the template's only variable.
messages=[
    (
        "user",
     [
         {"type":"text", "text": prompt_template},
          {
              "type":"image_url",
              "image_url":{"url": "data:image/jpeg;base64,{image}"},
          },
      ],
      )
]

# NOTE: this rebinds `prompt`, which previously held the summarization prompt.
prompt = ChatPromptTemplate.from_messages(messages)

chain = prompt | ChatOpenAI(model="gpt-4o-mini") | StrOutputParser()

# Each base64 string in `images` fills the `{image}` slot. Concurrency is capped
# the same way as the text/table batches so a document with many images does
# not fire every request at once.
image_summaries = chain.batch(images, {"max_concurrency": 3})

In [None]:
# Display the full list of generated image summaries.
image_summaries   #To check the image summaries

In [None]:
# Spot-check one image summary. Index 4 only exists when at least five images
# were extracted, so fall back to the last available summary instead of
# raising IndexError.
if image_summaries:
  print(image_summaries[min(4, len(image_summaries) - 1)])
else:
  print("No image summaries available.")

In [42]:
import uuid
# Chroma is imported from langchain-community (installed above); the
# `langchain.vectorstores` path is a deprecated re-export.
from langchain_community.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

# The vectorstore is used to index child chunks
vectorstore = Chroma(collection_name="multimodal_rag", embedding_function=OpenAIEmbeddings())

# The storage layer for the parent documents
store=InMemoryStore()
# Metadata key that links each indexed summary to its parent document in the store.
id_key="doc_id"

# The retriever (empty to start)
retriever= MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [None]:
# Compare the number of source items with the number of summaries for each modality.
print(f"texts: {len(texts)}, text_summaries: {len(text_summaries)}")
print(f"tables: {len(tables)}, table_summaries: {len(table_summaries)}")
print(f"images: {len(images)}, image_summaries: {len(image_summaries)}")

In [45]:
def add_summaries_to_retriever(summaries, originals):
    """Index summaries in the vector store and keep their originals in the docstore.

    Each original gets a fresh UUID. Its summary is stored as a Document whose
    metadata carries that UUID under `id_key`, and the original itself is
    written to the docstore under the same UUID.

    Args:
        summaries: Summary strings, positionally aligned with `originals`.
        originals: The source items (text chunks, tables, or base64 images).

    Returns:
        Tuple `(ids, summary_docs)`; both are empty lists when `originals` is empty.

    Raises:
        ValueError: If the two sequences differ in length, since pairing them
            by position would then link summaries to the wrong originals.
    """
    if not originals:
        return [], []
    if len(summaries) != len(originals):
        raise ValueError(
            f"Got {len(summaries)} summaries for {len(originals)} originals; "
            "re-run the summarization cell before indexing."
        )
    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: parent_id})
        for parent_id, summary in zip(ids, summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# NOTE: every run generates new UUIDs, so re-running this cell indexes the
# same content again.

# Add texts
doc_ids, summary_texts = add_summaries_to_retriever(text_summaries, texts)

# Add tables (only if there are tables)
table_ids, summary_tables = add_summaries_to_retriever(table_summaries, tables)

# Add image summaries
img_ids, summary_img = add_summaries_to_retriever(image_summaries, images)


In [46]:
# Retrieve: run a sample query through the multi-vector retriever.
docs= retriever.invoke("Who are the authors of this paper?")

In [None]:
# Print each retrieved document, then a blank line and an 80-dash rule.
for doc in docs:
  print(str(doc), "", "-"*80, sep="\n")

In [49]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from base64 import b64decode

def parse_docs(docs):
  """Split retrieved documents into base64-encoded images and everything else.

  Args:
    docs: Iterable of retrieved items; each is either a base64 string (an
      image) or some other object such as a parsed text/table element.

  Returns:
    Dict with two lists that preserve input order: "images" holds the items
    that are strictly valid base64, "texts" holds all remaining items.
  """
  b64=[]
  text=[]
  for doc in docs:
    try:
      # validate=True rejects characters outside the base64 alphabet. The
      # default mode silently discards them, so ordinary prose could "decode"
      # successfully and be misfiled as an image.
      b64decode(doc, validate=True)
    except (ValueError, TypeError):
      # ValueError (includes binascii.Error): not valid base64.
      # TypeError: not a str/bytes-like object, e.g. a document element.
      text.append(doc)
    else:
      b64.append(doc)
  return {"images":b64, "texts":text}

def build_prompt(kwargs):
  """Assemble the multimodal chat prompt for the answering model.

  Args:
    kwargs: Mapping with "context" (a dict holding "texts" and "images"
      lists, as returned by parse_docs) and "question" (the user's question).

  Returns:
    A ChatPromptTemplate wrapping a single HumanMessage whose content is the
    text prompt followed by one image_url part per retrieved image.
  """
  docs_by_type=kwargs["context"]
  user_question=kwargs["question"]

  # Separate chunks with a blank line so the end of one chunk does not run
  # into the start of the next. Items without a `.text` attribute (for
  # example plain strings) are used via str() instead of raising.
  context_text="\n\n".join(
      text_element.text if hasattr(text_element, "text") else str(text_element)
      for text_element in docs_by_type["texts"]
  )

  #construct prompt with context (including images)
  prompt_template=f"""
  Answer the question based on the following context, which can include text, tables, and the below image.
  Context: {context_text}
  Question: {user_question}
  """

  prompt_content=[{"type":"text", "text":prompt_template}]

  # Attach every retrieved image inline as a base64 data URL.
  for image in docs_by_type["images"]:
    prompt_content.append(
        {
            "type":"image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image}"},
        }
    )

  return ChatPromptTemplate.from_messages(
      [
          HumanMessage(content=prompt_content),
      ]
  )

# Question-answering chain. The input question goes to the retriever, whose
# results parse_docs splits into images and texts ("context"), while the
# question itself is passed through unchanged. build_prompt then assembles the
# multimodal prompt for gpt-4o-mini and the reply goes through StrOutputParser.
chain = (
    {
        "context": retriever | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt)
    | ChatOpenAI(model="gpt-4o-mini")
    | StrOutputParser()
)

# Same retrieval step as `chain`, but the model's answer is attached under
# "response" via assign(), so the result also exposes the retrieved "context"
# for showing sources.
chain_with_sources = {
    "context": retriever | RunnableLambda(parse_docs),
    "question": RunnablePassthrough(),
} | RunnablePassthrough().assign(
    response=(
        RunnableLambda(build_prompt)
        | ChatOpenAI(model="gpt-4o-mini")
        | StrOutputParser()
    )
)


In [None]:
# Ask a question through the answer-only chain and print the model's reply.
response = chain.invoke(
    "What is the hydroponics farming?"
)

print(response)

In [None]:
# Query the source-aware chain; the result holds the answer plus the
# retrieved context that was handed to the model.
response = chain_with_sources.invoke("How is KNN used for hydroponics?")

print("Response:", response['response'])

print("\n\nContext:")
# Text sources: chunk text, its page number, then a 50-dash divider.
for text in response['context']['texts']:
    print(text.text)
    print(f"Page number:  {text.metadata.page_number}")
    print(f"\n{'-'*50}\n")
# Image sources are rendered inline.
for image in response['context']['images']:
    display_base64_image(image)