[PDF / Images / Tables] → Embeddings → Vector DB → RAG model → Answer


## LLM

In [1]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment, then read the Groq key.
load_dotenv()
groq_api_key = os.environ.get('GROQ_API_KEY')

# Chat model used by the summarization helpers further down; replies are capped via max_tokens.
llm = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_api_key, max_tokens=500)

  from .autonotebook import tqdm as notebook_tqdm


## Document Loader

In [5]:
# Alternative PDF loaders (PyMuPDF via `fitz`, LangChain's UnstructuredPDFLoader), left disabled.
# The cell that actually runs uses unstructured's `partition_pdf` further down.

# import fitz
# doc = fitz.open("example.pdf")
# for page in doc:
#     print(page.get_text())

# from langchain_community.document_loaders import UnstructuredPDFLoader
# loader = UnstructuredPDFLoader("Multimodel_rag/startupai-financial-report-v2.pdf")
# docs = loader.load()
# docs

Install the OCR prerequisites before running the extraction cell below (run these in a terminal):

    pip install pytesseract
    sudo apt install poppler-utils
    sudo apt install tesseract-ocr

In [2]:
%pwd

'/home/ahmed/project/GenAI-Repro/research'

In [3]:
import os
import shutil
from typing import Any

import pytesseract
from unstructured.partition.pdf import partition_pdf

# Prefer the Tesseract binary found on PATH (e.g. installed with `apt install tesseract-ocr`).
# The default Windows install location is kept only as a fallback, so the cell no longer
# points pytesseract at a non-existent path on Linux/macOS.
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

# The PDF is read from the current working directory; `output_path` is the folder
# passed to partition_pdf for extracted images and listed again by the next cell.
input_path = os.getcwd()
output_path = os.path.join(input_path, "figures")

# Partition the PDF into elements: images are extracted to `output_path`, table
# structure inference is enabled, and the text is chunked with the "by_title" strategy
# using the character limits below.
raw_pdf_elements = partition_pdf(
    filename=os.path.join(input_path, "image_table_columns.pdf"),
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=output_path,
)



The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [4]:
# Display the list of elements returned by partition_pdf.
raw_pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x7bdf83b680b0>]

In [5]:
# Print the string form of every partitioned element to inspect the extracted text.
for pdf_element in raw_pdf_elements:
    print(pdf_element)

>

FINANCIAL

STATEMENT

Explore our financial performance through balance sheets, income, and cash flow statements.

DELAITTE

StartupAI boasts an impressive return on investment (ROI), demonstrating its financial acumen and ability to generate substantial profits for its stakeholders through strategic decisions and operational excellence.

GROSS INCOME 22,000,000 $ TOTAL EXPENSES 2.000,000 $ TAXES 5,000.000 $ NET INCOME 15,000,000 $

33% RO!

StartupAI has achieved a remarkable $22 million in sales, showcasing its market dominance and strong customer appeal.

25 20 15 10 5 0 2020 2021 2022 2023 2024

www.startupAI.com

+123-456-7890


In [6]:
import base64

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file contents, decoded to a UTF-8 ``str``.
    """
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()

    return base64.b64encode(raw_bytes).decode("utf-8")


from unstructured.documents.elements import CompositeElement, Table

# File extensions (lower-case) that count as extracted images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

text_elements = []
table_elements = []
image_elements = []

# Text and table extraction: classify by element class rather than by searching the
# class name inside str(type(...)), which matches any type whose qualified name
# merely contains the substring.
for element in raw_pdf_elements:
    if isinstance(element, CompositeElement):
        text_elements.append(element)
    elif isinstance(element, Table):
        table_elements.append(element)

# Image extraction: base64-encode every image file written to `output_path`.
# The listing is sorted so the order of `image_elements` is stable across runs, and a
# missing folder (nothing was extracted) yields an empty list instead of an error.
image_files = sorted(os.listdir(output_path)) if os.path.isdir(output_path) else []
for image_file in image_files:
    # Compare case-insensitively so files such as "figure.PNG" are not skipped.
    if image_file.lower().endswith(IMAGE_EXTENSIONS):
        image_path = os.path.join(output_path, image_file)
        image_elements.append(encode_image(image_path=image_path))

# Plain-text payloads of the classified elements, used by the summarization cells.
text = [element.text for element in text_elements]
table = [element.text for element in table_elements]

# Tables
print("number of table elements in the pdf file: ", len(table_elements))

# Text
print("number of text elements in the pdf file: ", len(text_elements))

# Images
print("number of image elements in the pdf file: ", len(image_elements))


number of table elements in the pdf file:  0
number of text elements in the pdf file:  1
number of image elements in the pdf file:  8


In [7]:
# Display the elements that were classified as text.
text_elements

[<unstructured.documents.elements.CompositeElement at 0x7bdf83b680b0>]

In [8]:
# Collect the plain text of each text element, then display the resulting list.
text = [text_element.text for text_element in text_elements]

text

['>\n\nFINANCIAL\n\nSTATEMENT\n\nExplore our financial performance through balance sheets, income, and cash flow statements.\n\nDELAITTE\n\nStartupAI boasts an impressive return on investment (ROI), demonstrating its financial acumen and ability to generate substantial profits for its stakeholders through strategic decisions and operational excellence.\n\nGROSS INCOME 22,000,000 $ TOTAL EXPENSES 2.000,000 $ TAXES 5,000.000 $ NET INCOME 15,000,000 $\n\n33% RO!\n\nStartupAI has achieved a remarkable $22 million in sales, showcasing its market dominance and strong customer appeal.\n\n25 20 15 10 5 0 2020 2021 2022 2023 2024\n\nwww.startupAI.com\n\n+123-456-7890']

In [9]:
table

[]

## Prompts and LLM summarization

In [14]:
from langchain_classic.schema.messages import HumanMessage,AIMessage


def text_summerize(text_element):
    """Summarize extracted document text with the notebook's chat model.

    Args:
        text_element: The text to summarize. A list or tuple is joined with
            blank lines first, so the prompt carries readable text instead of
            a Python ``repr`` with escaped newlines; any other value is
            converted with ``str``.

    Returns:
        The ``content`` of the model's reply, or ``""`` when there is no text
        to summarize (the model is not called in that case).
    """
    if isinstance(text_element, (list, tuple)):
        text_element = "\n\n".join(str(part) for part in text_element)
    else:
        text_element = str(text_element)

    # Nothing to summarize: skip the API call rather than asking the model
    # to summarize an empty prompt.
    if not text_element.strip():
        return ""

    prompt = f"Summarize the following text:\n\n{text_element}\n\nSummary:"
    response = llm.invoke([HumanMessage(content=prompt)])

    return response.content


def table_summerize(table_element):
    """Summarize extracted table text with the notebook's chat model.

    Args:
        table_element: The table text to summarize. A list or tuple is joined
            with blank lines first, so the prompt carries readable text instead
            of a Python ``repr``; any other value is converted with ``str``.

    Returns:
        The ``content`` of the model's reply, or ``""`` when there is no table
        text to summarize (the model is not called in that case).
    """
    if isinstance(table_element, (list, tuple)):
        table_element = "\n\n".join(str(part) for part in table_element)
    else:
        table_element = str(table_element)

    # An empty table list (no tables found in the PDF) would otherwise send
    # "[]" to the model and invite a made-up summary.
    if not table_element.strip():
        return ""

    prompt = f"Summarize the following table:\n\n{table_element}\n\nSummary:"
    response = llm.invoke([HumanMessage(content=prompt)])

    return response.content

def image_summarize(encoded_image, mime_type=None):
    """Ask the chat model to describe a base64-encoded image.

    Args:
        encoded_image: Base64 string of the image bytes.
        mime_type: MIME type to declare in the data URL. When ``None`` it is
            inferred from the payload: base64 data beginning with the PNG
            signature is sent as ``image/png``, everything else as
            ``image/jpeg`` (the previously hard-coded value).

    Returns:
        The ``content`` of the model's reply.
    """
    # NOTE(review): the configured `llm` must accept image input for this call
    # to succeed — confirm the selected model supports vision.
    if mime_type is None:
        # "iVBORw0KGg" is the base64 form of the leading PNG signature bytes.
        if encoded_image.startswith("iVBORw0KGg"):
            mime_type = "image/png"
        else:
            mime_type = "image/jpeg"

    message = HumanMessage(
        content=[
            {"type": "text", "text": "Describe the contents of this image in simple words."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encoded_image}"
                }
            }
        ]
    )

    response = llm.invoke([message])
    return response.content

def image_to_text_then_groq(ocr_text):
    """Summarize text that was extracted from an image, using the chat model.

    Args:
        ocr_text: The extracted text. A list or tuple is joined with blank
            lines first, so the prompt carries readable text instead of a
            Python ``repr`` with escaped newlines; any other value is
            converted with ``str``.

    Returns:
        The ``content`` of the model's reply, or ``""`` when there is no text
        to summarize (the model is not called in that case).
    """
    if isinstance(ocr_text, (list, tuple)):
        ocr_text = "\n\n".join(str(part) for part in ocr_text)
    else:
        ocr_text = str(ocr_text)

    # Nothing was extracted: skip the API call.
    if not ocr_text.strip():
        return ""

    prompt = f"""
    The following text was extracted from an image.
    Summarize it clearly:

    {ocr_text}
    """
    response = llm.invoke(prompt)
    return response.content


# Summarize the extracted text and show the model's reply as the cell output.
response = text_summerize(text)
response

# Alternative calls, left disabled:
# response = table_summerize(table_element=table)
# response

# response = image_to_text_then_groq(ocr_text=text)
# response

'StartupAI has reported a strong financial performance. \n\n- The company generated $22 million in gross income.\n- Total expenses were $2 million.\n- Taxes were $5 million, resulting in a net income of $15 million.\n- The return on investment (ROI) was 33% ($15 million net income out of $45 million total costs).\n- The company demonstrated its market dominance and strong customer appeal through its impressive sales figures.'

In [15]:
# Run the "text extracted from an image" prompt over the same text and show the reply.
response = image_to_text_then_groq(text)
response

"Here's a clear summary of the text:\n\n**StartupAI's Financial Performance**\n\n* **Revenue**: $22,000,000 (2024)\n* **Total Expenses**: $20,000,000 (2024)\n* **Taxes**: $5,000,000 (2024)\n* **Net Income**: $15,000,000 (2024)\n* **Return on Investment (ROI)**: 33% (impressive)\n\n**Key Achievements**\n\n* Achieved remarkable sales of $22 million in 2024\n* Demonstrated market dominance and strong customer appeal\n* Boasts an impressive return on investment (ROI)"

## AzureAIDocumentIntelligenceLoader

In [None]:
import os

import pandas as pd
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# ----------------------------
# 1. Configuration
# ----------------------------
# The endpoint and key are read from the environment so the secret never has to be
# pasted into the notebook; the original placeholders remain as fallbacks.
file_path = "<filepath>"
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "<endpoint>")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "<key>")
analysis_features = ["ocrHighResolution"]
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint,
    api_key=key,
    file_path=file_path,
    api_model="prebuilt-layout",
    analysis_features=analysis_features,
)

# ----------------------------
# 2. Load the PDF
# ----------------------------
docs = loader.load()
print(f"Total pages loaded: {len(docs)}")

# ----------------------------
# 3. Text chunking
# ----------------------------
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,     # at most 1000 characters per chunk
    chunk_overlap=100    # 100 characters of overlap between chunks
)

chunks = text_splitter.split_documents(docs)
print(f"Total text chunks: {len(chunks)}")

# Example: show the first 2 chunks
for i, chunk in enumerate(chunks[:2]):
    print(f"\n--- Chunk {i+1} ---")
    print(chunk.page_content[:300])

# ----------------------------
# 4. Table processing (optional)
# ----------------------------
# NOTE(review): this assumes the loader exposes detected tables under
# metadata["tables"], each with a "cells" entry — confirm against the loader's
# actual output; when the key is absent the loop prints nothing.
for doc in docs:
    azure_tables = doc.metadata.get("tables")
    if azure_tables:
        print("\nFound table(s) on page:", doc.metadata.get("page_number"))
        # The loop variable is deliberately not named `table`, which would
        # overwrite the notebook-level `table` list built from the unstructured elements.
        for azure_table in azure_tables:
            df = pd.DataFrame(azure_table["cells"])  # convert the cells to a DataFrame
            print(df.head())  # table preview

# ----------------------------
# 5. Image processing (optional)
# ----------------------------
# NOTE(review): assumes image info is available under metadata["images"] — confirm.
for doc in docs:
    page_images = doc.metadata.get("images")
    if page_images:
        # .get() keeps a missing page number from raising KeyError, matching the table loop.
        print(f"\nImages found on page {doc.metadata.get('page_number')}:")
        for img in page_images:
            print("Image info:", img)  # path / coordinates / type

# ----------------------------
# 6. Ready for embeddings / retrieval
# ----------------------------
# `chunks` (or the structured `docs`) are now ready for a vector DB or QA.
