In [None]:
# Install the required libraries into the kernel's environment
%pip install -Uq "unstructured[all-docs]" 
%pip install -Uq langchain_chroma 
%pip install -Uq langchain langchain-community langchain-openai 
%pip install -Uq python_dotenv

In [None]:
import json
from pathlib import Path
from typing import List

from dotenv import load_dotenv

# LangChain components
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Unstructured for document parsing
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

load_dotenv()

In [None]:
# Path that is handed to partition_pdf(filename=...) in the function below.
# NOTE(review): "./" is a directory, not a PDF file — presumably a placeholder;
# point this at the actual document before running.
file_path = "./"

def partition_pdf_elements(file_path):
    """Parse a PDF into unstructured elements, keeping tables and images.

    Args:
        file_path: Path to the PDF file to parse.

    Returns:
        The elements returned by ``partition_pdf``.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing
            file (for example when it is a directory such as "./").
    """
    # Fail fast with a clear message before handing the path to the parser.
    if not Path(file_path).is_file():
        raise FileNotFoundError(f"PDF file not found: {file_path!r}")

    elements = partition_pdf(
        filename=file_path,  # Path to your PDF file
        strategy="hi_res",  # Use the most accurate (but slower) processing method of extraction
        infer_table_structure=True,  # Keep tables as structured HTML, not jumbled text
        extract_image_block_types=["Image"],  # Grab images found in the PDF
        extract_image_block_to_payload=True,  # Store images as base64 data you can actually use
    )
    print(f"Found {len(elements)} elements")
    return elements

# Parse the PDF at file_path into unstructured elements
elements = partition_pdf_elements(file_path)

In [None]:
# Distinct element classes returned by the unstructured parser
{str(type(element)) for element in elements}

In [None]:
# Inspect the first parsed element as a plain dict
elements[0].to_dict()

In [None]:
# Collect the elements whose category is 'Image'
images = [element for element in elements if element.category == 'Image']
print(f"Found {len(images)} images")

# Show the first image element; guarded because a PDF may contain no images.
# Use https://codebeautify.org/base64-to-image-converter to view the base64 text
images[0].to_dict() if images else None

In [None]:
def create_chunks_title(
    elements,
    max_characters=3000,
    new_after_n_chars=2400,
    combine_text_under_n_chars=500,
):
    """Group parsed elements into chunks with ``chunk_by_title``.

    Args:
        elements: The parsed PDF elements from the partitioning step.
        max_characters: Hard limit on the characters per chunk.
        new_after_n_chars: Soft limit; try to start a new chunk after this
            many characters.
        combine_text_under_n_chars: Merge chunks smaller than this with
            their neighbors.

    Returns:
        The chunks returned by ``chunk_by_title``.
    """
    chunks = chunk_by_title(
        elements,
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
        combine_text_under_n_chars=combine_text_under_n_chars,
    )

    print(f"Make {len(chunks)} chunks")
    return chunks

# Group the parsed elements into title-based chunks
chunks = create_chunks_title(elements)

In [None]:
# Distinct classes found among the chunks
{str(type(piece)) for piece in chunks}

In [None]:
# View a single chunk
# chunks[2].to_dict()

# View the last original element that was merged into one chunk.
# NOTE(review): index 11 is hard-coded — presumably picked for one specific PDF;
# it fails on documents that yield fewer than 12 chunks.
chunks[11].metadata.orig_elements[-1].to_dict()

In [None]:
def separate_content_types(chunk):
    """Split one chunk into its text, table contents and base64 images.

    Args:
        chunk: A single chunk exposing ``text`` and, optionally,
            ``metadata.orig_elements`` (the elements merged into it).

    Returns:
        dict with the keys ``'text'`` (the chunk text), ``'tables'`` (one
        entry per Table element: its HTML, or its plain text when no HTML
        is available), ``'images'`` (base64 payloads of Image elements)
        and ``'types'`` (the content kinds present, in first-seen order,
        always starting with ``'text'``).
    """
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': ['text']
    }

    # A metadata attribute may exist but hold None, in which case hasattr()
    # and getattr() defaults do not help. `or` covers both the missing and
    # the unset case so None is never iterated over or stored.
    metadata = getattr(chunk, 'metadata', None)
    orig_elements = getattr(metadata, 'orig_elements', None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, 'metadata', None)

        # Handle tables
        if element_type == 'Table':
            content_data['types'].append('table')
            table_html = getattr(element_metadata, 'text_as_html', None) or element.text
            content_data['tables'].append(table_html)

        # Handle images (only those that actually carry a payload)
        elif element_type == 'Image':
            image_base64 = getattr(element_metadata, 'image_base64', None)
            if image_base64:
                content_data['types'].append('image')
                content_data['images'].append(image_base64)

    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike list(set(...)).
    content_data['types'] = list(dict.fromkeys(content_data['types']))
    return content_data

# Try the helper on a single chunk; it expects one chunk, not the whole list.
separate_content_types(chunks[0])

In [None]:
def create_ai_summary(text: str, tables: List[str], images: List[str]):
    """Ask the LLM for a searchable description of one chunk's content.

    Args:
        text: Raw text of the chunk.
        tables: Table contents to list in the prompt; may be empty.
        images: Base64-encoded images; may be empty. Each one is attached
            to the request as a ``data:image/jpeg;base64,...`` URL.

    Returns:
        The ``content`` of the model response.
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    # Build the text prompt
    prompt_text = f"""You are creating a searchable description for document content retrieval.

    CONTENT TO ANALYZE:
    TEXT CONTENT:
    {text}

    """

    # Add tables if present
    if tables:
        prompt_text += "TABLES:\n"
        for i, table in enumerate(tables):
            prompt_text += f"Table {i+1}:\n{table}\n\n"

    # The task instructions are appended exactly once and regardless of
    # whether the chunk has tables, so this must sit outside the table loop.
    prompt_text += """
            YOUR TASK:
            Generate a comprehensive, searchable description that covers:

            1. Key facts, numbers, and data points from text and tables
            2. Main topics and concepts discussed  
            3. Questions this content could answer
            4. Visual content analysis (charts, diagrams, patterns in images)
            5. Alternative search terms users might use

            Make it detailed and searchable - prioritize findability over brevity.

            SEARCHABLE DESCRIPTION:"""

    # Build message content starting with text
    message_content = [{"type": "text", "text": prompt_text}]

    # Add images to the message
    for image_base64 in images:
        message_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
        })

    # Send to AI and get response
    message = HumanMessage(content=message_content)
    response = llm.invoke([message])

    return response.content

In [None]:
def create_summary(chunks):
    """Turn chunks into LangChain documents that carry an AI summary.

    For every chunk the summary becomes ``page_content``, while the raw
    text, table contents and base64 images are stored as a JSON string
    under ``metadata["original_content"]``.

    Args:
        chunks: Iterable of chunks accepted by ``separate_content_types``.

    Returns:
        A list with one ``Document`` per chunk, in input order.
    """
    langchain_documents = []

    for chunk in chunks:
        content = separate_content_types(chunk)
        summary = create_ai_summary(content['text'], content['tables'], content['images'])

        doc = Document(
            page_content=summary,
            metadata={
                # The originals come from `content`; `summary` is the string
                # returned by the LLM and cannot be indexed by key.
                "original_content": json.dumps({
                    "raw_text": content['text'],
                    "tables_html": content['tables'],
                    "images_base64": content['images']
                })
            }
        )

        langchain_documents.append(doc)
    return langchain_documents

# Summarise every chunk (create_summary calls create_ai_summary once per chunk)
proccesed_chunks = create_summary(chunks)

In [None]:
# Helper: export the processed chunks to a JSON file
def export_chunks_to_json(chunks, output="chunks.json"):
    """Write documents to a JSON file and return the exported records.

    Each record holds a 1-based ``chunk_id``, the document's
    ``page_content`` as ``enhanced_content`` and the decoded
    ``original_content`` metadata (an empty dict when the key is absent).

    Args:
        chunks: Documents exposing ``page_content`` and a ``metadata`` dict.
        output: Destination path of the JSON file.

    Returns:
        The list of exported records.
    """
    export_data = [
        {
            "chunk_id": position,
            "enhanced_content": document.page_content,
            "metadata": {
                "original_content": json.loads(
                    document.metadata.get("original_content", "{}")
                )
            },
        }
        for position, document in enumerate(chunks, start=1)
    ]

    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    with open(output, mode='w', encoding='utf-8') as handle:
        json.dump(export_data, handle, indent=2, ensure_ascii=False)

    print(f"Exported {len(export_data)} chunks to {output}")
    return export_data
    
# Write the summarised documents to the default output file, chunks.json
json_data = export_chunks_to_json(proccesed_chunks)

In [None]:
def create_vector_store(chunks, directory="db/chroma_db", embedding_model=None):
    """Embed documents and store them in a Chroma collection.

    Args:
        chunks: Documents to index.
        directory: Value passed to Chroma as ``persist_directory``.
        embedding_model: Embeddings object used to vectorise the documents.
            Defaults to ``OpenAIEmbeddings(model="text-embedding-3-small")``;
            pass another embeddings instance (e.g. a Hugging Face one) to
            override it.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    if embedding_model is None:
        # HuggingFaceEndpointEmbeddings was referenced here without ever
        # being imported (NameError). Default to the embeddings class this
        # notebook already imports; other models can be injected by the caller.
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=directory,
        # NOTE(review): "hnsw:space" is presumably Chroma's distance-metric
        # setting for the collection index — confirm against the Chroma docs.
        collection_metadata={"hnsw:space": "cosine"}
    )

    return vector_store

# Build the Chroma vector store from the summarised documents
db = create_vector_store(proccesed_chunks)

In [None]:
# Example retrieval - fetch the top 3 results for a sample question
query = "What are the two main components of the Transformer architecture? "
retriever = db.as_retriever(search_kwargs={"k": 3})
# Use a dedicated name here: rebinding `chunks` would replace the parsed PDF
# chunks with retrieved documents and break re-running the earlier cells.
retrieved_chunks = retriever.invoke(query)
# export_chunks_to_json(retrieved_chunks, "rag_results.json")

In [None]:
# def complete_ingestion_pipeline(file_path):
#     elements = partition_pdf_elements(file_path)
#     chunks = create_chunks_title(elements)
#     proccesed_chunks = create_summary(chunks)
#     db = create_vector_store(proccesed_chunks)
#     return db

# db = complete_ingestion_pipeline(file_path)

In [None]:
# simple query - answer

# NOTE(review): `chunks` is rebound here from the parsed PDF chunks to the
# retrieved documents, so earlier cells that take `chunks` should not be
# re-run after this one without re-creating the original value.
query = "What are the two main components of the Transformer architecture? "
retriever = db.as_retriever(search_kwargs={"k": 3})
chunks = retriever.invoke(query)

def generate_answer(chunks, query=None):
    """Answer a question with the LLM, based on the retrieved documents.

    Args:
        chunks: Retrieved documents. When a document's ``metadata`` has an
            ``"original_content"`` JSON string, its ``raw_text``,
            ``tables_html`` and ``images_base64`` entries are added to the
            request.
        query: The question to answer. When omitted, the notebook-level
            ``query`` variable is used, which is what the function read
            before this parameter existed.

    Returns:
        The ``content`` of the model response.

    Raises:
        ValueError: If ``query`` is omitted and no global ``query`` is set.
    """
    if query is None:
        # Backward compatibility with the one-argument call style.
        query = globals().get("query")
        if query is None:
            raise ValueError("generate_answer() needs a query to answer")

    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    # Build the text prompt
    prompt_text = f"""Based on the following documents, please answer this question: {query}

    CONTENT TO ANALYZE:
    """

    # Images are gathered in the same pass so each document's JSON payload
    # is decoded only once.
    all_images_base64 = []

    for i, chunk in enumerate(chunks):
        prompt_text += f"--- Document {i+1} ---\n"

        if "original_content" in chunk.metadata:
            original_data = json.loads(chunk.metadata["original_content"])

            # Add raw text
            raw_text = original_data.get("raw_text", "")
            if raw_text:
                prompt_text += f"TEXT:\n{raw_text}\n\n"

            # Add tables as HTML
            tables_html = original_data.get("tables_html", [])
            if tables_html:
                prompt_text += "TABLES:\n"
                for j, table in enumerate(tables_html):
                    prompt_text += f"Table {j+1}:\n{table}\n\n"

            all_images_base64.extend(original_data.get("images_base64", []))

        prompt_text += "\n"

    prompt_text += """
        Please provide a clear, comprehensive answer using the text, tables, and images above. If the documents don't contain sufficient information to answer the question, say "I don't have enough information to answer that question based on the provided documents."

        ANSWER:"""

    # Build message content starting with text, followed by all images
    message_content = [{"type": "text", "text": prompt_text}]
    for image_base64 in all_images_base64:
        message_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
        })

    # Send to AI and get response
    message = HumanMessage(content=message_content)
    response = llm.invoke([message])

    return response.content

# Generate the answer for the retrieved documents and print it
final_answer = generate_answer(chunks, query)
print(final_answer)