In [1]:
pip install torch transformers langchain faiss-cpu pymupdf pillow scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install langchain-community

Note: you may need to restart the kernel to use updated packages.


In [3]:
import fitz
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Clip Model
import os
from dotenv import load_dotenv
load_dotenv()

#  set up the environment
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

#  initialize the CLIP model for unified embeddings
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [9]:
# Embedding functions
def embed_image(image_data):
    """Embed an image using CLIP"""
    if isinstance(image_data, str):
        image = Image.open(image_data).convert("RGB")
    else:
        image = image_data

    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

def embed_text(text):
    """Embed text using CLIP"""
    inputs = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77 # CLIP max token length
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [30]:
# Process PDF
pdf_path="performance_prediction.pdf"
doc=fitz.open(pdf_path)
# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {} # Store image data for LLM

# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [11]:
doc

Document('multimodal_sample.pdf')

In [12]:
for i,page in enumerate(doc):
    # Process text
    text = page.get_text()
    if text.strip():
        # create temporary document for splitting
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        # Embed each chunk using CLIP
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)

    # Process images
    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = image_base64

            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata = {"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)

        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()
        

In [13]:
all_embeddings

[array([-2.67243292e-03,  1.28300078e-02, -5.18314131e-02,  4.14879359e-02,
        -2.33941767e-02, -7.55864056e-03, -3.67659293e-02,  1.19710699e-01,
         8.52080807e-02,  2.05426570e-03, -1.11534707e-02, -1.29592167e-02,
         5.25014512e-02, -3.65391700e-03,  4.76078540e-02,  1.58372968e-02,
         2.03388296e-02,  4.35362011e-02, -3.29169002e-03,  2.03181449e-02,
         1.88025483e-03, -4.23493870e-02,  5.44100394e-03,  3.70935723e-02,
        -1.65623091e-02,  6.48645870e-03, -4.78012003e-02,  8.67485628e-03,
         5.88859506e-02, -3.21394131e-02,  4.32440154e-02,  9.65301972e-03,
        -4.47924202e-03, -1.94857828e-02, -3.63503024e-02, -1.23471608e-02,
        -2.17929389e-02, -1.99016184e-02,  8.09619799e-02, -3.32986601e-02,
        -2.38901339e-02, -3.96138802e-02, -1.27279945e-02,  3.50380838e-02,
        -2.52217259e-02,  2.00031837e-03,  1.49660185e-02, -2.31976416e-02,
        -6.86791167e-02, -5.25787182e-04, -2.22545844e-02, -1.04104038e-02,
        -1.9

In [14]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')]

In [17]:
# create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.00267243,  0.01283001, -0.05183141, ..., -0.00385086,
         0.02977715, -0.00010682],
       [ 0.01732343, -0.0132769 , -0.0242703 , ...,  0.08994047,
        -0.00272156,  0.03253041]], shape=(2, 512), dtype=float32)

In [18]:
(all_docs, embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')],
 array([[-0.00267243,  0.01283001, -0.05183141, ..., -0.00385086,
          0.02977715, -0.00010682],
        [ 0.01732343, -0.0132769 , -0.0242703 , ...,  0.08994047,
         -0.00272156,  0.03253041]], shape=(2, 512), dtype=float32))

In [20]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings = [(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,
    metadatas=[doc.metadata for doc in all_docs]
)

vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x1e4ff6e4d00>

In [22]:
# Initialize GPT-4 Vision model
llm = ChatOpenAI(model="gpt-4o", temperature=0)
llm

ChatOpenAI(profile={'max_input_tokens': 128000, 'max_output_tokens': 16384, 'image_inputs': True, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': False, 'tool_calling': True, 'structured_output': True, 'image_url_inputs': True, 'pdf_inputs': True, 'pdf_tool_message': True, 'image_tool_message': True, 'tool_choice': True}, client=<openai.resources.chat.completions.completions.Completions object at 0x000001E4FF5473A0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001E4A5D265C0>, root_client=<openai.OpenAI object at 0x000001E4FF544880>, root_async_client=<openai.AsyncOpenAI object at 0x000001E4A5D26530>, model_name='gpt-4o', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'), stream_usage=True)

In [23]:
def retrieve_multimodal(query, k=5):
    # Unified retrieval using CLIP
    query_embedding = embed_text(query)

    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )

    return results

In [25]:
def create_multimodal_message(query, retrieved_docs):
    # Create a message with both text and images for GPT-4V.

    content = []

    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    # Seperate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })

        # Add images
        for doc in image_docs:
            image_id = doc.metadata.get("image_id")
            if image_id and image_id in image_data_store:
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{image_data_store[image_id]}"
                    }

                })

                # Add instructions
                content.append({
                    "type": "text",
                    "text": "\n\nPlease answer the question based on the provided text and images."
                })

                return HumanMessage(content=content)

In [28]:
def multimodal_pdf_rag_pipeline(query):
    # Main pipeline for multimodal RAG

    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)

    # Create multimodal message
    message = create_multimodal_message(query, context_docs)

    # Get response from GPT-4V
    response = llm.invoke([message])

    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    return response.content
    

In [29]:
if __name__ == "__main__":
    # Test the pipeline
    queries = [
        "What is the main idea of the document?",
        "Summarize the key points of the document.",
        "What visual elements are present in the document?",
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("-" * 70)


Query: What is the main idea of the document?
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: The main idea of the document is to summarize the revenue trends over the first three quarters of the year, highlighting steady growth with the highest increase in Q3. The growth is attributed to new product lines in Q1, effective marketing in Q2, and global expansion in Q3.
----------------------------------------------------------------------

Query: Summarize the key points of the document.
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: The document provides an overview of annual revenue trends across the first three quarters