In [2]:
import fitz  
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

import os
from dotenv import load_dotenv
load_dotenv()

os.environ["Gemini_API_KEY"]=os.getenv("Gemini_API_KEY")

clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 8176.03it/s]


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:

def embed_image(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data, str):  # If path
        image = Image.open(image_data).convert("RGB")
    else:  
        image = image_data
    
    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)

        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()
    
def embed_text(text):
    """Embed text using CLIP."""
    inputs = clip_processor(
        text=text, 
        return_tensors="pt", 
        padding=True,
        truncation=True,
        max_length=77 
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [5]:

pdf_path="multimodel_sample.pdf"
doc=fitz.open(pdf_path)
all_docs = []
all_embeddings = []
image_data_store = {}  
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [6]:
doc


Document('multimodel_sample.pdf')

In [7]:
for i,page in enumerate(doc):
    text=page.get_text()
    if text.strip():
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)


    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            image_id = f"page_{i}_img_{img_index}"
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)
            
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()

The channel dimension is ambiguous. Got image shape (3, 3, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
The channel dimension is ambiguous. Got image shape (3, 3, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
The channel dimension is ambiguous. Got image shape (3, 3, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
The channel dimension is ambiguous. Got image shape (3, 3, 3). A

In [8]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='The DESIGN\nof EVERYDAY\nTHINGS\nDON\nNORMAN\nR E V I S E D  &  E X PA N D E D  E D I T I O N'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_1'}, page_content='[Image: page_0_img_1]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_2'}, page_content='[Image: page_0_img_2]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_3'}, page_content='[Image: page_0_img_3]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_4'}, page_content='[Image: page_0_img_4]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_5'}, page_content='[Image: page_0_img_5]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_6'}, page_content='[Image: page_0_img_6]'),
 Document(metadata={'page': 0

In [9]:
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[ 2.6462495e-03, -1.3403277e-02,  6.1281621e-03, ...,
        -7.8779878e-05,  6.6685337e-03, -3.6553171e-02],
       [ 2.5418619e-02,  3.2517876e-02, -1.9958112e-02, ...,
         6.5464020e-02,  7.8078038e-03,  9.4257500e-03],
       [ 5.4080281e-03, -4.5190640e-03, -3.7746556e-02, ...,
         5.9468720e-02,  4.2817085e-03, -4.4071637e-03],
       ...,
       [-1.8492134e-02,  1.0361141e-02, -3.8550757e-02, ...,
        -1.0309402e-02, -2.1211780e-03,  9.6145961e-03],
       [-7.3007862e-03,  1.9200046e-02, -2.3069795e-02, ...,
        -9.8516680e-03, -1.2135769e-02,  1.2169234e-02],
       [-3.5051645e-03, -2.6859859e-02,  1.8868445e-03, ...,
         2.4294658e-02, -1.1045860e-02, -7.3617771e-02]],
      shape=(2234, 512), dtype=float32)

In [23]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='The DESIGN\nof EVERYDAY\nTHINGS\nDON\nNORMAN\nR E V I S E D  &  E X PA N D E D  E D I T I O N'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_1'}, page_content='[Image: page_0_img_1]'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_2'}, page_content='[Image: page_0_img_2]'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_3'}, page_content='[Image: page_0_img_3]'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_4'}, page_content='[Image: page_0_img_4]'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_5'}, page_content='[Image: page_0_img_5]'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_6'}, page_content='[Image: page_0_img_6]'),
  Document(metadata={

In [24]:
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x1e7852ba3c0>

In [15]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os
os.environ["GOOGLE_API_KEY"] = "api_key"

# Initialize Gemini model with explicit API key
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",  
    temperature=0.7,
    google_api_key=os.environ["GOOGLE_API_KEY"]  # <--- important!
)

print(llm.invoke("Hello Gemini, are you working now?"))


content='As a large language model, I\'m always "working" in the sense that I\'m available to respond to prompts.  So yes, I\'m currently working and ready to assist you.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []} id='run--d174cfa6-61e1-4038-9954-897106d3eb06-0' usage_metadata={'input_tokens': 8, 'output_tokens': 42, 'total_tokens': 50, 'input_token_details': {'cache_read': 0}}


In [19]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    query_embedding = embed_text(query)
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    return results

In [20]:

def create_multimodal_message(query, retrieved_docs):
    content = []
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })
    
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })
    
    return HumanMessage(content=content)

In [21]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    context_docs = retrieve_multimodal(query, k=5)
    message = create_multimodal_message(query, context_docs)
    response = llm.invoke([message])
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response.content

In [26]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What is happend between  1988 to 2013?",
        "Summarize the main findings The DESIGN of EVERYDAY THINGS",
       
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What is happend between  1988 to 2013?
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 292: cult or even impossible. This is the legacy problem once again: the 
heavy momentum of legacy inhibi...
  - Text from page 24: We have to accept human behavior the way it is, not the way we 
would wish it to be.
  - Text from page 275: opportunity to develop things that assist and enrich the lives of 
people, that bring benefits and e...
  - Text from page 5: For Julie
  - Text from page 122: topic, and you will discover that scientists who work in that area 
are continually disagreeing.


Answer: The provided text excerpts give no information about what happened between 1988 and 2013.  The excerpts contain general statements about human behavior, legacy problems, scientific disagreements, and dedications, but no historical context or events within a specific timeframe.

Query: Summarize the main findings The DESIGN of EVERYDAY THINGS
-------