<a href="https://colab.research.google.com/github/Divyaanshvats/Open-Insights---DS---Gen.AI---Task/blob/main/OPEN_INSIGHTS_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **INSTALLATION OF ALL THE REQUIRED LIBRARIES**

In [None]:
!pip install torch
!pip install chromadb
!pip install numpy
!pip install pymupdf
!pip install requests
!pip install Pillow
!pip install transformers



# **LOADING THE PDF AND EXTRACTING ITS CONTENT**

In [None]:
import fitz  # PyMuPDF
import os
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        list[str]: One string per page, in page order.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the handle stayed open for the lifetime of the kernel.
    with fitz.open(pdf_path) as doc:
        return [page.get_text() for page in doc]


In [None]:
if __name__ == "__main__":
    # Smoke test: extract the paper's text and preview the start of every page.
    pdf_path = '/content/NIPS-2017-attention-is-all-you-need-Paper.pdf'
    pages = extract_text_from_pdf(pdf_path)

    print(f"Total pages: {len(pages)}\n")

    for page_number, page_text in enumerate(pages, start=1):
        header = f"--- Page {page_number} ---"
        preview = page_text[:100]  # only the first 100 characters
        print(header)
        print(preview)
        print("\n")


Total pages: 11

--- Page 1 ---
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai


--- Page 2 ---
Recurrent models typically factor computation along the symbol positions of the input and output
seq


--- Page 3 ---
Figure 1: The Transformer - model architecture.
wise fully connected feed-forward network. We employ


--- Page 4 ---
Scaled Dot-Product Attention
Multi-Head Attention
Figure 2: (left) Scaled Dot-Product Attention. (ri


--- Page 5 ---
MultiHead(Q, K, V ) = Concat(head1, ..., headh)W O
where headi = Attention(QW Q
i , KW K
i , V W V
i


--- Page 6 ---
Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations
for 


--- Page 7 ---
the input sequence centered around the respective output position. This would increase the maximum
p


--- Page 8 ---
Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the
En


--- Page 9 ---
Table 3: Variations on t

# **THIS TIME FOR IMAGES**

In [None]:
import fitz  # PyMuPDF
from PIL import Image
import io

def extract_images_from_pdf(pdf_path):
    """Collect every embedded image of a PDF as an RGB PIL image.

    NOTE: a later cell defines another function with this same name (it saves
    the images to disk and returns a directory); once that cell has run, this
    in-memory variant is shadowed.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        list[dict]: One entry per image reference found on a page, with keys
        ``"page"`` (1-based page number), ``"image"`` (PIL image converted to
        RGB) and ``"name"`` (``"page{P}_img{I}"``, both indices 1-based).
    """
    images = []

    # Close the document deterministically instead of leaking the handle.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for image_number, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first field of the image tuple is its xref
                image_bytes = doc.extract_image(xref)["image"]
                img_pil = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                images.append({
                    "page": page_number,
                    "image": img_pil,
                    "name": f"page{page_number}_img{image_number}"
                })

    return images


In [None]:
from IPython.display import display

pdf_path = "/content/NIPS-2017-attention-is-all-you-need-Paper.pdf"
images = extract_images_from_pdf(pdf_path)

print(f"Extracted {len(images)} images")

# A PDF without embedded images yields an empty list, so guard the preview
# instead of failing with IndexError. display() renders the image inline in the
# notebook; PIL's .show() hands it to an external viewer, which presumably is
# not available on a hosted kernel.
if images:
    display(images[0]["image"])

Extracted 3 images


# **LOADING THE EMBEDDING MODEL FROM HUGGING FACE**

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: saved notebooks and their
# outputs get shared and committed. Read it from the HF_TOKEN environment
# variable and fall back to an interactive, non-echoing prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np

class ClipEmbedder:
    """Wraps a CLIP checkpoint and exposes text and image embedding helpers."""

    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        """Load the CLIP model and processor for ``model_name``.

        The model is moved to CUDA when ``torch.cuda.is_available()`` reports
        a GPU, otherwise it stays on the CPU.
        """
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        clip = CLIPModel.from_pretrained(model_name)
        self.model = clip.to(self.device)
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def encode_text(self, texts):
        """Embed ``texts`` (padded and truncated by the processor); returns a NumPy array."""
        batch = self.processor(
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        batch = batch.to(self.device)
        with torch.no_grad():
            features = self.model.get_text_features(**batch)
        # Bring the features back to host memory before converting to NumPy.
        return features.cpu().numpy()

    def encode_images(self, images):
        """Embed ``images`` with the CLIP image tower; returns a NumPy array."""
        batch = self.processor(images=images, return_tensors="pt")
        batch = batch.to(self.device)
        with torch.no_grad():
            features = self.model.get_image_features(**batch)
        return features.cpu().numpy()

In [None]:
# Embed all extracted PIL images in a single CLIP batch and report the matrix shape.
embedder = ClipEmbedder()
image_embeddings = embedder.encode_images(
    [entry["image"] for entry in images]  # 'images' comes from the extraction cell
)
print(image_embeddings.shape)

(3, 512)


# **Import ChromaDB and Create and Populate the Vector Store**

In [None]:
!pip install flash_attn#highly optimized attention mechanism designed for use in Transformer



# **LET'S USE OUR EMBEDDING MODEL [openai/clip-vit-base-patch32]**

In [None]:
import fitz
import os

def extract_images_from_pdf(pdf_path, image_dir="extracted_images"):
    """Save every embedded image of a PDF into ``image_dir``.

    NOTE: this redefines (and therefore shadows) the earlier in-memory
    ``extract_images_from_pdf`` from a previous cell.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).
        image_dir: Output directory; created when it does not exist.

    Returns:
        str: ``image_dir``. Files are named ``page{P}_img{I}.{ext}`` with
        1-based page and image indices and the extension reported by PyMuPDF.
    """
    # Create folder if it doesn't exist
    os.makedirs(image_dir, exist_ok=True)

    image_count = 0

    # Close the document deterministically instead of leaking the handle.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for image_number, img in enumerate(page.get_images(full=True), start=1):
                base_image = doc.extract_image(img[0])  # img[0] is the xref
                image_filename = f"page{page_number}_img{image_number}.{base_image['ext']}"

                # Write the raw image bytes exactly as stored in the PDF.
                with open(os.path.join(image_dir, image_filename), "wb") as img_file:
                    img_file.write(base_image["image"])

                image_count += 1

    print(f"✅ Extracted {image_count} images from {pdf_path} into '{image_dir}'")
    return image_dir


In [None]:
if __name__ == "__main__":
    # Dump the paper's embedded images to the default 'extracted_images' folder.
    pdf_path = "/content/NIPS-2017-attention-is-all-you-need-Paper.pdf"
    img_folder = extract_images_from_pdf(pdf_path=pdf_path)


✅ Extracted 3 images from /content/NIPS-2017-attention-is-all-you-need-Paper.pdf into 'extracted_images'


In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np

# Shared CLIP model + processor used for both text and image embeddings below.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)  # Module.to returns the module itself
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def extract_and_embed_images(pdf_path):
    """Embed every embedded image of a PDF with the shared CLIP model.

    Uses the module-level ``clip_processor``, ``clip_model`` and ``device``.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        tuple: ``(embeddings, metadata)`` where ``embeddings`` is a NumPy array
        holding one flattened embedding per image and ``metadata`` is a list of
        dicts with keys ``"page"`` (1-based), ``"type"`` (always ``"image"``)
        and ``"description"``.
    """
    # Local import: this cell's own import list does not bring in io, so the
    # function previously only worked because an earlier cell had imported it.
    import io

    image_embeddings = []
    metadata = []

    # Close the document deterministically instead of leaking the handle.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for image_number, img in enumerate(page.get_images(full=True), start=1):
                image_bytes = doc.extract_image(img[0])["image"]  # img[0] is the xref

                # Load as PIL image
                pil_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")

                # Create embedding (one image per forward pass, gradients disabled)
                inputs = clip_processor(images=pil_img, return_tensors="pt").to(device)
                with torch.no_grad():
                    emb = clip_model.get_image_features(**inputs)

                image_embeddings.append(emb.cpu().numpy().flatten())
                metadata.append({
                    "page": page_number,
                    "type": "image",
                    "description": f"Image {image_number} from page {page_number}"
                })

    print(f"✅ Embedded {len(image_embeddings)} images from {pdf_path}")
    return np.array(image_embeddings), metadata


In [None]:
if __name__ == "__main__":
    # Embed the paper's images; keeps the vectors and their per-image metadata.
    pdf_path = "/content/NIPS-2017-attention-is-all-you-need-Paper.pdf"
    img_embeddings, img_metadata = extract_and_embed_images(pdf_path=pdf_path)


✅ Embedded 3 images from /content/NIPS-2017-attention-is-all-you-need-Paper.pdf


In [None]:
def embed_text_chunks(text_pages, chunk_size=20):
    """Split page texts into word chunks and embed each chunk with CLIP.

    Uses the module-level ``clip_processor``, ``clip_model`` and ``device``.

    Args:
        text_pages: list of strings (one per page).
        chunk_size: number of whitespace-separated words per chunk; must be >= 1.

    Returns:
        tuple: ``(embeddings, metadata)`` where ``embeddings`` is a NumPy array
        with one flattened embedding per chunk and ``metadata`` is a list of
        dicts with keys ``"page"`` (1-based), ``"type"`` (always ``"text"``)
        and ``"content"`` (the chunk text).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    text_embeddings = []
    metadata = []

    for page_number, page_text in enumerate(text_pages, start=1):
        words = page_text.split()

        # Split into chunks
        for start in range(0, len(words), chunk_size):
            chunk = " ".join(words[start:start + chunk_size]).strip()
            if not chunk:
                continue

            # Embed the text chunk. truncation=True keeps chunks that tokenize
            # to more tokens than the text encoder accepts from failing; this
            # matches ClipEmbedder.encode_text, which already truncates.
            inputs = clip_processor(
                text=[chunk],
                return_tensors="pt",
                padding=True,
                truncation=True,
            ).to(device)
            with torch.no_grad():
                emb = clip_model.get_text_features(**inputs)

            text_embeddings.append(emb.cpu().numpy().flatten())
            metadata.append({
                "page": page_number,
                "type": "text",
                "content": chunk
            })

    print(f"✅ Embedded {len(text_embeddings)} text chunks from {len(text_pages)} pages")
    return np.array(text_embeddings), metadata

In [None]:
if __name__ == "__main__":
    pdf_path = "/content/NIPS-2017-attention-is-all-you-need-Paper.pdf"

    # Step 1: per-page text of the paper.
    pages = extract_text_from_pdf(pdf_path)

    # Step 2: chunk each page (default chunk size) and embed the chunks with CLIP.
    text_embeddings, text_metadata = embed_text_chunks(text_pages=pages)

✅ Embedded 255 text chunks from 11 pages


In [None]:
import chromadb
from chromadb.utils import embedding_functions

def store_embeddings_in_chroma(text_embeddings, text_metadata, img_embeddings, img_metadata, collection_name="attention_rag"):
    """Store text and image embeddings in one Chroma collection.

    Args:
        text_embeddings: Iterable of NumPy vectors, one per text chunk.
        text_metadata: Matching dicts; each must contain ``"content"``, which
            is stored as the record's document.
        img_embeddings: Iterable of NumPy vectors, one per image.
        img_metadata: Matching dicts; each must contain ``"description"``,
            which is stored as the record's document.
        collection_name: Name of the collection to create or reuse.

    Returns:
        The Chroma collection. Records get the ids ``text_{i}`` / ``image_{i}``.
    """
    client = chromadb.Client()

    # Create or get collection
    collection = client.get_or_create_collection(name=collection_name)

    def _add_batch(prefix, embeddings, metadatas, document_key):
        """Add one modality with a single call instead of one call per record."""
        pairs = list(zip(embeddings, metadatas))
        if not pairs:
            # Skip the call entirely so empty input stays a no-op, exactly as
            # it was with the per-record loop.
            return
        collection.add(
            ids=[f"{prefix}_{idx}" for idx in range(len(pairs))],
            embeddings=[emb.tolist() for emb, _ in pairs],
            metadatas=[meta for _, meta in pairs],
            documents=[meta[document_key] for _, meta in pairs],
        )

    _add_batch("text", text_embeddings, text_metadata, "content")
    _add_batch("image", img_embeddings, img_metadata, "description")

    print(f"✅ Stored {len(text_embeddings)} text chunks and {len(img_embeddings)} images in Chroma collection '{collection_name}'")
    return collection

In [None]:
# Populate the default 'attention_rag' collection with both modalities.
collection = store_embeddings_in_chroma(
    text_embeddings=text_embeddings,
    text_metadata=text_metadata,
    img_embeddings=img_embeddings,
    img_metadata=img_metadata,
)

✅ Stored 255 text chunks and 3 images in Chroma collection 'attention_rag'


In [None]:
def retrieve(query, collection, top_k=5):
    """Return the ``top_k`` nearest records of ``collection`` for a text query.

    Uses the module-level ``clip_processor``, ``clip_model`` and ``device``.

    Args:
        query: Natural-language query string.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
        top_k: Number of results to request.

    Returns:
        Whatever ``collection.query`` returns for the embedded query.
    """
    # Embed query. truncation=True keeps an over-long query from exceeding the
    # text encoder's context length, consistent with the other embedding code.
    inputs = clip_processor(
        text=[query],
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        query_emb = clip_model.get_text_features(**inputs).cpu().numpy().flatten()

    return collection.query(
        query_embeddings=[query_emb.tolist()],
        n_results=top_k
    )

# **NOW LET'S RUN THE 1ST QUERY AGAINST OUR DATABASE**

In [None]:
query = "How does multi-head attention work?"
results = retrieve(query, collection, top_k=5)

# Chroma nests hits per query; index 0 is the single query issued above.
top_documents = results["documents"][0]
for i in range(len(top_documents)):
    rank = i + 1
    print(f"\nResult {rank}:")
    print("Document:", top_documents[i])
    print("Metadata:", results["metadatas"][0][i])


Result 1:
Document: the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described
Metadata: {'type': 'text', 'page': 2, 'content': 'the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described'}

Result 2:
Document: heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of
Metadata: {'content': 'heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of', 'page': 7, 'type': 'text'}

Result 3:
Document: to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.
Metadata: {'content': 'to jointly attend to information from different representation subspaces at different position

In [None]:
from PIL import Image
import io
import base64

def _load_result_image(document, meta):
    """Best-effort load of the image behind a retrieved record, or None."""
    image_path = meta.get("image_path")
    if image_path:
        try:
            return Image.open(image_path)
        except OSError:
            pass  # fall through to the base64 attempt
    try:
        raw = base64.b64decode(document or "", validate=True)
        return Image.open(io.BytesIO(raw))
    except (ValueError, OSError):
        # ValueError: the document is not base64 (e.g. a plain description).
        # OSError: the decoded bytes are not an image PIL can identify.
        return None


def retrieve_with_visuals(query, collection, top_k=5):
    """Query the collection and print or display each hit.

    Uses the module-level ``clip_processor``, ``clip_model`` and ``device``.
    Text hits are printed. For image hits the image is loaded from the
    ``"image_path"`` metadata entry when present, else from a base64-encoded
    document; when neither yields an image the stored document is printed.
    (The store cell in this notebook saves a plain description as the document
    of image records, which the unconditional base64 decode could not handle.)

    Args:
        query: Natural-language query string.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
        top_k: Number of results to request.

    Returns:
        Whatever ``collection.query`` returns for the embedded query.
    """
    # Embed query (truncated so an over-long query cannot overflow the encoder)
    inputs = clip_processor(
        text=[query],
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        query_emb = clip_model.get_text_features(**inputs).cpu().numpy().flatten()

    # Search in Chroma
    results = collection.query(
        query_embeddings=[query_emb.tolist()],
        n_results=top_k
    )

    # Display results
    documents = results["documents"][0]
    metadatas = results["metadatas"][0]
    for rank, (document, meta) in enumerate(zip(documents, metadatas), start=1):
        meta = meta or {}
        print(f"\nResult {rank}:")
        print("Metadata:", meta)

        if meta.get("type") == "image":
            img = _load_result_image(document, meta)
            if img is not None:
                display(img)
            else:
                print("Image:", document)
        else:
            print("Text:", document)

    return results

# **THIS WILL BE OUR 2ND QUERY**

In [None]:
# Diagram-oriented query; hits are printed (text) or rendered inline (images).
query = "scaled dot-product attention diagram"
results = retrieve_with_visuals(query=query, collection=collection, top_k=5)


Result 1:
Metadata: {'content': 'query with the corresponding key. 3.2.1 Scaled Dot-Product Attention We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The', 'page': 3, 'type': 'text'}
Text: query with the corresponding key. 3.2.1 Scaled Dot-Product Attention We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The

Result 2:
Metadata: {'content': 'attention [2], and dot-product (multi- plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of', 'type': 'text', 'page': 4}
Text: attention [2], and dot-product (multi- plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of

Result 3:
Metadata: {'page': 5, 'content': 'property. We implement this inside of scaled dot-product attention by masking out (setting to −∞) all values in the input', 'type': 'text'}
Text: property. We implement this inside of scaled dot-product attention by masking o

In [None]:
# Sanity check: report the class and device of `model` when a previous cell has
# already created it; on a fresh kernel the name does not exist yet.
try:
    print(type(model))
    _first_parameter = next(model.parameters())
    print("Model device:", _first_parameter.device)
except NameError:
    print("Model not loaded yet.")

<class 'transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGeneration'>
Model device: cpu


In [None]:
# Walk through the hits of the last query. Image hits that carry an
# 'image_path' are explained with BLIP-2; text hits are printed as context.
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print("\n---- Retrieved Segment ----")
    print(doc)
    print("Metadata:", meta)

    if meta.get("type") != "image":
        # Process text segment - simply print the context
        print("\nRetrieved Text Context:", doc)
        # If you have a text-based LLM, you could use it here, e.g. by building
        # a prompt from `doc` and `query` and printing the generated answer.
        continue

    if "image_path" not in meta:
        print("\nImage metadata missing 'image_path'. Cannot process image.")
        continue

    try:
        # `generate_with_blip2` was never defined in this notebook; the BLIP-2
        # helper that exists is `describe_image`, which opens the file itself.
        blip2_output = describe_image(
            meta["image_path"],
            prompt=f"Explain this diagram in relation to: {query}"
        )
        print("\nBLIP2 Output (Image):", blip2_output)
    except FileNotFoundError:
        print(f"Warning: Image file not found at {meta['image_path']}. Skipping image processing.")
        print("\nCould not process image.")
    except NameError:
        # The BLIP-2 cell (which defines describe_image / model) sits further
        # down in the notebook and may not have been run yet.
        print("\nBLIP-2 is not loaded yet; run the BLIP-2 loading cell first. Could not process image.")



---- Retrieved Segment ----
query with the corresponding key. 3.2.1 Scaled Dot-Product Attention We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The
Metadata: {'type': 'text', 'content': 'query with the corresponding key. 3.2.1 Scaled Dot-Product Attention We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The', 'page': 3}

Retrieved Text Context: query with the corresponding key. 3.2.1 Scaled Dot-Product Attention We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The

---- Retrieved Segment ----
attention [2], and dot-product (multi- plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of
Metadata: {'content': 'attention [2], and dot-product (multi- plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of', 'page': 4, 'type': 'text'}

Retrieved Text Context: attention [2], and dot-product (multi- plicativ

Load the **Salesforce/blip2-opt-6.7b** model. It is used only for the **vision part** of the pipeline: given an input image, it generates text as output.

Function: Load BLIP-2 with quantization fallbacks and generate descriptions from an image with an optional text prompt.

In [None]:
# robust BLIP-2 loader + image+text generation (copy-paste)
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration, BitsAndBytesConfig
from PIL import Image
import traceback

# BLIP-2 checkpoint to load, and the device string used by the loader below.
model_id = "Salesforce/blip2-opt-6.7b"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

def load_blip2_with_fallback(model_id):
    """Load BLIP-2, trying progressively less memory-efficient strategies.

    Order: 4-bit quantization, 8-bit quantization, then a plain load in
    float16 (when the module-level ``device`` is ``"cuda"``) or float32.
    Each failure is printed with its traceback before the next attempt.

    Args:
        model_id: Hub id or path passed to ``from_pretrained``.

    Returns:
        The first model that loads successfully.

    Raises:
        RuntimeError: if every strategy fails; chained to the last error.
    """
    # NOTE(review): trust_remote_code=True is kept from the original; it is
    # presumably unnecessary for an architecture built into transformers — confirm.

    def _four_bit_kwargs():
        return {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16
            ),
            "device_map": "auto",
        }

    def _eight_bit_kwargs():
        return {
            "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
            "device_map": "auto",
        }

    def _float_kwargs():
        return {
            "torch_dtype": torch.float16 if device == "cuda" else torch.float32,
            "device_map": "auto" if device == "cuda" else None,
        }

    # (announcement, success message, failure prefix, kwargs builder)
    strategies = [
        ("Trying 4-bit (bitsandbytes)...", "Loaded in 4-bit.", "4-bit failed:", _four_bit_kwargs),
        ("Trying 8-bit (bitsandbytes)...", "Loaded in 8-bit.", "8-bit failed:", _eight_bit_kwargs),
        ("Trying float16/float32 device_map='auto' ...", "Loaded in float16/float32.", "float load failed:", _float_kwargs),
    ]

    last_error = None
    for announce, success, failure, build_kwargs in strategies:
        try:
            print(announce)
            # Build the kwargs inside the try so a failing config object also
            # falls through to the next strategy.
            model = Blip2ForConditionalGeneration.from_pretrained(
                model_id,
                trust_remote_code=True,
                **build_kwargs()
            )
            print(success)
            return model
        except Exception as e:
            # Deliberately broad: any failure should trigger the next fallback.
            print(failure, e)
            traceback.print_exc()
            last_error = e

    # Chain the last failure so the root cause stays visible to the caller.
    raise RuntimeError("All loading strategies failed.") from last_error

# Instantiate the BLIP-2 processor first, then the model via the fallback loader.
print("Loading processor (Blip2Processor) ...")
processor = Blip2Processor.from_pretrained(model_id, trust_remote_code=True)

print("Loading model ... (this can take a while)")
model = load_blip2_with_fallback(model_id)
model.eval()  # inference only

# Report where the first parameter lives.
model_device = next(model.parameters()).device
print("Model device:", model_device)

# Helper to generate + debug
def describe_image(image_path, prompt="", do_sample=False, max_new_tokens=200):
    """Generate text for an image with BLIP-2, optionally conditioned on a prompt.

    Uses the module-level ``processor`` and ``model``. Prints the first 50
    prompt token ids as a debugging aid.

    Args:
        image_path: Path of the image file to describe.
        prompt: Optional text prompt; an empty string gives a plain caption.
        do_sample: When True, sample with ``top_p=0.9`` and ``temperature=0.7``;
            otherwise the sampling knobs are left at the model's defaults.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The decoded first generated sequence, special tokens removed.
    """
    # Close the file handle as soon as the pixels are copied into the RGB image.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Prepare inputs (NOT moving to device yet so we can inspect tokens)
    inputs = processor(images=image, text=prompt, return_tensors="pt")

    # Debug: token ids for the prompt (if present)
    if "input_ids" in inputs:
        tok = inputs["input_ids"][0]
        print("input_ids (first 50):", tok[:50].tolist())
        # quick pad-check:
        pad_id = processor.tokenizer.pad_token_id
        if pad_id is not None and torch.all(tok == pad_id):
            print("WARNING: all prompt tokens are PAD (token id = {}).".format(pad_id))

    # Move to model device and correct dtype. Only floating-point tensors (the
    # pixel values) are cast; token ids and masks must stay integer. Previously
    # only the device was changed, so a float16 model received float32 pixels.
    first_param = next(model.parameters())
    target_dtype = getattr(model, "dtype", first_param.dtype)

    def _place(tensor):
        if torch.is_floating_point(tensor) and target_dtype.is_floating_point:
            return tensor.to(device=first_param.device, dtype=target_dtype)
        return tensor.to(first_param.device)

    inputs = {name: _place(value) for name, value in inputs.items()}

    # Only pass sampling parameters when sampling is actually requested.
    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "use_cache": True,
    }
    if do_sample:
        generate_kwargs.update(top_p=0.9, temperature=0.7)

    with torch.no_grad():
        out = model.generate(**inputs, **generate_kwargs)
    return processor.tokenizer.decode(out[0], skip_special_tokens=True)

# Demo: sampled generation for a local diagram (adjust the path to your own image).
image_path = "/content/TRANSFORMER.png"   # your local image
prompt = "Explain this diagram in detail, focusing on how multi-head attention works."

print("\n=== Running image+prompt generation ===")
result = describe_image(
    image_path,
    prompt=prompt,
    do_sample=True,
    max_new_tokens=250,
)
print("\n=== Generated ===\n", result)

Device: cpu
Loading processor (Blip2Processor) ...
Loading model ... (this can take a while)
Trying 4-bit (bitsandbytes)...
4-bit failed: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`
Trying 8-bit (bitsandbytes)...


Traceback (most recent call last):
  File "/tmp/ipython-input-2371346253.py", line 21, in load_blip2_with_fallback
    model = Blip2ForConditionalGeneration.from_pretrained(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py", line 316, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py", line 4879, in from_pretrained
    hf_quantizer.validate_environment(
  File "/usr/local/lib/python3.11/dist-packages/transformers/quantizers/quantizer_bnb_4bit.py", line 76, in validate_environment
    raise ImportError(
ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`


8-bit failed: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`
Trying float16/float32 device_map='auto' ...


Traceback (most recent call last):
  File "/tmp/ipython-input-2371346253.py", line 36, in load_blip2_with_fallback
    model = Blip2ForConditionalGeneration.from_pretrained(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py", line 316, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py", line 4879, in from_pretrained
    hf_quantizer.validate_environment(
  File "/usr/local/lib/python3.11/dist-packages/transformers/quantizers/quantizer_bnb_8bit.py", line 73, in validate_environment
    raise ImportError(
ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loaded in float16/float32.
Model device: cpu

=== Running image+prompt generation ===
input_ids (first 50): [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 2, 43043, 1851, 42, 41071, 11, 4617, 6, 5650, 15, 141, 3228, 12, 3628, 1503, 1364, 4]

=== Generated ===
 Explain this diagram in detail, focusing on how multi-head attention works.



# **GENERATION OF TEXT, BASED ON OUR INPUT IMAGE**
Produce a generic caption from the image alone, or answer a question about the image using a "Question: … Answer:" prompt.

In [None]:
# 1) Caption from the image alone: an empty prompt means no text conditioning.
result = describe_image("/content/TRANSFORMER.png", prompt="", do_sample=True, max_new_tokens=200)
print("\n=== Image-only Caption ===\n", result)

# 2) Same image, now with a "Question: ... Answer:" style prompt.
prompt = "Question: Explain this diagram in detail, focusing on how multi-head attention works. Answer:"
result_qna = describe_image("/content/TRANSFORMER.png", prompt=prompt, do_sample=True, max_new_tokens=250)
print("\n=== Q&A Caption ===\n", result_qna)

input_ids (first 50): [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 2]

=== Image-only Caption ===
 the block diagram for a data processing system

input_ids (first 50): [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 2, 45641, 35, 44109, 42, 41071, 11, 4617, 6, 5650, 15, 141, 3228, 12, 3628, 1503, 1364, 4]

=== Q&A Caption ===
 Question: Explain this diagram in detail, focusing on how multi-head attention works. Answer: Multi-head attention is the ability of a computer to recognize the presence of more than one object at the same time



# **GENERATION OF TEXT BASED ON OUR INPUT(WHICH IS TEXT)**  
For this we load another model, GPT-2, which is lightweight and works pretty well.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# ------------------------
# Load BLIP2 (already loaded earlier)
# ------------------------
# model, processor = ... # You already have this part

# ------------------------
# Load lightweight text model for RAG text generation
# ------------------------
text_model_name = "gpt2"  # small & fast, replace with bigger if GPU available
text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
text_model = AutoModelForCausalLM.from_pretrained(text_model_name)
text_model = text_model.to("cpu")  # Module.to returns the module itself

# ------------------------
# Function to generate from text-only context
# ------------------------
def generate_text_only(context, query, max_tokens=80):
    """Answer ``query`` from ``context`` with the module-level text model.

    Args:
        context: Retrieved text placed in the prompt as context.
        query: The user question.
        max_tokens: Maximum number of newly generated tokens.

    Returns:
        str: The decoded first sequence returned by ``generate``, special
        tokens removed. As the captured output shows, this includes the
        prompt followed by the generated continuation.
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    # Follow the model's device instead of assuming "cpu".
    inputs = text_tokenizer(prompt, return_tensors="pt").to(text_model.device)
    with torch.no_grad():  # inference only
        output_ids = text_model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            # Set explicitly: without it generate() falls back to the EOS id
            # by itself and emits a warning on every call.
            pad_token_id=text_tokenizer.eos_token_id,
        )
    return text_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ------------------------
# Main RAG pipeline
# ------------------------
def rag_pipeline(query, retrieved_docs):
    """Generate one answer per retrieved document.

    Image documents are answered with the module-level BLIP-2 ``model`` /
    ``processor``; text documents with ``generate_text_only``. Documents of
    any other type are skipped.

    Args:
        query: The user question, used as the prompt for both model types.
        retrieved_docs: list of dicts. Each dict has:
            - type: 'image' or 'text'
            - content: path (if image) or text string

    Returns:
        list[dict]: ``{"type": ..., "output": ...}`` per processed document,
        in input order.
    """
    answers = []
    for doc in retrieved_docs:
        if doc['type'] == 'image':
            # Close the file handle once the pixels are copied into RGB form.
            with Image.open(doc['content']) as source:
                image = source.convert("RGB")
            # Follow the model's device and dtype instead of forcing "cpu": the
            # BLIP-2 loader may place the model on a GPU, possibly in float16.
            inputs = processor(images=image, text=query, return_tensors="pt").to(model.device, model.dtype)
            with torch.no_grad():  # inference only
                output_ids = model.generate(**inputs, max_new_tokens=50)
            description = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            answers.append({"type": "image", "output": description})

        elif doc['type'] == 'text':
            answer = generate_text_only(doc['content'], query)
            answers.append({"type": "text", "output": answer})

    return answers

# ------------------------
# Example Usage
# ------------------------
# Minimal demo: a single hand-written text document run through the pipeline.
retrieved_docs = [
    {
        "type": "text",
        "content": "Scaled dot-product attention is computed as QK^T / sqrt(d_k) ...",
    }
]

query = "Explain the transformer architecture"
results = rag_pipeline(query, retrieved_docs)

for res in results:
    label = res['type'].upper()
    print(f"[{label} RESULT] {res['output']}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[TEXT RESULT] Context: Scaled dot-product attention is computed as QK^T / sqrt(d_k) ...

Question: Explain the transformer architecture
Answer: The transformer architecture is a set of functions that are used to compute the magnitude of the product of the two products. The functions are:

QK^T = QK^T / sqrt(d_k)

where QK^T is the product of the two products.

The function QK^T is a function that takes a product and returns a product
