<a href="https://colab.research.google.com/github/Abhineetsahay/multimodal-rag-academic-notes/blob/main/RAG_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [93]:
!pip install numpy pillow pymupdf pytesseract opencv-python open-clip-torch sentence-transformers chromadb google-generativeai langchain langchain-community langchain-google-genai



In [94]:
import torch
import open_clip
from PIL import Image

In [95]:
# Architecture name and pretrained-weights tag for the CLIP model.
# The tokenizer is looked up by the same architecture name as the model, so both
# calls read it from one constant instead of repeating the literal.
CLIP_MODEL_NAME = "ViT-B-32"
CLIP_PRETRAINED = "openai"

# The second return value is not used in this notebook, hence the `_` placeholder.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name=CLIP_MODEL_NAME,
    pretrained=CLIP_PRETRAINED,
)

tokenizer = open_clip.get_tokenizer(CLIP_MODEL_NAME)

# Put the model in evaluation mode. Kept as the last expression so the cell
# still displays the model summary.
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [109]:
import fitz
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [110]:
class CreateEmbedding:
    """Produce L2-normalised embeddings for images and text.

    Args:
        model: Object exposing ``encode_image`` and ``encode_text``.
        preprocess: Callable that turns an image into a tensor without a
            batch dimension (one is added here with ``unsqueeze(0)``).
        tokenizer: Callable that turns a list of strings into model input.
    """

    def __init__(self, model, preprocess, tokenizer):
        self.model = model
        self.preprocess = preprocess
        self.tokenizer = tokenizer

    def _to_model_device(self, tensor):
        """Return ``tensor`` on the device of the model's first parameter.

        The tensor is returned unchanged when the model exposes no parameters,
        so duck-typed models keep working.
        """
        parameters = getattr(self.model, "parameters", None)
        first_param = next(iter(parameters()), None) if callable(parameters) else None
        if first_param is None:
            return tensor
        return tensor.to(first_param.device)

    def embed_img(self, img_path):
        """Embed a single image.

        Args:
            img_path: Either a ``str`` file path, which is opened and converted
                to RGB, or an already-loaded image object, which is passed to
                ``preprocess`` as-is.

        Returns:
            The image embedding with a leading batch dimension of 1, scaled to
            unit L2 norm along the last dimension.
        """
        if isinstance(img_path, str):
            # The context manager releases the file handle; convert() returns
            # a separate image object that is used after the file is closed.
            with Image.open(img_path) as opened:
                image = opened.convert("RGB")
        else:
            image = img_path

        # Inputs must live on the same device as the model weights.
        image_tensor = self._to_model_device(self.preprocess(image).unsqueeze(0))

        with torch.no_grad():
            image_embedding = self.model.encode_image(image_tensor)
            # Out-of-place division leaves the encoder's own output untouched.
            image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)
        return image_embedding

    def embed_text(self, text):
        """Embed a single piece of text.

        Args:
            text: Value to embed; it is converted with ``str()`` first.

        Returns:
            The text embedding with a leading batch dimension of 1, scaled to
            unit L2 norm along the last dimension.
        """
        # NOTE(review): CLIP-style tokenizers presumably cap the input at a fixed
        # context length, so long passages may be truncated - confirm.
        text_tokens = self._to_model_device(self.tokenizer([str(text)]))

        with torch.no_grad():
            text_embedding = self.model.encode_text(text_tokens)
            text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)
        return text_embedding

In [118]:
class LoadFile:
    """Extract page text and embedded images from a PDF and chunk the text.

    Args:
        path: Location of the PDF; passed to ``fitz.open``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.

    Attributes:
        page_numbers: Zero-based index of the source page for each entry of the
            text list returned by the most recent ``openFile`` call.
    """

    def __init__(self, path, chunk_size=400, chunk_overlap=50):
        self.path = path
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        self.page_numbers = []

    def openFile(self):
        """Read every page of the PDF.

        Pages whose stripped text is empty contribute nothing to the text list;
        their indices are therefore also absent from ``self.page_numbers``.

        Returns:
            A ``(all_text, all_images)`` tuple: the stripped text of each
            non-empty page, and every image listed on any page converted to an
            RGB PIL image.
        """
        all_text = []
        all_images = []
        page_numbers = []

        # The context manager closes the document even if extraction fails.
        with fitz.open(self.path) as doc:
            print(f"Successfully opened: {self.path}")

            for page_index in range(len(doc)):
                page = doc[page_index]

                page_text = page.get_text().strip()
                if page_text:
                    all_text.append(page_text)
                    page_numbers.append(page_index)

                for img in page.get_images(full=True):
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"])).convert("RGB")
                    all_images.append(image)

        self.page_numbers = page_numbers
        return all_text, all_images

    def get_chunks(self, all_text):
        """Split each text entry into chunks tagged with its page index.

        When ``all_text`` has as many entries as the page list recorded by
        ``openFile``, each chunk carries the real zero-based page index of the
        PDF, so skipped blank pages no longer shift the numbering. Otherwise
        the position of the entry within ``all_text`` is used.

        Args:
            all_text: List of text strings, normally the first value returned
                by ``openFile``.

        Returns:
            A list of ``{"content": chunk, "page": page_index}`` dicts.
        """
        if len(self.page_numbers) == len(all_text):
            page_numbers = self.page_numbers
        else:
            page_numbers = range(len(all_text))

        return [
            {"content": chunk, "page": page_num}
            for page_num, text in zip(page_numbers, all_text)
            for chunk in self.text_splitter.split_text(text)
        ]


In [120]:
# Read the lecture PDF, then split the extracted page text into chunks.
pdf_path = "/content/Lecture 3_ Neural Language Modeling.pdf"
loader = LoadFile(pdf_path)
extracted = loader.openFile()
all_text, all_images = extracted
text_chunks = loader.get_chunks(all_text)


Successfully opened: /content/Lecture 3_ Neural Language Modeling.pdf


In [121]:
# Wrap the CLIP model, image transform and tokenizer in one embedding helper.
embeddings = CreateEmbedding(model, preprocess, tokenizer)

# --- Text ---
# NOTE(review): this embeds the whole-page strings in `all_text`; the
# `text_chunks` list built earlier is not used here - confirm that is intended.
print(f"Generating embeddings for {len(all_text)} text blocks...")
text_embeddings_list = []
for i, page_text in enumerate(all_text):
    try:
        text_vector = embeddings.embed_text(page_text)
    except Exception as e:
        # Best effort: report the failure and carry on with the next block.
        print(f"Error embedding text on block {i}: {e}")
    else:
        text_embeddings_list.append(text_vector)

# --- Images ---
print(f"Generating embeddings for {len(all_images)} images...")
image_embeddings_list = []
for i, picture in enumerate(all_images):
    try:
        image_vector = embeddings.embed_img(picture)
    except Exception as e:
        # Best effort: report the failure and carry on with the next image.
        print(f"Error embedding image {i}: {e}")
    else:
        image_embeddings_list.append(image_vector)

# --- Summary ---
print("--- Extraction & Embedding Complete ---")
print(f"Text Embeddings: {len(text_embeddings_list)}")
print(f"Image Embeddings: {len(image_embeddings_list)}")

Generating embeddings for 86 text blocks...
Generating embeddings for 181 images...
--- Extraction & Embedding Complete ---
Text Embeddings: 86
Image Embeddings: 181
