<a href="https://colab.research.google.com/github/Abhineetsahay/multimodal-rag-academic-notes/blob/main/RAG_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
!pip install numpy pillow pymupdf pytesseract opencv-python open-clip-torch sentence-transformers chromadb google-generativeai langchain langchain-community langchain-google-genai



In [38]:
import torch
import open_clip
from PIL import Image

In [39]:
# Architecture name shared by the model and its tokenizer; defining it once
# prevents the two from silently drifting apart.
CLIP_MODEL_NAME = "ViT-B-32"
# Tag of the pretrained weights to load.
CLIP_PRETRAINED = "openai"

# Three values are returned; the middle one is not needed here, and the last
# is the callable later used to turn an image into a model input tensor.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name=CLIP_MODEL_NAME,
    pretrained=CLIP_PRETRAINED,
)

tokenizer = open_clip.get_tokenizer(CLIP_MODEL_NAME)

# Put the model in evaluation mode. As the last expression of the cell this
# also displays the model summary.
model.eval()



CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [40]:
import fitz
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
import uuid

In [41]:
class CreateEmbedding:
    """Produce unit-length image and text embeddings from one shared model.

    Args:
        model: Object exposing ``encode_image`` and ``encode_text``.
        preprocess: Callable that turns an image into a tensor without a
            batch dimension.
        tokenizer: Callable that turns a list of strings into a token tensor.
    """

    def __init__(self, model, preprocess, tokenizer):
        self.model = model
        self.preprocess = preprocess
        self.tokenizer = tokenizer

    def _device(self):
        """Return the device holding the model's parameters (CPU if none)."""
        parameters = getattr(self.model, "parameters", None)
        first = next(iter(parameters()), None) if callable(parameters) else None
        return first.device if first is not None else torch.device("cpu")

    @staticmethod
    def _normalize(embedding):
        """Scale every row of ``embedding`` to unit L2 norm."""
        return embedding / embedding.norm(dim=-1, keepdim=True)

    def embed_img(self, img_path):
        """Embed a single image.

        Args:
            img_path: Either a filesystem path (``str`` or any path-like
                object) or an already loaded image that ``preprocess``
                accepts.

        Returns:
            Tensor with a leading batch dimension of 1 holding the
            L2-normalised image embedding.
        """
        if isinstance(img_path, str) or hasattr(img_path, "__fspath__"):
            # Close the underlying file as soon as the RGB copy exists.
            with Image.open(img_path) as handle:
                image = handle.convert("RGB")
        else:
            image = img_path

        # Add the batch dimension and place the input next to the model so a
        # model that was moved to another device keeps working.
        image_tensor = self.preprocess(image).unsqueeze(0).to(self._device())

        with torch.no_grad():
            image_embedding = self.model.encode_image(image_tensor)
        return self._normalize(image_embedding)

    def embed_text(self, text):
        """Embed a single piece of text.

        Args:
            text: Value to embed; it is converted with ``str`` first.

        Returns:
            Tensor with a leading batch dimension of 1 holding the
            L2-normalised text embedding.
        """
        # NOTE(review): CLIP tokenizers presumably truncate to a fixed context
        # length, so long chunks may be cut before embedding; confirm.
        text_tokens = self.tokenizer([str(text)]).to(self._device())

        with torch.no_grad():
            text_embedding = self.model.encode_text(text_tokens)
        return self._normalize(text_embedding)

In [42]:
class LoadFile:
    """Extract page text and embedded images from a document and chunk the text.

    Args:
        path: Location of the document; passed unchanged to ``fitz.open``.
    """

    def __init__(self, path):
        self.path = path
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=600,
            chunk_overlap=100,
        )
        # 0-based page index of every entry of ``all_text`` returned by the
        # most recent ``openFile`` call. Pages without text are skipped there,
        # so list position alone does not identify the page.
        self.text_page_numbers = []

    def openFile(self):
        """Read the document and collect its text and images.

        Returns:
            Tuple ``(all_text, all_images)``: the stripped text of every page
            that has text, and every embedded image converted to RGB.
        """
        all_text = []
        all_images = []
        page_numbers = []

        # The context manager closes the document even if extraction fails.
        with fitz.open(self.path) as doc:
            print(f"Successfully opened: {self.path}")

            for page_index in range(len(doc)):
                page = doc[page_index]

                page_text = page.get_text().strip()
                if page_text:
                    all_text.append(page_text)
                    page_numbers.append(page_index)

                image_list = page.get_images(full=True)
                for img in image_list:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"])).convert("RGB")
                    all_images.append(image)

        self.text_page_numbers = page_numbers
        return all_text, all_images

    def get_chunks(self, all_text):
        """Split page texts into chunks tagged with their 0-based page index.

        Args:
            all_text: List of page texts, normally the first value returned
                by ``openFile``.

        Returns:
            List of ``{"content": chunk, "page": page_index}`` dicts. The page
            index recorded by ``openFile`` is used when it lines up with
            ``all_text``; otherwise the position in ``all_text`` is used.
        """
        page_numbers = getattr(self, "text_page_numbers", None)
        if not page_numbers or len(page_numbers) != len(all_text):
            # No matching record (e.g. the list did not come from openFile):
            # fall back to the positional index.
            page_numbers = range(len(all_text))

        chunks = []
        for page_num, text in zip(page_numbers, all_text):
            for chunk in self.text_splitter.split_text(text):
                chunks.append({"content": chunk, "page": page_num})
        return chunks


In [46]:
# Open the document at the given path, pull out its page text and images,
# then split the text into chunks for indexing.
loader = LoadFile(path="/content/Ch4 Software Project Management_3.pptx")
all_text, all_images = loader.openFile()
text_chunks = loader.get_chunks(all_text=all_text)


Successfully opened: /content/Ch4 Software Project Management_3.pptx


In [47]:
embeddings = CreateEmbedding(
    model,
    preprocess,
    tokenizer
)

client = chromadb.PersistentClient(path="./chroma_db")

collection = client.get_or_create_collection(name="lecture_notes")


def _to_vector(embedding):
    """Flatten an embedding tensor into a plain list of floats."""
    return embedding.detach().cpu().numpy().flatten().tolist()


def _stable_id(kind, index):
    """Build a repeatable ID from the source path, item kind and position.

    The same document therefore maps to the same IDs on every run, so
    re-running this cell overwrites entries instead of duplicating them.
    """
    return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{loader.path}#{kind}-{index}"))


all_ids = []
all_embeddings_vecs = []
all_metadatas = []
all_documents = []

for i, item in enumerate(text_chunks):
    all_ids.append(_stable_id("text", i))
    all_embeddings_vecs.append(_to_vector(embeddings.embed_text(item["content"])))
    all_metadatas.append({"type": "text", "page": item["page"]})
    all_documents.append(item["content"])

for i, img in enumerate(all_images):
    all_ids.append(_stable_id("image", i))
    all_embeddings_vecs.append(_to_vector(embeddings.embed_img(img)))
    all_metadatas.append({"type": "image", "image_index": i})
    all_documents.append(f"Image_{i}")

# Skip the write when nothing was extracted, and upsert so a re-run replaces
# the existing entries.
# NOTE: items written by earlier runs under random IDs stay in ./chroma_db
# until the collection is cleared.
if all_ids:
    collection.upsert(
        ids=all_ids,
        embeddings=all_embeddings_vecs,
        metadatas=all_metadatas,
        documents=all_documents
    )

print(f"Successfully stored {len(all_ids)} items in ChromaDB.")

Successfully stored 27 items in ChromaDB.


In [54]:
query_text = "What is Intermediate COCOMO Model?"
query_embedding = embeddings.embed_text(query_text).cpu().detach().numpy().flatten().tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

print(f"Query: {query_text}\n")
print("Top Matches found in PDF:")
print("-" * 30)

# Only one query was sent, so the hits are in the first entry of each field.
matches = zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0],
)

for rank, (content, metadata, dist) in enumerate(matches, start=1):
    metadata = metadata or {}

    print(f"Match #{rank} (Distance: {dist:.4f})")
    print(f"Content: {content}")
    # Text and image entries share this collection; image entries carry
    # 'image_index' instead of 'page', so 'page' must not be assumed.
    if 'page' in metadata:
        print(f"Found on Page: {metadata['page']}")
    else:
        print(f"Found in Image: {metadata.get('image_index')}")
    print("-" * 30)

Query: What is Intermediate COCOMO Model?

Top Matches found in PDF:
------------------------------
Match #1 (Distance: 0.3214)
Content: 3.0
1.12
Embedded
2.8
1.20
Intermediate COCOMO Model
Chapter 4- Software Project Management
Q.1 A software company needs to develop a project that is
estimated as 1000 function points and is planning to use JAVA as
the programming language whose approximate lines of code per
function point is accepted as 50. Considering 50 as multiplicative
factor, approximately how long does project take to complete?
Complete COCOMO Model
Most large systems are made up of several smaller sub-systems of
organic type, semidetached, and even embedded type.
Found on Page: 3
------------------------------
Match #2 (Distance: 0.3396)
Content: PM = a * Sizeb *M where
M = Multipliers;
a = 2.94 in initial calibration,
Size in KLOC,
b varies from 1.1 to 1.24 depending on novelty of the project,
development flexibility, risk management approaches and the
process maturity.*post-

In [52]:
unique_contexts = []
seen = set()
source_pages = set()

# Walk documents together with their metadata so image hits can be told apart
# from text hits.
for content, metadata in zip(results['documents'][0], results['metadatas'][0]):
    metadata = metadata or {}

    # Image entries only store a placeholder document and have no page number,
    # so they contribute neither answer text nor a source page.
    if metadata.get('type') == 'image':
        continue

    if 'page' in metadata:
        source_pages.add(metadata['page'])

    clean_text = content.strip()
    if clean_text not in seen:
        unique_contexts.append(clean_text)
        seen.add(clean_text)

print(f"✨ **Answer for:** {query_text}")
print("="*40)

if not unique_contexts:
    print("I'm sorry, I couldn't find any specific information about that in the lecture notes.")
else:
    print("Based on your lecture notes, here is what I found:\n")
    for i, context in enumerate(unique_contexts):
        # Collapse all whitespace runs so each point prints on one line.
        display_text = " ".join(context.split())
        print(f"📍 **Point {i+1}:** {display_text}\n")

    pages = sorted(source_pages)
    if pages:
        print(f"📚 **Sources:** Page(s) {', '.join(map(str, pages))}")

✨ **Answer for:** What is Intermediate COCOMO Model?
Based on your lecture notes, here is what I found:

📍 **Point 1:** 3.0 1.12 Embedded 2.8 1.20 Intermediate COCOMO Model Chapter 4- Software Project Management Q.1 A software company needs to develop a project that is estimated as 1000 function points and is planning to use JAVA as the programming language whose approximate lines of code per function point is accepted as 50. Considering 50 as multiplicative factor, approximately how long does project take to complete? Complete COCOMO Model Most large systems are made up of several smaller sub-systems of organic type, semidetached, and even embedded type.

📍 **Point 2:** PM = a * Sizeb *M where M = Multipliers; a = 2.94 in initial calibration, Size in KLOC, b varies from 1.1 to 1.24 depending on novelty of the project, development flexibility, risk management approaches and the process maturity.*post-architecture model COCOMO-2: Early Design Model Chapter 4- Software Project Ma