In [None]:
!pip install pymupdf transformers torch

In [None]:
import fitz  # PyMuPDF
import os
import json
from transformers import pipeline

# Define paths
pdf_folder = "/content/pdf"
output_dir = "/content/images"
os.makedirs(output_dir, exist_ok=True)

# Load summarization model
summarizer = pipeline("summarization", model="t5-small")

# Get list of PDF files.
# - sorted: os.listdir order is arbitrary, and the position of each record in
#   `documents` is later used as its ID, so a stable order keeps re-runs reproducible
# - lower(): also pick up files named "*.PDF"
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

# One record per extracted image: {"pdf", "page", "text", "image"}
documents = []

# Iterate over each PDF
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)

    # The context manager closes the document even if extraction fails part-way
    with fitz.open(pdf_path) as doc:

        # Iterate through pages
        for page_num in range(len(doc)):
            page = doc[page_num]
            images = page.get_images(full=True)
            if not images:
                # No image means no record, so skip the (expensive) summarization
                continue

            # The text and its summary are identical for every image on the page,
            # so compute them once per page instead of once per image.
            # Note: this is the first 10 lines of the page, not text positioned
            # around each individual image.
            text = page.get_text("text").strip()
            text_lines = text.split("\n")
            surrounding_text = " ".join(text_lines[:10])

            # Generate summary if text is long enough
            if len(surrounding_text) > 50:
                summary = summarizer(surrounding_text, max_length=50, min_length=20, do_sample=False)[0]["summary_text"]
            else:
                summary = surrounding_text  # Use original if too short

            for img_index, img in enumerate(images):
                xref = img[0]  # first element of the image tuple is its xref
                base_image = doc.extract_image(xref)
                img_bytes = base_image["image"]
                img_ext = base_image["ext"]  # Image format

                # Save image in its original format
                image_filename = f"{output_dir}/{pdf_file}_page{page_num + 1}_img{img_index + 1}.{img_ext}"
                with open(image_filename, "wb") as img_file:
                    img_file.write(img_bytes)

                # Store extracted info
                documents.append({
                    "pdf": pdf_file,
                    "page": page_num + 1,
                    "text": summary,
                    "image": image_filename
                })

# Save structured documents to JSON
json_path = os.path.join(output_dir, "multimodal_documents.json")
with open(json_path, "w") as json_file:
    json.dump(documents, json_file, indent=4)

print(f"✅ Processing completed. JSON saved in {json_path}")


In [None]:
# Verify the extraction: show the record count and a short preview
# rather than dumping the entire list into the cell output
print(f"{len(documents)} records extracted")
documents[:5]

In [None]:
!pip install fastembed

In [None]:
from fastembed import TextEmbedding, ImageEmbedding

class EmbedData:
    """Pair a list of documents with a text encoder and an image encoder.

    Attributes:
        documents: the list passed in, stored as-is.
        text_model / image_model: fastembed ``TextEmbedding`` / ``ImageEmbedding``
            instances built from the given model names.
        text_embed_dim / image_embed_dim: the ``dim`` value of each model's
            description.
    """

    def __init__(self,
                 documents,
                 text_model_name="Qdrant/clip-ViT-B-32-text",
                 image_model_name="Qdrant/clip-ViT-B-32-vision"):
        self.documents = documents

        # Text encoder and its vector size.
        # NOTE(review): _get_model_description is underscore-prefixed, i.e. not
        # part of the library's public surface; confirm it exists in the
        # installed fastembed version.
        self.text_model = TextEmbedding(model_name=text_model_name)
        self.text_embed_dim = self.text_model._get_model_description(text_model_name).dim

        # Image encoder and its vector size
        self.image_model = ImageEmbedding(model_name=image_model_name)
        self.image_embed_dim = self.image_model._get_model_description(image_model_name).dim

    def embed_texts(self, texts):
        """Return a list holding one embedding per item yielded for ``texts``."""
        return list(self.text_model.embed(texts))

    def embed_images(self, images):
        """Return a list holding one embedding per item yielded for ``images``."""
        return list(self.image_model.embed(images))

# Wrap the extracted records together with the default text/image encoders
embeddata = EmbedData(documents)

# Summaries and image paths, in the same order as `documents`
doc_texts = [record["text"] for record in documents]
doc_images = [record["image"] for record in documents]

# Embeddings are attached to the instance so later cells can read them from it
embeddata.text_embeds = embeddata.embed_texts(doc_texts)
embeddata.image_embeds = embeddata.embed_images(doc_images)

In [None]:
from google.colab import files
import shutil

# Folder to bundle (replace with your folder name); the archive is written
# next to it as "<folder>.zip"
folder_name = "/content/images"
zip_file = folder_name + ".zip"

# make_archive appends the ".zip" suffix itself, so it gets the bare base name
shutil.make_archive(base_name=folder_name, format="zip", root_dir=folder_name)

# Download the zip file through the Colab helper
files.download(zip_file)

In [None]:
!pip install qdrant-client

In [None]:
from qdrant_client import QdrantClient, models

class QdrantVDB:
    """Small wrapper around one Qdrant collection with two named vectors.

    Each point in the collection carries an "image" vector and a "text" vector,
    both compared with cosine distance.

    Args:
        collection_name: name of the collection to create / upload into.
        image_dim: size of the "image" vectors.
        text_dim: size of the "text" vectors.
        url: Qdrant endpoint. When empty, the ``QDRANT_URL`` environment
            variable is used if set.
        api_key: Qdrant API key. When empty, the ``QDRANT_API_KEY`` environment
            variable is used if set, so the key need not be typed into the
            notebook.
    """

    def __init__(self,
                 collection_name,
                 image_dim,
                 text_dim,
                 url="",
                 api_key=""):  # Add api_key for cloud access
        import os  # local import: this cell does not otherwise depend on os

        self.image_dim = image_dim
        self.text_dim = text_dim
        self.collection_name = collection_name

        # An empty string means "not provided": fall back to the environment,
        # and finally to None instead of forwarding "" to the client.
        # NOTE(review): with url=None the client is expected to apply its own
        # default endpoint - confirm against the installed qdrant-client.
        url = url or os.environ.get("QDRANT_URL") or None
        api_key = api_key or os.environ.get("QDRANT_API_KEY") or None

        # Initialize Qdrant client for cloud with API key
        self.client = QdrantClient(
            url=url,
            api_key=api_key,
            prefer_grpc=True
        )

    @staticmethod
    def _as_plain_list(vector):
        """Return ``vector`` as a plain Python list.

        Uses the object's own ``tolist()`` when it has one (array-like
        embeddings), otherwise materializes it with ``list()``.
        """
        to_list = getattr(vector, "tolist", None)
        return to_list() if callable(to_list) else list(vector)

    def create_collection(self):
        """Create the collection with "image" and "text" vectors unless it already exists."""
        if not self.client.collection_exists(self.collection_name):

            print(f"Creating collection '{self.collection_name}'...")

            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config={
                    "image": models.VectorParams(size=self.image_dim,
                                                 distance=models.Distance.COSINE),
                    "text": models.VectorParams(size=self.text_dim,
                                                distance=models.Distance.COSINE),
                }
            )

            print(f"Collection '{self.collection_name}' created successfully.")
        else:
            print(f"Collection '{self.collection_name}' already exists.")

    def upload_embeddings(self, embeddata):
        """Upload one point per document, using the document's list position as its ID.

        Args:
            embeddata: object exposing ``documents``, ``text_embeds`` and
                ``image_embeds``, all index-aligned.

        Raises:
            ValueError: if the three sequences differ in length, which would
                otherwise pair documents with the wrong vectors or drop data.
        """
        docs = embeddata.documents
        text_vecs = embeddata.text_embeds
        image_vecs = embeddata.image_embeds

        if not (len(docs) == len(text_vecs) == len(image_vecs)):
            raise ValueError(
                "documents, text_embeds and image_embeds must have the same length "
                f"(got {len(docs)}, {len(text_vecs)}, {len(image_vecs)})"
            )

        print(f"Uploading points to collection '{self.collection_name}'...")

        points = [
            models.PointStruct(
                id=idx,  # Unique ID for each point
                vector={
                    # Plain lists rather than array objects for the point model
                    "text": self._as_plain_list(text_vecs[idx]),
                    "image": self._as_plain_list(image_vecs[idx]),
                },
                payload=doc  # Original image and its caption
            )
            for idx, doc in enumerate(docs)
        ]

        self.client.upload_points(collection_name=self.collection_name, points=points)

        print(f"Uploaded {len(points)} points to collection '{self.collection_name}'.")

In [None]:
# Vector sizes come from the embedding models so the collection matches them
vector_db = QdrantVDB(
    collection_name="Chapter1_Edubot",
    image_dim=embeddata.image_embed_dim,
    text_dim=embeddata.text_embed_dim,
)

# Make sure the collection exists, then push every embedded record into it
vector_db.create_collection()
vector_db.upload_embeddings(embeddata)

In [None]:
class Retriever:
    """Look up stored points by embedding a text query and matching it against the "image" vectors."""

    def __init__(self, vector_db, embeddata):
        # vector_db must expose `.client` and `.collection_name`;
        # embeddata must expose `.embed_texts(texts)`
        self.vector_db = vector_db
        self.embeddata = embeddata

    def search(self, query, limit=3):
        """Return up to ``limit`` scored points for ``query``.

        Args:
            query: a query string (a pre-built sequence of strings is passed
                through unchanged; only its first embedding is used).
            limit: maximum number of hits to return.

        Returns:
            The list of hits from the vector store, each carrying the "image"
            and "text" payload fields.
        """
        # Always hand the embedder a sequence, so it does not depend on the
        # embedder special-casing a bare string
        texts = [query] if isinstance(query, str) else query
        query_embedding = list(self.embeddata.embed_texts(texts))[0]

        # Send a plain list rather than an array object
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()

        client = self.vector_db.client
        collection = self.vector_db.collection_name
        payload_fields = ["image", "text"]

        # Prefer the newer query API when the client has it; keep the legacy
        # `search` call for older clients.
        # NOTE(review): `search` is understood to be deprecated in recent
        # qdrant-client releases - confirm against the installed version.
        if hasattr(client, "query_points"):
            response = client.query_points(
                collection_name=collection,
                query=query_embedding,
                using="image",
                with_payload=payload_fields,
                limit=limit
            )
            return response.points

        return client.search(
            collection_name=collection,
            query_vector=("image", query_embedding),
            with_payload=payload_fields,
            limit=limit
        )


In [None]:
def Img_retriever(query, limit=1):
    """Print the stored text and show the stored image for the best match(es) of ``query``.

    Args:
        query: text to search for.
        limit: how many hits to show (defaults to 1, the previous fixed value).

    Relies on the notebook-level ``vector_db`` and ``embeddata`` objects and on
    the notebook's ``display`` helper.
    """
    # Imported here so the function does not depend on a later cell having
    # imported PIL before it is called
    from PIL import Image

    result = Retriever(vector_db, embeddata).search(query, limit=limit)

    for hit in result:
        print(hit.payload["text"])

        # The payload stores the path of the extracted image file
        display(Image.open(hit.payload["image"]))

In [None]:
from PIL import Image
import PIL

# Example question to look up
query = "What are Agricultural Implements"

# Fetch only the single best match
retriever = Retriever(vector_db, embeddata)
result = retriever.search(query, limit=1)

for hit in result:
    # Stored text first, then the image loaded from the stored path
    print(hit.payload["text"])

    display(Image.open(hit.payload["image"]))