In [None]:
!pip install sentence-transformers transformers torch torchvision ftfy pdfplumber python-docx matplotlib

In [4]:
import os
import torch
from PIL import Image
from torchvision import transforms
from sentence_transformers import SentenceTransformer, util
from transformers import CLIPProcessor, CLIPModel
import pdfplumber
from docx import Document
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# CLIP checkpoint: embeds images (and the text query that is compared to them).
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Sentence-transformer checkpoint: embeds document text and text queries.
text_model = SentenceTransformer("all-MiniLM-L6-v2")
text_model = text_model.to(device)

# In-memory store: one dict per embedded file (type, path, embedding, ...).
embedding_db = list()

In [10]:
def extract_text(file_path):
    """Return the plain text of a PDF, DOCX, TXT or Markdown file.

    Args:
        file_path: Path to the document. The file type is chosen from the
            extension of the final path component, case-insensitively.

    Returns:
        The extracted text as a single string (PDF pages and DOCX paragraphs
        are joined with newlines), or None when the extension is not one of
        pdf, docx, txt or md.
    """
    # splitext only inspects the last path component, so an extension-less
    # file that merely happens to be called "pdf" or "txt" is not mistaken
    # for a document of that type.
    ext = os.path.splitext(file_path)[1].lower().lstrip(".")

    if ext == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # A page's extract_text() result can be falsy; the `or ""` keeps
            # join() from failing on such pages.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    if ext == "docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    if ext in {"txt", "md"}:
        # "utf-8-sig" reads plain UTF-8 unchanged and additionally drops a
        # leading byte-order mark, which would otherwise end up in the text
        # as an invisible "\ufeff" character.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read()

    return None

In [None]:
def embed_file(file_path):
    """Embed one file and store the result in the global ``embedding_db``.

    Images (jpg, jpeg, png, bmp, webp) are embedded with the CLIP model.
    Documents (pdf, docx, txt, md) are converted to text with ``extract_text``
    and embedded with the sentence-transformer model. Embedding a path that is
    already stored replaces the previous entry, so re-running a notebook cell
    does not produce duplicate search results.

    Args:
        file_path: Path to the file. The file type is chosen from the
            extension of the final path component, case-insensitively.

    Returns:
        None. Success and failure are reported with ``print``.
    """
    # splitext only inspects the last path component, so extension-less files
    # are reported as unsupported instead of being misclassified.
    ext = os.path.splitext(file_path)[1].lower().lstrip(".")

    if ext in {"jpg", "jpeg", "png", "bmp", "webp"}:
        # convert() produces an independent in-memory copy, so the underlying
        # file handle can be released immediately.
        with Image.open(file_path) as img:
            image = img.convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            image_emb = clip_model.get_image_features(**inputs)
        # L2-normalise along the feature axis (per row rather than over the
        # whole tensor).
        image_emb = image_emb / image_emb.norm(p=2, dim=-1, keepdim=True)
        entry = {
            "type": "image",
            "path": file_path,
            "embedding": image_emb.cpu(),
        }
        label = "Image"

    elif ext in {"pdf", "docx", "txt", "md"}:
        text = extract_text(file_path)
        # Pages without extractable text are joined into a string of bare
        # newlines, which is truthy; treat whitespace-only text as "no text".
        if not text or not text.strip():
            print(f"Could not extract text: {file_path}")
            return
        # NOTE(review): the text model presumably truncates input beyond its
        # maximum sequence length, so long documents may be represented only
        # by their opening section - confirm, and consider chunking.
        text_emb = text_model.encode(text, convert_to_tensor=True)
        entry = {
            "type": "text",
            "path": file_path,
            "text": text,
            "embedding": text_emb.cpu(),
        }
        label = "Text"

    else:
        print(f"Unsupported file type: {file_path}")
        return

    # Drop any earlier entry for the same path (in place, so other references
    # to the list stay valid) before storing the fresh one.
    embedding_db[:] = [item for item in embedding_db if item["path"] != file_path]
    embedding_db.append(entry)
    print(f"{label} embedded: {file_path}")

In [11]:
def search(query, top_k=3, excerpt_chars=500):
    """Print the stored files that are most similar to a text query.

    The query is embedded with CLIP for comparison against image entries and
    with the sentence-transformer model for comparison against text entries.
    Each model is only run when ``embedding_db`` holds an entry of that type.

    Args:
        query: Free-text search string.
        top_k: Number of best-scoring entries to show.
        excerpt_chars: Maximum number of characters of a text document to
            print for a text hit.

    Returns:
        None. Results are printed; image hits are also drawn with matplotlib.
    """
    print(f"\nSearching for: '{query}'")

    if not embedding_db:
        print("No files have been embedded yet.")
        return

    indexed_types = {item["type"] for item in embedding_db}

    query_emb_img = None
    if "image" in indexed_types:
        # truncation=True cuts the query to the tokenizer's maximum length
        # instead of letting an over-long query fail inside the text encoder.
        clip_inputs = clip_processor(
            text=query, return_tensors="pt", padding=True, truncation=True
        ).to(device)
        with torch.no_grad():
            query_emb_img = clip_model.get_text_features(**clip_inputs)
        # L2-normalise along the feature axis (per row).
        query_emb_img = query_emb_img / query_emb_img.norm(p=2, dim=-1, keepdim=True)
        query_emb_img = query_emb_img.cpu()

    query_emb_txt = None
    if "text" in indexed_types:
        query_emb_txt = text_model.encode(query, convert_to_tensor=True).cpu()

    # NOTE(review): image scores come from CLIP and text scores from the
    # sentence-transformer model; the two similarity scales are presumably
    # not calibrated against each other, so the merged ranking may favour
    # one type - confirm before relying on cross-type ordering.
    results = []
    for item in embedding_db:
        if item["type"] == "image":
            score = util.pytorch_cos_sim(query_emb_img, item["embedding"])[0][0].item()
        elif item["type"] == "text":
            score = util.pytorch_cos_sim(query_emb_txt, item["embedding"])[0][0].item()
        else:
            # Entries of an unknown type are ignored.
            continue
        results.append((score, item))

    top_results = sorted(results, key=lambda x: x[0], reverse=True)[:top_k]

    for i, (score, item) in enumerate(top_results):
        print(f"\nRank #{i+1} — Score: {score:.4f} — Type: {item['type']}")
        if item["type"] == "image":
            # Keep the file open only while matplotlib reads the pixels.
            with Image.open(item["path"]) as img:
                plt.imshow(img)
                plt.title(f"{item['path']} (Score: {score:.4f})")
                plt.axis("off")
                plt.show()
        elif item["type"] == "text":
            full_text = item["text"]
            # Only show an ellipsis when the excerpt really was cut short.
            ellipsis = "..." if len(full_text) > excerpt_chars else ""
            print(f"File: {item['path']}")
            print(f"Excerpt: {full_text[:excerpt_chars]}{ellipsis}")


In [None]:
# Index the sample files: an image, a PDF and a plain-text document.
for sample_path in ("Anime.png", "OS.pdf", "a.txt"):
    embed_file(sample_path)

In [None]:
# Run the demo queries one after another against everything indexed so far.
for sample_query in ("many people", "football", "me", "coding"):
    search(sample_query)