In [104]:
import fitz
from PIL import Image
import io
import os

def load_pdf(pdf_path):
    """Extract the text and the embedded images of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        tuple: ``(texts, images)``.
            ``texts`` holds one string per page that contains
            non-whitespace text. Pages without text are skipped, so a
            list index is not a page number.
            ``images`` holds one RGB ``PIL.Image`` per image entry reported
            by ``page.get_images``, in page order.
    """
    texts = []
    images = []

    # The context manager closes the document even when extraction raises
    # part-way through; previously the document was never closed at all.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Extract text
            text = page.get_text()
            if text.strip():
                texts.append(text)

            # Extract images
            for img in page.get_images(full=True):
                xref = img[0]  # first field of the entry is used as the xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                # convert() decodes the pixel data right away, so the image
                # stays usable after the document has been closed.
                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                images.append(image)

    return texts, images


In [105]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

def chunk_text(texts, chunk_size=500, chunk_overlap=50):
    """Split raw page texts into overlapping chunks wrapped as ``Document``s.

    Args:
        texts: Iterable of strings to split.
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 50, the value that used to be hard-coded.

    Returns:
        list: One ``Document`` per chunk, in input order. No metadata is
        attached, so the originating text/page is not recorded.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return [
        Document(page_content=chunk)
        for text in texts
        for chunk in splitter.split_text(text)
    ]


In [106]:
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint name, so the model and its
# processor cannot accidentally be loaded from two different checkpoints.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Model weights are moved to `device`; the processor (tokenizer + image
# preprocessing) is device-independent and its outputs are moved per call.
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(device)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


In [107]:
def embed_texts(texts, batch_size=32):
    """Embed strings with the CLIP text encoder.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Number of strings sent through the model per forward
            pass, so a large corpus does not have to fit in memory at once.

    Returns:
        numpy.ndarray: One row per input string, in input order. For empty
        input an array of shape ``(0, projection_dim)`` is returned, mirroring
        how ``embed_images`` handles an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        ``truncation=True`` cuts every string at the tokenizer's maximum
        length, so the tail of a long chunk does not influence its embedding
        (presumably 77 tokens for this checkpoint -- confirm).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string must become a one-element batch; slicing it below would
    # otherwise split it into pieces of characters.
    texts = [texts] if isinstance(texts, str) else list(texts)

    if not texts:
        # The processor cannot build a batch from nothing; return an empty
        # matrix whose width matches what the model would have produced.
        return np.empty((0, clip_model.config.projection_dim), dtype=np.float32)

    batches = []
    for start in range(0, len(texts), batch_size):
        inputs = clip_processor(
            text=texts[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            features = clip_model.get_text_features(**inputs)

        batches.append(features.cpu().numpy())

    return np.concatenate(batches, axis=0)


In [108]:
def embed_images(images, batch_size=32):
    """Embed images with the CLIP image encoder.

    Args:
        images: Sequence of images accepted by the CLIP processor
            (``load_pdf`` supplies RGB ``PIL.Image`` objects).
        batch_size: Number of images sent through the model per forward
            pass, so a document with many images does not have to fit in
            memory at once.

    Returns:
        numpy.ndarray: One row per image, in input order. When ``images`` is
        empty a warning is printed and an array of shape
        ``(0, projection_dim)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if len(images) == 0:
        print("⚠️ No images found in document.")
        # Take the width from the loaded model instead of a hard-coded 512,
        # so it stays correct if the checkpoint is swapped.
        return np.empty((0, clip_model.config.projection_dim), dtype=np.float32)

    images = list(images)
    batches = []
    for start in range(0, len(images), batch_size):
        inputs = clip_processor(
            images=images[start:start + batch_size],
            return_tensors="pt",
        ).to(device)

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)

        batches.append(features.cpu().numpy())

    return np.concatenate(batches, axis=0)


In [109]:
from langchain_community.vectorstores import Chroma
import chromadb

def build_vectorstore(documents, embeddings, collection_name="multimodal_text"):
    """Store chunks and their precomputed embeddings in a fresh Chroma collection.

    Args:
        documents: Sequence of objects exposing ``page_content``.
        embeddings: One embedding per document, in the same order. Anything
            with ``tolist()`` (e.g. a NumPy array) or an iterable of rows.
        collection_name: Name of the collection to (re)create. Defaults to
            the name that used to be hard-coded.

    Returns:
        Chroma: LangChain wrapper around the collection. It is created with
        ``embedding_function=None``, so it holds only the vectors supplied
        here.

    Raises:
        ValueError: If the number of embeddings differs from the number of
            documents.
    """
    texts = [doc.page_content for doc in documents]
    if len(embeddings) != len(texts):
        raise ValueError(
            f"Got {len(embeddings)} embeddings for {len(texts)} documents; "
            "they must line up one-to-one."
        )
    ids = [f"doc_{i}" for i in range(len(texts))]

    client = chromadb.Client()

    # Best-effort removal of a collection with the same name (presumably left
    # over from re-running this cell). A missing collection is the expected
    # failure and is ignored. `Exception` instead of a bare `except` lets
    # KeyboardInterrupt / SystemExit propagate.
    try:
        client.delete_collection(collection_name)
    except Exception:
        pass

    collection = client.create_collection(name=collection_name)

    # Skip the insert for empty input and return an empty store instead
    # (Chroma presumably rejects empty id lists -- confirm).
    if ids:
        vectors = (
            embeddings.tolist()
            if hasattr(embeddings, "tolist")
            else [list(row) for row in embeddings]
        )
        collection.add(
            ids=ids,
            documents=texts,
            embeddings=vectors,
        )

    vectorstore = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=None,
    )

    return vectorstore


In [113]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def _top_k_indices(scores, top_k):
    """Return the indices of the ``top_k`` largest scores, best first."""
    # Reverse first, then slice: ``argsort()[-top_k:]`` returns *every* index
    # when top_k == 0, because ``[-0:]`` is the same slice as ``[0:]``.
    return scores.argsort()[::-1][:top_k]


def retrieve(query, vectorstore, image_embeddings, images, top_k=3):
    """Return the text chunks and images most similar to ``query``.

    The query is embedded with ``embed_texts`` and compared by cosine
    similarity against every stored text embedding and every image embedding.

    Args:
        query: Query string.
        vectorstore: Store returned by ``build_vectorstore``; its underlying
            ``_collection`` is read directly to fetch all embeddings.
        image_embeddings: Array with one row per entry of ``images``; may be
            ``None`` or empty.
        images: Images aligned with ``image_embeddings``.
        top_k: Maximum number of texts and of images to return.

    Returns:
        tuple: ``(retrieved_texts, retrieved_images)``, each ordered from
        most to least similar. Either list is empty when there is nothing to
        search or when ``top_k`` is 0.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # Embed query
    query_emb = embed_texts([query])

    # -------- TEXT RETRIEVAL --------
    results = vectorstore._collection.get(include=["embeddings", "documents"])
    stored = results["embeddings"]
    text_embs = np.array(stored) if stored is not None else np.empty((0, 0))

    retrieved_texts = []
    # cosine_similarity cannot compare against zero rows, so an empty
    # collection simply yields no texts.
    if len(text_embs) > 0:
        text_scores = cosine_similarity(query_emb, text_embs)[0]
        retrieved_texts = [
            results["documents"][i] for i in _top_k_indices(text_scores, top_k)
        ]

    # -------- IMAGE RETRIEVAL (SAFE) --------
    retrieved_images = []

    if image_embeddings is not None and len(image_embeddings) > 0:
        image_scores = cosine_similarity(query_emb, image_embeddings)[0]
        retrieved_images = [images[i] for i in _top_k_indices(image_scores, top_k)]
    else:
        print("ℹ️ No images available for retrieval.")

    return retrieved_texts, retrieved_images


In [114]:
from langchain.chat_models import init_chat_model
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage

from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

# Local seq2seq generator; every call is capped at 150 newly generated tokens.
hf_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",  # smaller, less storage
    max_new_tokens=150,
)

# LangChain wrapper so the pipeline can be called through `llm.invoke(...)`.
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Prompt that restricts the model to the retrieved context and asks for a
# short answer; `context` and `question` are filled in per query.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful AI assistant.

Use ONLY the information in the context to answer the question.
If the answer is not present, say "Not found in the document".

Context:
{context}

Question:
{question}

Answer in 2–3 complete sentences:
""",
)


Device set to use cpu


In [119]:
pdf_path = "pdf/p1.pdf"
query = "What are Neural Networks"

# --- Ingest: pull text and images out of the PDF, then chunk the text ---
texts, images = load_pdf(pdf_path)
documents = chunk_text(texts)

# Fail fast with a clear message when there is no text (e.g. a scanned,
# image-only PDF) instead of failing later inside embedding or indexing.
if not documents:
    raise ValueError(f"No extractable text found in {pdf_path!r}; nothing to index.")

# --- Embed and index ---
text_embeddings = embed_texts([doc.page_content for doc in documents])
image_embeddings = embed_images(images)
vectorstore = build_vectorstore(documents, text_embeddings)

# --- Retrieve the chunks and images closest to the query ---
retrieved_texts, retrieved_images = retrieve(
    query,
    vectorstore,
    image_embeddings,
    images,
)

# --- Generate an answer grounded in the retrieved text ---
context = "\n".join(retrieved_texts)
final_prompt = prompt.format(context=context, question=query)
response = llm.invoke(final_prompt)

print("\n✅ Final Answer:\n")
print(response.strip())


⚠️ No images found in document.
ℹ️ No images available for retrieval.

✅ Final Answer:

Inspired by the structure of the human brain. They consist of interconnected nodes called neurons, organized into layers: input layer, hidden layers, and output layer. Each neuron performs a weighted sum of inputs, adds a bias, and applies an activation function. Training a neural network involves adjusting weights using optimization algorithms such as Gradient Descent and Backpropagation. Deep Learning (DL) and reinforcement learning. Supervised learning uses labeled data, unsupervised learning works with unlabeled Neural Networks are inspired by the structure of the human brain. They consist of interconnected nodes called neurons, organized into layers: input layer, hidden layers, and output


In [120]:
from IPython.display import display

# Render each retrieved image inline, in the order `retrieve` returned them;
# the loop body never runs when no images were retrieved.
for retrieved_image in retrieved_images:
    display(retrieved_image)
