In [None]:
%pip install -r requirements.txt

In [None]:
import base64
import io
import os
import time
import warnings

import chromadb
import numpy as np
import pymupdf
from chromadb.config import Settings
from dotenv import load_dotenv
from langchain.schema import HumanMessage, StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from PIL import Image
from sentence_transformers import SentenceTransformer
warnings.filterwarnings("ignore")

In [None]:

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with an actionable message. Writing the result of os.getenv()
# back into os.environ raises an opaque TypeError when the key is missing,
# and is a no-op when it is present, so an explicit check replaces it.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in .env or in the environment."
    )

# Directory used as the ChromaDB persistence path (read from .env).
chroma_db_path = os.getenv("CHROMA_DB_PATH")

# Shared Gemini chat model used for image description.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp")

In [None]:
def encode_image_to_base64(image):
    """Serialize a PIL image as a PNG data URI.

    Args:
        image: A PIL ``Image`` (anything exposing ``mode``, ``convert`` and
            ``save`` with the same semantics).

    Returns:
        str: A string of the form ``data:image/png;base64,<payload>``.
    """
    # Modes the PNG writer can store directly. Anything else (CMYK is common
    # for images embedded in PDFs) makes ``save`` raise, so convert to RGB.
    png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}
    if image.mode not in png_modes:
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"

In [None]:
def generate_text_from_image(image):
    """Ask the Gemini chat model to transcribe and describe an image.

    Args:
        image: The image passed as the ``image_url`` part of a multimodal
            message; the extraction loop supplies the data URI string
            produced by ``encode_image_to_base64``.

    Returns:
        The ``content`` of the model's response.
    """
    # Plain string: the prompt has no placeholders, so no f-string is needed.
    # The prompt text itself is kept exactly as it was.
    template = """
  You are an expert at extracting and explaining the content of an image.
  - Extract **all textual content** from the image.
  - Detect and describe **tables, graphs, charts, and their relationships**.
  - If the image contains a **pipeline**, explain its **step-by-step process**.
  - Maintain formatting for tables and key visual structures.
  """
    messages = [
        HumanMessage(
            content=[
                {"type": "text", "text": template},
                {"type": "image_url", "image_url": image},
            ]
        )
    ]
    # ``invoke`` is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = llm.invoke(messages)
    return response.content


In [None]:
# Extract the text of every page and append an LLM-generated description of
# each embedded image to that page's text.
PDF_PATH = "Lecture 2 - Deep Feedforward Networks.pdf"
IMAGE_REQUEST_DELAY_SECONDS = 10  # pause after each image to respect API rate limits

texts = []         # raw text of each page
page_content = []  # raw text plus generated image descriptions, one entry per page

# The context manager closes the document even if extraction fails part-way.
with pymupdf.open(PDF_PATH) as doc:
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)

        text = page.get_text()
        texts.append(text)

        for img_index, img in enumerate(page.get_images(full=True)):
            # Decoding happens inside the try as well, so a single unreadable
            # image is reported and skipped instead of aborting the whole run.
            try:
                xref = img[0]  # XREF of the image
                base_image = doc.extract_image(xref)
                image = Image.open(io.BytesIO(base_image["image"]))
                image_base64 = encode_image_to_base64(image)
                gen_text = generate_text_from_image(image_base64)
                # Newline separator so the description does not run into the
                # last word of the preceding text.
                text += "\n" + gen_text
            except Exception as e:
                print(f"Error processing image on page {page_num}, image index {img_index}: {e}")

            time.sleep(IMAGE_REQUEST_DELAY_SECONDS)

        page_content.append(text)

In [None]:
# Splitter configuration: 2000-character chunks with a 300-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)

# Merge all pages into one string (space-separated), then chunk the result.
combined_text = " ".join(page_content)
texts = text_splitter.split_text(combined_text)

In [None]:
# Sentence-embedding model used to vectorize text chunks.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
def get_text_embedding(text):
    """Return the embedding that the module-level ``model`` produces for ``text``."""
    return model.encode(text)
# Embed every chunk in ``texts``; the resulting list keeps the chunk order.
embeddings_text = [get_text_embedding(chunk) for chunk in texts]



In [None]:
# Create the ChromaDB client that persists to the directory from .env.
#
# chromadb >= 0.4 exposes ``PersistentClient`` and rejects the legacy
# ``chroma_db_impl="duckdb+parquet"`` configuration, while older releases only
# understand the legacy form. Pick whichever the installed version supports.
if hasattr(chromadb, "PersistentClient"):
    chroma_settings = Settings(anonymized_telemetry=False)  # disable telemetry
    client = chromadb.PersistentClient(path=chroma_db_path, settings=chroma_settings)
else:
    chroma_settings = Settings(
        chroma_db_impl="duckdb+parquet",   # DuckDB with Parquet for persistence
        persist_directory=chroma_db_path,  # path from .env
        anonymized_telemetry=False,        # disable telemetry
    )
    client = chromadb.Client(chroma_settings)

# Create or load the collection that stores the PDF text chunks.
text_collection = client.get_or_create_collection(name="pdf_text_embedding")


In [None]:
# Index each page: split it into chunks, embed those chunks, and store them
# together with their 1-based page number.
for page_num, page_text in enumerate(page_content):
    # page_text is already one string; " ".join() on a string would insert a
    # space between every character, so split it directly.
    page_chunks = text_splitter.split_text(page_text)
    if not page_chunks:
        continue  # nothing to index for an empty page

    # Embed exactly the chunks being stored so every document is paired with
    # its own vector (indexing into embeddings computed for a different
    # chunking mismatches them and can run out of range).
    chunk_embeddings = model.encode(page_chunks).tolist()

    text_collection.add(
        documents=page_chunks,
        embeddings=chunk_embeddings,
        metadatas=[{"type": "text", "page_number": page_num + 1} for _ in page_chunks],
        ids=[f"page_{page_num + 1}_text_{i + 1}" for i in range(len(page_chunks))],
    )

In [None]:
def generate_questions(query):
    """Generate multiple-choice questions from the indexed PDF text.

    Every document stored in the ``pdf_text_embedding`` collection is placed
    in the prompt as context, followed by the user's request.

    Args:
        query: The user's request, inserted verbatim into the prompt.

    Returns:
        list[dict]: A single-element list whose dict holds the model's
        response content under the ``"generated_text"`` key.
    """
    text_collections = client.get_collection("pdf_text_embedding")
    documents = text_collections.get()["documents"]

    # Join the chunks into readable text; interpolating the list itself would
    # put its Python repr (brackets, quotes, escaped newlines) into the prompt.
    content = "\n\n".join(documents)

    # Template for generating questions from the PDF content
    template = f"""
    You are an AI assistant that generates Multiple Choice Questions (MCQs) based on the content of a given PDF document.

    ### User Query:
    "{query}"

    ### Context:
    The PDF document contains the following content (excluding the lecture overview):
    {content}

    ### Instructions:
    - Carefully analyze the content of the provided PDF, but **ignore the lecture overview** as it may contain titles that will be explained later.
    - Generate MCQs based on the textual content, ensuring that the questions are **specific**, **clear**, and **focused on content** without referencing any metadata of the PDF or the university.

    ### Output Format:
    Provide a list of MCQs like this:
    1. Question: What was the total increase in sales between Q1 and Q2?
      - a) $1,000
      - b) $1,500
      - c) $2,000
      - d) $2,500
      - Correct answer: c)

    Now, generate the MCQs:
    """

    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp")
    messages = [HumanMessage(content=template)]
    # ``invoke`` is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = llm.invoke(messages)
    return [{"generated_text": response.content}]

# Request a batch of questions and print the text the model returned.
query = "I want 40 questions"
result = generate_questions(query)
generated_text = result[0]["generated_text"]
print(StrOutputParser().parse(generated_text))
