In [174]:
# install dependencies first (if not done)
# pip install pymupdf sentence-transformers faiss-cpu

import fitz  # PyMuPDF (fast PDF reader)
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


# ------------------------------
# Step 1. Load PDF and stream text
# ------------------------------
def process_pdf(path, chunk_size=500, overlap=50):
    """Stream overlapping word-window chunks from a PDF, one page at a time.

    Each page's text is split on whitespace and cut into windows of at most
    ``chunk_size`` words; consecutive windows on the same page share
    ``overlap`` words. Chunks never span a page boundary.

    Args:
        path: Location of the PDF, passed unchanged to ``fitz.open``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Yields:
        str: The words of one chunk joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` and ``overlap`` are inconsistent.
            Because this is a generator, the error surfaces on the first
            iteration rather than at call time.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size gives a zero step; overlap > chunk_size gives a
        # negative step, which would make range() empty and drop every page.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap

    # The context manager releases the document once the generator is
    # exhausted, closed, or garbage-collected.
    with fitz.open(path) as doc:
        for page in doc:
            text = page.get_text("text")
            if not text:
                continue
            words = text.split()
            for start in range(0, len(words), step):
                yield " ".join(words[start:start + chunk_size])
                # This window already reached the end of the page; any later
                # window would only repeat words that were just emitted.
                if start + chunk_size >= len(words):
                    break


# ------------------------------
# Step 2. Create chunks + embeddings
# ------------------------------
def build_embeddings(path, model_name="all-MiniLM-L6-v2"):
    """Chunk a PDF and embed every chunk with a sentence-transformer model.

    Args:
        path: Location of the PDF, forwarded to ``process_pdf``.
        model_name: Name handed to ``SentenceTransformer`` to load the model.

    Returns:
        tuple: ``(chunks, embeddings, model)`` where ``chunks`` is the list of
        chunk strings, ``embeddings`` is a 2-D float32 array with one row per
        chunk (zero rows when the PDF produced no chunks), and ``model`` is
        the loaded ``SentenceTransformer`` instance.
    """
    model = SentenceTransformer(model_name)
    chunks = list(process_pdf(path))

    if chunks:
        # One batched call instead of one encode() call per chunk.
        embeddings = np.asarray(model.encode(chunks), dtype=np.float32)
    else:
        # Keep the result 2-D so downstream code can still read shape[1].
        dim = model.get_sentence_embedding_dimension() or 0
        embeddings = np.empty((0, dim), dtype=np.float32)

    return chunks, embeddings, model


# ------------------------------
# Step 3. Build FAISS index
# ------------------------------
def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and load the given vectors into it.

    Args:
        embeddings: 2-D array of vectors; its second axis length is used as
            the index dimensionality.

    Returns:
        The ``faiss.IndexFlatL2`` instance after ``embeddings`` were added.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index


# ------------------------------
# Step 4. Query the chatbot
# ------------------------------
def query_bot(query, model, chunks, index, k=3):
    """Return the stored chunks that are nearest to ``query``.

    Args:
        query: Question text to embed.
        model: Object whose ``encode`` turns a list of strings into vectors.
        chunks: Sequence of chunk texts, positionally aligned with the
            vectors held by ``index``.
        index: Object whose ``search(vectors, k)`` returns
            ``(distances, labels)``.
        k: Number of neighbours requested from the index.

    Returns:
        list: Up to ``k`` chunk texts in the order the index returned them.
        Fewer are returned when the index holds fewer than ``k`` vectors.
    """
    q_emb = model.encode([query])
    _, neighbours = index.search(q_emb, k)
    # FAISS pads missing neighbours with label -1; without this filter
    # chunks[-1] would silently return the last chunk as a bogus hit.
    return [chunks[idx] for idx in neighbours[0] if idx >= 0]


# ------------------------------
# Run the pipeline on a PDF (this notebook uses py.pdf)
# ------------------------------
pdf_path = "py.pdf"  # <-- replace with the path to your own PDF

# Chunk the PDF and embed every chunk. `chunks` and `model` are kept as
# notebook-level names because get_all_answers() reads them as globals.
chunks, embeddings, model = build_embeddings(pdf_path)

# Persist the embedding matrix only (optional). The chunk texts are not
# saved here, so this file alone is not enough to rebuild the retriever.
np.save("py.npy", embeddings, allow_pickle=False)

# Build the FAISS index; `index` is also read as a global by get_all_answers().
index = build_faiss_index(embeddings)




In [170]:
def get_all_answers(questions):
    """Run retrieval for each question and collect the results in order.

    Uses the notebook-level ``model``, ``chunks`` and ``index`` objects, so
    the cell that builds them must have been executed first.

    Args:
        questions: Iterable of question strings.

    Returns:
        list: One entry per question, each being whatever ``query_bot``
        returned for that question with ``k=3``.
    """
    return [
        query_bot(single_question, model, chunks, index, k=3)
        for single_question in questions
    ]

In [184]:
# A list (not a bare string) because get_all_answers() iterates over questions.
question = ["what are the different data types in python?"]

# One list of retrieved chunks per question.
text_to_summarize = get_all_answers(question)

In [185]:
# Display the retrieved chunks: one inner list of chunk texts per question.
text_to_summarize 

[['Python Tutorial, Release 3.7.0 42 Chapter 5. Data Structures',
  'Python Tutorial, Release 3.7.0 Python is an easy to learn, powerful programming language. It has eﬃcient high-level data structures and a simple but eﬀective approach to object-oriented programming. Python’s elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms. The Python interpreter and the extensive standard library are freely available in source or binary form for all major platforms from the Python Web site, https://www.python.org/, and may be freely distributed. The same site also contains distributions of and pointers to many free third party Python modules, programs and tools, and additional documentation. The Python interpreter is easily extended with new functions and data types implemented in C or C++ (or other languages callable from C). Python is also suitable as an extension langua

In [None]:
# --- Gemini Summarization Utility ---
"""Summarize the retrieved text with the Google Gemini API.
Prerequisites:
1. pip install google-generativeai (if not installed)
2. Set environment variable GEMINI_API_KEY with your key.

Usage:
- Run the earlier cells first so that `question` (a list whose first item is
  the question text) and `text_to_summarize` exist.
- Adjust model name or parameters as needed.
"""

import os, textwrap, json

try:
    import google.generativeai as genai
except ImportError as exc:
    # Chain the original error so the real import failure stays visible.
    raise ImportError("google-generativeai not installed. Install with: pip install google-generativeai") from exc

# Read the key from the environment instead of hardcoding it in the notebook:
# a literal placeholder is always truthy, so the check below could never fire,
# and a real key pasted here would leak when the notebook is shared.
API_KEY = os.environ.get('GEMINI_API_KEY', '').strip()
if not API_KEY:
    raise EnvironmentError('Set GEMINI_API_KEY environment variable before running this cell.')

genai.configure(api_key=API_KEY)
MODEL_NAME = 'gemini-1.5-flash'

# The prompt embeds the first question and the raw retrieval result
# (a list of chunk lists) produced by the earlier cells.
prompt = f"""
Summarize the following content as bullet and focus on the original question asked {question[0]} if {question[0]} is an mcq provide the right option to pick if question is not mcq just provide one word answer.
Text:
{text_to_summarize}
"""

gemini_model = genai.GenerativeModel(MODEL_NAME)
response = gemini_model.generate_content(prompt)

summary = response.text
print('\nSummary:\n')
print(summary)



Summary:

* **Numbers:**  Integers, floating-point numbers, complex numbers.
* **Strings:** Text enclosed in single (' ') or double (" ") quotes.
* **Lists:** Ordered, mutable (changeable) sequences of items.
* **Tuples:** Ordered, immutable (unchangeable) sequences of items.
* **Dictionaries:** Unordered collections of key-value pairs.


If the question were multiple choice, the best option would be the one that includes all of the above.

