In [None]:
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from pypdf import PdfReader
from langchain_pinecone import PineconeRerank
from langchain_core.documents import Document
import spacy

# -------------------- CONFIGURATION --------------------

# Load API keys from your environment (load via .env or export manually)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
assert OPENAI_API_KEY and PINECONE_API_KEY, "Please set OPENAI_API_KEY and PINECONE_API_KEY in your environment"

# Index & Namespace
INDEX_NAME = "rag-chunking-eval"
NAMESPACE = "semantic"

# Model Names
EMBED_MODEL = "text-embedding-ada-002"
RERANK_MODEL = "bge-reranker-v2-m3"

# Query Sample
TOP_K = 5
RERANK_TOP_K = 2

# -------------------- INITIALIZE --------------------

# OpenAI client
openai = OpenAI()

# Pinecone client & index
pc = Pinecone(api_key=PINECONE_API_KEY)
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
pine_index = pc.Index(INDEX_NAME)

# Reranker
reranker = PineconeRerank(model=RERANK_MODEL)

# -------------------- CHUNKING SETUP --------------------

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def semantic_chunking(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    chunks, current_chunk, current_length = [], [], 0

    for sentence in sentences:
        token_length = len(sentence.split())
        if current_length + token_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            overlap_sentences = current_chunk[-overlap:] if overlap < len(current_chunk) else current_chunk
            current_chunk, current_length = overlap_sentences[:], sum(len(s.split()) for s in overlap_sentences)
        current_chunk.append(sentence)
        current_length += token_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# -------------------- PDF LOADER --------------------

def load_pdf_text(pdf_path: str) -> str:
    """Extracts all selectable text pages from a PDF."""
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n"
    return full_text

# -------------------- INDEXING --------------------

def index_semantic_chunks(pdf_path: str):
    """Extract text, chunk, embed, and index into Pinecone."""
    raw_text = load_pdf_text(pdf_path)
    chunks = semantic_chunking(raw_text)
    print(f"🔍 Indexing {len(chunks)} chunks under namespace '{NAMESPACE}'")

    batch = []
    for idx, chunk in enumerate(chunks):
        emb = openai.embeddings.create(input=[chunk], model=EMBED_MODEL).data[0].embedding
        batch.append({
            "id": f"{NAMESPACE}-{idx}",
            "values": emb,
            "metadata": {"text": chunk}
        })
        # Upsert in batches of 100
        if len(batch) == 100:
            pine_index.upsert(vectors=batch, namespace=NAMESPACE)
            batch.clear()

    if batch:
        pine_index.upsert(vectors=batch, namespace=NAMESPACE)

# -------------------- RETRIEVAL --------------------

def retrieve_relevant_chunks(query: str, use_rerank: bool = True) -> list[Document]:
    q_emb = openai.embeddings.create(input=[query], model=EMBED_MODEL).data[0].embedding
    resp = pine_index.query(
        vector=q_emb,
        top_k=TOP_K,
        namespace=NAMESPACE,
        include_metadata=True
    )
    docs = [Document(page_content=m["metadata"]["text"], metadata=m["metadata"])
            for m in resp.matches]
    if use_rerank:
        docs = reranker.compress_documents(docs, query)  # no top_n here
    return docs

In [3]:
if __name__ == "__main__":
    pdf_file = "/Users/rohitbohra/Desktop/assignment/data/Test.pdf"
    
    # Step 1: Index once
    index_semantic_chunks(pdf_file)
    
    # Step 2: Perform a sample retrieval
    question = "What is a tangent to a circle and how does it relate to the radius?"
    results = retrieve_relevant_chunks(question, use_rerank=True)
    
    # Step 3: Display retrieved passages
    for i, doc in enumerate(results, start=1):
        snippet = doc.page_content.replace("\n", " ")
        print(f"\n--- Result {i} ---\n{snippet}...\n")

🔍 Indexing 159 chunks under namespace 'semantic'

--- Result 1 ---
In fact, the wheel moves along a line which is a tangent to the circle representing the wheel. Also, notice that in all positions, the radius through the point of contact with the ground appears to be at right angles to the tangent (see Fig. 10.4). We shall now prove this property of the tangent. Theorem 10.1 :  The tangent at any point of a circle is perpendicular to the radius through the point of contact. P roof :  We are given a circle with centre O and a tangent XY to the circle at a point P.  We need to prove that OP is perpendicular to XY . Fig. 10.4 Fig. 10.3 (ii) Reprint 2025-26  CIRCLES 147 Take a point Q on XY other than P and join OQ (see Fig. 10.5). The point Q must lie outside the circle. (Why? Note that if Q lies inside the circle, XY will become a secant and not a tangent to the circle). Therefore, OQ is longer than the radius OP of the circle. That is, OQ > OP . Since this happens for every point on the

In [4]:
from openai import OpenAI

client = OpenAI()

# Step 2: Perform a sample retrieval
question = "What does Theorem 10.1 say about tangents and radius?"
results = retrieve_relevant_chunks(question, use_rerank=True)

# Step 3: Display retrieved passages
for i, doc in enumerate(results, start=1):
    snippet = doc.page_content.replace("\n", " ")
    print(f"\n--- Result {i} ---\n{snippet}...\n")

# Step 4: Combine context from top-k retrieved documents
context = "\n\n".join([doc.page_content for doc in results])

# Step 5: Construct a RAG-style prompt
rag_prompt = f"""You are a helpful assistant. Use the context below to answer the question as accurately and clearly as possible.

Context:
{context}

Question:
{question}

Answer:"""

# Step 6: Generate answer using OpenAI ChatCompletion
response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # or "gpt-4" if available
    messages=[
        {"role": "system", "content": "You are a helpful assistant. Who has no internal knowledge. Use the context provided to give an answer"},
        {"role": "user", "content": rag_prompt}
    ],
    temperature=0
)

# Step 7: Print final answer
generated_answer = response.choices[0].message.content.strip()
print("\n🔍 Final Answer from LLM:\n", generated_answer)



--- Result 1 ---
Theorem 10.1 :  The tangent at any point of a circle is perpendicular to the radius through the point of contact. P roof :  We are given a circle with centre O and a tangent XY to the circle at a point P.  We need to prove that OP is perpendicular to XY . Fig. 10.4 Fig. 10.3 (ii) Reprint 2025-26  CIRCLES 147 Take a point Q on XY other than P and join OQ (see Fig. 10.5). The point Q must lie outside the circle. (Why? Note that if Q lies inside the circle, XY will become a secant and not a tangent to the circle). Therefore, OQ is longer than the radius OP of the circle. That is, OQ > OP . Since this happens for every point on the line XY  except the point P , OP  is the shortest of all the distances of the point O to the points of XY . So OP  is perpendicular to XY . (as shown in Theorem A1.7.) Remarks 1. By theorem above, we can also conclude that at any point on a circle there can be one and only one tangent. 2. The line containing the radius through the point of cont

In [5]:
from openai import OpenAI

client = OpenAI()

# Step 2: Perform a sample retrieval
question = "What does Theorem 10.1 say about tangents and radius?"
results = retrieve_relevant_chunks(question, use_rerank=False)

# Step 3: Display retrieved passages
for i, doc in enumerate(results, start=1):
    snippet = doc.page_content.replace("\n", " ")
    print(f"\n--- Result {i} ---\n{snippet}...\n")

# Step 4: Combine context from top-k retrieved documents
context = "\n\n".join([doc.page_content for doc in results])

# Step 5: Construct a RAG-style prompt
rag_prompt = f"""You are a helpful assistant. Use the context below to answer the question as accurately and clearly as possible.

Context:
{context}

Question:
{question}

Answer:"""

# Step 6: Generate answer using OpenAI ChatCompletion
response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # or "gpt-4" if available
    messages=[
        {"role": "system", "content": "You are a helpful assistant. Who has no internal knowledge. Use the context provided to give an answer"},
        {"role": "user", "content": rag_prompt}
    ],
    temperature=0
)

# Step 7: Print final answer
generated_answer = response.choices[0].message.content.strip()
print("\n🔍 Final Answer from LLM:\n", generated_answer)


--- Result 1 ---
(as shown in Theorem A1.7.) Remarks 1. By theorem above, we can also conclude that at any point on a circle there can be one and only one tangent. 2. The line containing the radius through the point of contact is also sometimes called the ‘normal’ to the circle at the point. EXERCISE 10.1 1. How many tangents can a circle have? 2. Fill in the blanks : (i) A tangent to a circle intersects it in   point (s). (ii) A line intersecting a circle in two points is called a  . (iii) A circle can have  parallel tangents at the most. (iv) The common point of a tangent to a circle and the circle is called  . 3. A tangent PQ at a point P of a circle of radius 5 cm meets a line through the centre O at a point Q so that OQ = 12 cm. Length PQ is : (A) 12 cm (B) 13 cm (C) 8.5 cm (D) 119  cm. 4. Draw a circle and two lines parallel to a given line such that one is a tangent and the other, a secant to the circle. 10.3 Number of Tangents from a Point on a Circle To get an idea of the num

In [None]:
 A tangent to a circle is a line that intersects the circle at exactly one point. 
The relationship between a tangent and the radius of a circle is that at the point of contact between the tangent and the circle,
the radius drawn to that point is perpendicular to the tangent line. 
This means that the radius and the tangent line form a right angle at the point of contact.



 A tangent to a circle is a line that touches the circle at exactly one point, known as the point of contact.
The key relationship between a tangent and the radius of a circle is that at the point of contact, 
the tangent is perpendicular to the radius through that point.
This means that the radius and the tangent form a right angle at the point where they meet on the circle.

In [None]:
 The lengths of tangents drawn from an external point to a circle are equal, as stated in Theorem 10.2.
This can be proven by considering a circle with center O, an external point P,
 and two tangents PQ and PR drawn from P to the circle. By joining OP, OQ, and OR,
 it can be shown that the lengths of the tangents PQ and PR are equal. 
 This equality is established by proving that the triangles OQP and ORP are congruent using the RHS
 (Right Angle-Hypotenuse-Side) criterion.



The lengths of tangents drawn from an external point to a circle are equal. 
This can be proven by considering a circle with center O, a point P lying outside the circle, 
and two tangents PQ and PR on the circle from P. 
By joining OP, OQ, and OR, it can be shown that in right triangles OQP and ORP, OQ = OR (radii of the same circle) and
OP = OP (common). Therefore, triangles OQP and ORP are congruent by the RHS (Right-Hypotenuse-Side) criterion, 
leading to the conclusion that PQ = PR (by CPCT - Corresponding Parts of Congruent Triangles are Congruent).






Given a circle with center O and two tangents PQ and PR from an external point P, join OP, OQ, and OR.
 Angles OQP and ORP are right angles (by Theorem 10.1). In right triangles OQP and ORP, OQ = OR (radii), OP = OP (common),
 so triangles OQP and ORP are congruent by RHS congruence. 
Thus, PQ = PR by CPCT.