In [1]:
# Cell 1: Setup
import sys
import os
import logging

# 1. Make the project's 'src' package importable.
# The kernel's working directory is appended to sys.path (only if it is not
# already listed), which avoids "ModuleNotFoundError: No module named src"
# when the notebook is started from the folder that contains 'src'.
_cwd = os.getcwd()
if _cwd not in sys.path:
    sys.path.append(_cwd)

# 2. Import your existing RAG services
from src.db import q_client, setup_collection, get_embedding
from src.services.ingestion import ingest_source
from src.services.retrieval import query_knowledge_base
from src.config import COLLECTION_NAME

# 3. Initialize Database
setup_collection()

# 4. Check Connection
# Only the network call sits inside the try block, so a problem while
# formatting the report below is never mislabelled as a connection failure.
try:
    info = q_client.get_collection(COLLECTION_NAME)
except Exception as e:
    print(f"‚ùå Connection Failed: {e}")
else:
    # The Qdrant client models allow `vectors` to be either a single vector
    # config or a mapping of named vector configs (it may also be absent).
    # Report the size for whichever shape is present instead of assuming `.size`.
    vectors = info.config.params.vectors
    if isinstance(vectors, dict):
        vector_size = {name: params.size for name, params in vectors.items()}
    else:
        vector_size = getattr(vectors, "size", None)

    print("‚úÖ SUCCESSFULLY CONNECTED!")
    print(f"   Collection Name: {COLLECTION_NAME}")
    print(f"   Current Chunk Count: {info.points_count}")
    print(f"   Vector Size: {vector_size}")

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ SUCCESSFULLY CONNECTED!
   Collection Name: internal_knowledge_base
   Current Chunk Count: 67
   Vector Size: 1536


In [5]:
# Cell 2: Ingest Sources
# Each entry is a URL or a local path (use r"..." for Windows paths).
# Every source is passed to ingest_source() in turn; a failure on one source
# is reported and does not stop the remaining sources from being processed.
sources = [
    # Example 1: Your local resume (Uncomment and edit path)
    # r"C:\Users\shriv\Downloads\your_resume.pdf",

    # Example 2: The "Attention Is All You Need" paper
    "https://arxiv.org/pdf/1706.03762",

    # Example 3: Small test file
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
]

print("üöÄ Starting Ingestion...\n")

for source in sources:
    print(f"üîπ Processing: {source}")
    try:
        # Calls the function from your src/services/ingestion.py
        result = ingest_source(source)
    except Exception as e:
        print(f"   ‚ùå FAILED: {e}")
    else:
        # NOTE(review): the returned message is shown with a success marker
        # even when ingest_source reports that nothing was ingested; confirm
        # its return contract before relying on the marker alone.
        print(f"   ‚úÖ {result}")

# Final Status Check
# The collection stores one point per text chunk, so this is a chunk count,
# not a document count.
count = q_client.count(collection_name=COLLECTION_NAME).count
print(f"\nüìä Total Chunks in DB: {count}")

üöÄ Starting Ingestion...

üîπ Processing: https://arxiv.org/pdf/1706.03762


[INFO] Processing: https://arxiv.org/pdf/1706.03762
Token indices sequence length is longer than the specified maximum sequence length for this model (968 > 512). Running this sequence through the model will result in indexing errors
[INFO] Embedding 67 chunks...
[SUCCESS] Ingested 67 chunks.
[INFO] Processing: https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf


   ‚úÖ Successfully ingested 67 chunks from https://arxiv.org/pdf/1706.03762
üîπ Processing: https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf
   ‚úÖ No valid text chunks found.

üìä Total Documents in DB: 67


[INFO] Embedding 0 chunks...


In [2]:
# Cell 3: Ask Questions
# Sends one question to the knowledge base and renders the reply as Markdown.
from IPython.display import display, Markdown

# --- ENTER YOUR QUESTION HERE ---
query = "What is the main topic of the paper?"
# --------------------------------

print(f"üîé Searching for: '{query}'...\n")

# query_knowledge_base comes from src/services/retrieval.py.
# NOTE(review): whatever it returns is rendered under the "Answer" heading,
# including error messages returned as plain text; confirm how it signals failure.
response = query_knowledge_base(query)

# Build the Markdown text first, then hand it to the rich display.
answer_markdown = f"### ü§ñ Answer:\n{response}"
display(Markdown(answer_markdown))

üîé Searching for: 'What is the main topic of the paper?'...



### ü§ñ Answer:
Error: Qdrant Client version mismatch. Try running 'pip install --upgrade qdrant-client'

In [None]:
# Cell 4: DANGER ZONE - Clear Database
# Deletes every point in the collection after an explicit typed confirmation.
# Anything other than the exact text DELETE cancels the operation.
from qdrant_client.http import models

confirm = input("Type 'DELETE' to wipe the database: ")

if confirm == "DELETE":
    q_client.delete(
        collection_name=COLLECTION_NAME,
        points_selector=models.FilterSelector(
            filter=models.Filter() # Empty filter selects everything
        ),
        wait=True,  # block until the delete is applied so the count below is meaningful
    )
    # Verify the outcome instead of assuming the collection is empty.
    remaining = q_client.count(collection_name=COLLECTION_NAME).count
    if remaining == 0:
        print("üóëÔ∏è Database wiped clean. Count is now 0.")
    else:
        print(f"‚ùå Wipe incomplete: {remaining} chunks still in DB.")
else:
    print("‚ùå Operation cancelled.")