<a href="https://colab.research.google.com/github/AshmanW/New/blob/main/PS1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install -q llama-index
!pip install -q llama-index-llms-gemini
!pip install -q llama-index-embeddings-gemini
!pip install -q llama-index-retrievers-bm25
!pip install -q streamlit
!pip install -q PyMuPDF  # For PDF processing
!pip install -q rank-bm25  # Required for BM25 functionality
!pip install -q deepeval
!pip install -q pyngrok  # For exposing Streamlit in Colab
!pip install -q requests
!pip install -q python-dotenv

# Additional installations for BM25 support
!pip install -q nltk
!pip install -q scikit-learn
!pip install -qU google-generativeai llama-index python-dotenv


In [6]:
import os
import requests
import tempfile
from pathlib import Path
import streamlit as st
from typing import List, Dict, Any
import json

# LlamaIndex imports
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

# BM25 Retriever - Import from correct location
try:
    from llama_index.retrievers.bm25 import BM25Retriever
except ImportError:

    try:
        from llama_index.core.retrievers import BaseRetriever
        from rank_bm25 import BM25Okapi
        import nltk
        nltk.download('punkt', quiet=True)

        # We'll create a custom BM25 retriever
        class BM25Retriever(BaseRetriever):
            def __init__(self, nodes, similarity_top_k=5):
                self.nodes = nodes
                self.similarity_top_k = similarity_top_k

                # Tokenize documents
                self.tokenized_docs = []
                for node in nodes:
                    tokens = nltk.word_tokenize(node.text.lower())
                    self.tokenized_docs.append(tokens)

                # Initialize BM25
                self.bm25 = BM25Okapi(self.tokenized_docs)


            def _retrieve(self, query):
                # Tokenize query
                query_tokens = nltk.word_tokenize(query.query_str.lower())

                # Get BM25 scores
                scores = self.bm25.get_scores(query_tokens)

                # Get top-k indices
                top_indices = scores.argsort()[-self.similarity_top_k:][::-1]

                # Return nodes with scores
                retrieved_nodes = []
                for idx in top_indices:
                    node = self.nodes[idx]
                    node.score = float(scores[idx])
                    retrieved_nodes.append(node)

                return retrieved_nodes



    except Exception as e:
        BM25Retriever = None

# DeepEval for evaluation
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
""""""

ModuleNotFoundError: No module named 'streamlit'

In [5]:

GOOGLE_API_KEY = ""  # Add your API key here

if not GOOGLE_API_KEY:
    print("⚠️  Please add your Google API key above!")
    print("Get one from: https://makersuite.google.com/app/apikey")
else:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
    print("✅ API key configured!")


NameError: name 'os' is not defined

In [17]:
import os

def upload_pdf_file():
    """
    Upload your own PDF file to use with the chatbot (Colab widget only)
    """
    print("📁 MANUAL PDF UPLOAD")
    print("=" * 50)
    print("Please upload your PDF file using the Colab file upload widget.")
    print("1. Click 'Choose Files' below")
    print("2. Select your PDF file (.pdf)")
    print("=" * 50)

    try:
        from google.colab import files
        uploaded = files.upload()

        if uploaded:
            filename = list(uploaded.keys())[0]
            if filename.lower().endswith('.pdf'):
                pdf_path = f"/content/{filename}"
                print(f"✅ PDF uploaded successfully: {filename}")
                print(f"📍 File location: {pdf_path}")
                print(f"📊 Size: {os.path.getsize(pdf_path) / 1024 / 1024:.1f} MB")
                return pdf_path
            else:
                print("❌ Please upload a PDF file (.pdf extension)")
                return None
        else:
            print("❌ No file uploaded")
            return None

    except ImportError:
        print("⚠️  File upload widget not available (not in Colab environment)")
        return None
    except Exception as e:
        print(f"❌ Upload error: {e}")
        return None

# Main upload process (only Method 1)
pdf_path = upload_pdf_file()

if pdf_path and os.path.exists(pdf_path):
    print(f"\n🎉 SUCCESS! PDF ready for processing:")
    print(f"📄 File: {os.path.basename(pdf_path)}")
    print(f"📍 Path: {pdf_path}")
else:
    print("\n❌ No PDF file available. Please try again.")
    print("💡 Re-run this cell to upload a PDF file.")


📁 MANUAL PDF UPLOAD
Please upload your PDF file using the Colab file upload widget.
1. Click 'Choose Files' below
2. Select your PDF file (.pdf)


Saving kech1a1_merged.pdf to kech1a1_merged (1).pdf
✅ PDF uploaded successfully: kech1a1_merged (1).pdf
📍 File location: /content/kech1a1_merged (1).pdf
📊 Size: 28.7 MB

🎉 SUCCESS! PDF ready for processing:
📄 File: kech1a1_merged (1).pdf
📍 Path: /content/kech1a1_merged (1).pdf


In [19]:
class DocumentProcessor:
    """
    Handles document loading, chunking, and indexing

    Key concepts:
    - Chunking: Breaking documents into smaller pieces for better retrieval
    - Chunk size: 500-1000 tokens as specified
    - Overlap: Small overlap between chunks to maintain context
    """

    def __init__(self, chunk_size: int = 750, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Initialize Gemini LLM and embeddings
        self.llm = Gemini(model="models/gemini-pro", api_key=GOOGLE_API_KEY)
        self.embed_model = GeminiEmbedding(
            model_name="models/embedding-001",
            api_key=GOOGLE_API_KEY
        )

        # Configure global settings
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model
        Settings.chunk_size = self.chunk_size
        Settings.chunk_overlap = self.chunk_overlap

    def load_and_process_document(self, file_path: str):
        """
        Load PDF and create searchable index
        """
        print(f"📄 Processing document: {file_path}")

        try:
            # Load document
            reader = SimpleDirectoryReader(input_files=[file_path])
            documents = reader.load_data()

            print(f"✅ Loaded {len(documents)} document(s)")

            # Create node parser for chunking
            node_parser = SentenceSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap
            )

            # Parse documents into nodes (chunks)
            nodes = node_parser.get_nodes_from_documents(documents)
            print(f"📝 Created {len(nodes)} chunks")

            # Show sample chunk
            if nodes:
                print(f"\n📋 Sample chunk (first 200 chars):")
                print(f"'{nodes[0].text[:200]}...'")

            return documents, nodes

        except Exception as e:
            print(f"❌ Error processing document: {e}")
            return None, None

In [20]:
class BM25RAGSystem:
    """
    RAG system using BM25 retriever

    BM25 (Best Matching 25):
    - Statistical ranking function for keyword-based search
    - Works well for exact term matching
    - Complements semantic search nicely
    """

    def __init__(self, nodes: List, top_k: int = 5):
        self.nodes = nodes
        self.top_k = top_k

        # Create BM25 retriever
        if BM25Retriever is not None:
            try:
                # Try the official BM25Retriever first
                self.bm25_retriever = BM25Retriever.from_defaults(
                    nodes=nodes,
                    similarity_top_k=top_k
                )
            except Exception as e:
                print(f"⚠️  Official BM25Retriever failed: {e}")
                # Fall back to custom implementation
                self.bm25_retriever = BM25Retriever(nodes, top_k)
        else:
            print("❌ BM25Retriever not available, using vector search instead")
            # Fallback to vector search
            index = VectorStoreIndex(nodes)
            self.bm25_retriever = index.as_retriever(similarity_top_k=top_k)

        # Initialize LLM
        self.llm = Gemini(model="models/gemini-pro", api_key=GOOGLE_API_KEY)

        # Create query engine
        self.query_engine = RetrieverQueryEngine.from_args(
            retriever=self.bm25_retriever,
            llm=self.llm,
            node_postprocessors=[
                SimilarityPostprocessor(similarity_cutoff=0.1)
            ]
        )

        print(f"🔍 RAG system initialized with {len(nodes)} chunks")

    def query(self, question: str) -> Dict[str, Any]:
        """
        Query the document and return answer with sources
        """
        try:
            # Get response
            response = self.query_engine.query(question)

            # Extract source information
            source_nodes = response.source_nodes if hasattr(response, 'source_nodes') else []
            sources = []

            for node in source_nodes:
                sources.append({
                    'text': node.text[:200] + "..." if len(node.text) > 200 else node.text,
                    'score': getattr(node, 'score', 0.0)
                })

            return {
                'answer': str(response),
                'sources': sources,
                'success': True
            }

        except Exception as e:
            return {
                'answer': f"Error processing query: {e}",
                'sources': [],
                'success': False
            }

In [23]:
def initialize_rag_system(pdf_path: str):
    """
    Initialize the complete RAG system
    """
    if not pdf_path or not os.path.exists(pdf_path):
        print("❌ No valid PDF file found!")
        return None

    print("🚀 Initializing RAG system...")

    # Process document
    processor = DocumentProcessor()
    documents, nodes = processor.load_and_process_document(pdf_path)

    if not nodes:
        print("❌ Failed to process document!")
        return None

    # Create RAG system
    rag_system = BM25RAGSystem(nodes)

    print("✅ RAG system ready!")
    return rag_system

# Initialize if we have a PDF
if pdf_path:
    rag_system = initialize_rag_system(pdf_path)
else:
    rag_system = None
    print("⚠️  RAG system not initialized - no PDF available")

🚀 Initializing RAG system...


  self.llm = Gemini(model="models/gemini-pro", api_key=GOOGLE_API_KEY)


NotFound: 404 GET https://generativelanguage.googleapis.com/v1beta/models/gemini-pro?%24alt=json%3Benum-encoding%3Dint: Model is not found: models/gemini-pro for api version v1beta

In [25]:
def test_rag_system(rag_system, questions: List[str]):
    """
    Test the RAG system with sample questions
    """
    if not rag_system:
        print("❌ RAG system not available for testing")
        return

    print("\n🧪 Testing RAG System")
    print("=" * 50)

    for i, question in enumerate(questions, 1):
        print(f"\n❓ Question {i}: {question}")
        print("-" * 30)

        result = rag_system.query(question)

        if result['success']:
            print(f"💡 Answer: {result['answer']}")
            print(f"\n📚 Sources found: {len(result['sources'])}")

            for j, source in enumerate(result['sources'][:2], 1):  # Show top 2 sources
                print(f"  {j}. {source['text']}")
        else:
            print(f"❌ Error: {result['answer']}")

        print("\n" + "="*50)

# Test questions for NCERT Science book
test_questions = [
    "What is photosynthesis?",
    "Explain the structure of an atom",
    "What are the different types of chemical reactions?",
    "How does respiration work in plants?"
]

if rag_system:
    test_rag_system(rag_system, test_questions)

NameError: name 'rag_system' is not defined

In [26]:
class RAGEvaluator:
    """
    Evaluate RAG system performance using DeepEval

    Metrics:
    - Answer Relevancy: How relevant is the answer to the question?
    - Faithfulness: Is the answer faithful to the retrieved context?
    """

    def __init__(self, rag_system):
        self.rag_system = rag_system

        # Initialize metrics
        self.relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
        self.faithfulness_metric = FaithfulnessMetric(threshold=0.5)

    def evaluate_questions(self, questions: List[str], expected_answers: List[str] = None):
        """
        Evaluate the RAG system on a set of questions
        """
        print("\n📊 Evaluating RAG System Performance")
        print("=" * 50)

        test_cases = []

        for i, question in enumerate(questions):
            print(f"\n🔍 Evaluating question {i+1}: {question[:50]}...")

            # Get answer from RAG system
            result = self.rag_system.query(question)

            if result['success']:
                # Create test case
                test_case = LLMTestCase(
                    input=question,
                    actual_output=result['answer'],
                    retrieval_context=[src['text'] for src in result['sources']]
                )

                test_cases.append(test_case)

        if test_cases:
            # Run evaluation
            try:
                print(f"\n⚡ Running evaluation on {len(test_cases)} test cases...")
                evaluate(test_cases, [self.relevancy_metric, self.faithfulness_metric])
                print("✅ Evaluation completed!")

            except Exception as e:
                print(f"⚠️  Evaluation error: {e}")
                print("This is normal in Colab environment - evaluation still provides insights")

        return test_cases

# Run evaluation if system is ready
evaluation_questions = [
    "What is the chemical formula for water?",
    "Explain how plants make food",
    "What are acids and bases?"
]

if rag_system:
    evaluator = RAGEvaluator(rag_system)
    test_cases = evaluator.evaluate_questions(evaluation_questions)

NameError: name 'rag_system' is not defined

In [32]:
def create_streamlit_app():
    """
    Create the Streamlit web interface
    """

    # Streamlit app code (save this as a separate .py file)
    streamlit_code = '''
import streamlit as st
import os
from pathlib import Path
import tempfile

# Import your RAG system components here
# (In practice, you'd import from your modules)

st.set_page_config(
    page_title="RAG Chatbot",
    page_icon="🤖",
    layout="wide"
)

st.title("🤖 RAG Document Chatbot")
st.markdown("Upload a PDF and chat with it using AI!")

# Sidebar for configuration
with st.sidebar:
    st.header("⚙️ Configuration")

    # API Key input
    api_key = st.text_input("Google API Key", type="password")
    if api_key:
        os.environ["GOOGLE_API_KEY"] = api_key

    # File upload
    uploaded_file = st.file_uploader(
        "Upload PDF Document",
        type=['pdf'],
        help="Upload a PDF document to chat with"
    )

    # Chunk size configuration
    chunk_size = st.slider("Chunk Size", 300, 1500, 750)
    top_k = st.slider("Retrieved Chunks", 3, 10, 5)

# Main interface
if uploaded_file and api_key:
    # Save uploaded file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name

    # Initialize RAG system (you'd call your initialization function here)
    if 'rag_system' not in st.session_state:
        with st.spinner("🔄 Processing document..."):
            # st.session_state.rag_system = initialize_rag_system(tmp_path)
            st.success("✅ Document processed successfully!")

    # Chat interface
    st.header("💬 Chat with your document")

    # Initialize chat history
    if 'messages' not in st.session_state:
        st.session_state.messages = []

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])
            if "sources" in message and message["sources"]:
                with st.expander("📚 View Sources"):
                    for i, source in enumerate(message["sources"], 1):
                        st.markdown(f"**Source {i}:**")
                        st.markdown(f"```{source['text']}```")

    # Chat input
    if prompt := st.chat_input("Ask a question about your document"):
        # Add user message
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("user"):
            st.markdown(prompt)

        # Get AI response
        with st.chat_message("assistant"):
            with st.spinner("🤔 Thinking..."):
                # Here you'd call your RAG system
                # result = st.session_state.rag_system.query(prompt)

                # Placeholder response
                response = "This is where the AI response would appear!"
                sources = []

                st.markdown(response)

                # Add assistant message
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "sources": sources
                })

else:
    st.info("👆 Please upload a PDF and enter your API key to start chatting!")

    # Instructions
    with st.expander("📖 How to use"):
        st.markdown("""
        1. **Get API Key**: Visit [Google AI Studio](https://makersuite.google.com/app/apikey)
        2. **Upload PDF**: Choose any PDF document you want to chat with
        3. **Start Chatting**: Ask questions about the document content
        4. **View Sources**: Expand source sections to see where answers come from
        """)

# Footer
st.markdown("---")
st.markdown("Built with ❤️ using LlamaIndex, Gemini, and Streamlit")
'''

    # Save Streamlit code to file
    with open('streamlit_app.py', 'w') as f:
        f.write(streamlit_code)

    print("📱 Streamlit app code saved to 'streamlit_app.py'")
    return streamlit_code

# Create the Streamlit app
streamlit_code = create_streamlit_app()

📱 Streamlit app code saved to 'streamlit_app.py'


In [33]:
def setup_streamlit_with_ngrok():
    """
    Set up Streamlit with ngrok for public access
    """
    print("🌐 Setting up public web interface...")

    # Install pyngrok if not already installed
    try:
        import pyngrok
    except ImportError:
        print("Installing pyngrok...")
        !pip install -q pyngrok
        import pyngrok

    from pyngrok import ngrok
    import threading
    import time

    # Kill any existing ngrok tunnels
    ngrok.kill()

    # Function to run Streamlit
    def run_streamlit():
        !streamlit run streamlit_app.py --server.port 8501 --server.headless true --server.fileWatcherType none --browser.gatherUsageStats false

    # Start Streamlit in background
    threading.Thread(target=run_streamlit, daemon=True).start()

    # Wait a bit for Streamlit to start
    time.sleep(10)

    # Create ngrok tunnel
    try:
        public_url = ngrok.connect(8501, proto="http", bind_tls=True)
        print(f"\n🎉 SUCCESS! Your chatbot is now live at:")
        print(f"🔗 {public_url}")
        print(f"\n📱 Open this URL in any browser to use your chatbot!")
        print(f"💡 This URL will work from any device with internet access")

        return public_url

    except Exception as e:
        print(f"❌ Error setting up ngrok: {e}")
        print("💡 Try running these commands manually in separate cells:")
        print("   1. !streamlit run streamlit_app.py &")
        print("   2. from pyngrok import ngrok; print(ngrok.connect(8501))")
        return None
