## Installation and Setup


In [None]:
# Install required packages
%pip install langchain langchain-community langchain-ibm chromadb pypdf gradio python-dotenv ibm-watson-machine-learning -q


In [None]:
# Import required libraries
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ibm import WatsonxEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_ibm import ChatWatsonx
from langchain.prompts import PromptTemplate
import gradio as gr
from typing import List, Tuple

# Load environment variables (the Configuration cell reads them via os.getenv)
load_dotenv()

# The check mark was stored as mis-decoded UTF-8 bytes; emit the real character.
print("✓ All libraries imported successfully")


## Configuration

**Note**: You'll need to set up your watsonx.ai API credentials. Either create a `.env` file containing the entries below, or set the same names as environment variables:
- `WATSONX_APIKEY=your_api_key`
- `WATSONX_PROJECT_ID=your_project_id`


In [None]:
# Configuration
# Credentials are read from the environment. The placeholder defaults only keep
# this cell runnable without credentials; they are not valid values.
_PLACEHOLDER_APIKEY = "your_api_key_here"
_PLACEHOLDER_PROJECT_ID = "your_project_id_here"

WATSONX_APIKEY = os.getenv("WATSONX_APIKEY", _PLACEHOLDER_APIKEY)
WATSONX_PROJECT_ID = os.getenv("WATSONX_PROJECT_ID", _PLACEHOLDER_PROJECT_ID)
# Regional endpoint; set WATSONX_URL to target a region other than us-south.
WATSONX_ENDPOINT = os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")

# Model configurations
LLM_MODEL = "mistralai/mixtral-8x7b-instruct-v01"
EMBEDDING_MODEL = "sentence-transformers/all-minilm-l6-v2"  # watsonx embedding model

# Surface missing credentials here instead of as an opaque authentication
# failure when the first watsonx call is made.
if WATSONX_APIKEY in ("", _PLACEHOLDER_APIKEY) or WATSONX_PROJECT_ID in ("", _PLACEHOLDER_PROJECT_ID):
    print("Warning: WATSONX_APIKEY and/or WATSONX_PROJECT_ID are not set; watsonx calls will fail until they are.")

print("✓ Configuration loaded")


---
# Task 1: Load Document Using LangChain for Different Sources (10 points)

This task demonstrates loading documents from PDF sources using LangChain's PyPDFLoader.


In [None]:
# Task 1: Load document using LangChain for different sources
# Example: Loading PDF document

def load_pdf_document(pdf_path: str):
    """
    Load a PDF document using LangChain's PyPDFLoader.

    Args:
        pdf_path: Path to the PDF file (local or URL)

    Returns:
        List of Document objects returned by ``PyPDFLoader.load()``
    """
    documents = PyPDFLoader(pdf_path).load()

    # Report a short summary; the page count is the number of Document objects.
    total_characters = sum(len(doc.page_content) for doc in documents)
    print(f"✓ Successfully loaded PDF: {pdf_path}")
    print(f"✓ Number of pages: {len(documents)}")
    print(f"✓ Total characters: {total_characters}")

    return documents

# Example usage - Uncomment and provide your PDF path
# pdf_path = "your_document.pdf"  # Replace with your PDF path
# documents = load_pdf_document(pdf_path)

# Display first page content preview
# if documents:
#     print("\nFirst page preview:")
#     print(documents[0].page_content[:500])

# Status symbols were stored as mis-decoded UTF-8 bytes; emit the real characters.
print("✓ Task 1: PDF loader function created")
print("\n📸 Please take a screenshot of this code cell and save it as 'pdf_loader.png'")


---
# Task 2: Apply Text Splitting Techniques (10 points)

This task demonstrates text splitting to enhance model responsiveness.


In [None]:
# Task 2: Apply text splitting techniques to enhance model responsiveness

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into smaller chunks using RecursiveCharacterTextSplitter.

    Args:
        documents: List of Document objects
        chunk_size: Maximum size of each text chunk
        chunk_overlap: Number of characters to overlap between chunks

    Returns:
        List of split Document objects (may be empty when the input holds no text)
    """
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    # Split the documents
    split_docs = text_splitter.split_documents(documents)

    # Zero chunks is possible (empty input, or pages with no extractable text),
    # so guard the average instead of dividing by zero.
    total_characters = sum(len(doc.page_content) for doc in split_docs)
    average_size = total_characters // len(split_docs) if split_docs else 0

    print(f"✓ Original documents: {len(documents)}")
    print(f"✓ Split into chunks: {len(split_docs)}")
    print(f"✓ Average chunk size: {average_size} characters")
    print(f"✓ Chunk size: {chunk_size}, Overlap: {chunk_overlap}")

    return split_docs

# Example usage
# split_documents_list = split_documents(documents, chunk_size=1000, chunk_overlap=200)

# Status symbols were stored as mis-decoded UTF-8 bytes; emit the real characters.
print("✓ Task 2: Text splitter function created")
print("\n📸 Please take a screenshot of this code cell and save it as 'code_splitter.png'")


---
# Task 3: Embed Documents (10 points)

This task demonstrates embedding documents using watsonx's embedding model.


In [None]:
# Task 3: Embed documents using watsonx's embedding model

def create_embeddings(api_key=None, project_id=None, endpoint=None, model_id=None):
    """
    Create embeddings using watsonx embedding model.

    Args:
        api_key: Watsonx API key (falls back to WATSONX_APIKEY)
        project_id: Watsonx project ID (falls back to WATSONX_PROJECT_ID)
        endpoint: Watsonx API endpoint (falls back to WATSONX_ENDPOINT)
        model_id: Embedding model identifier (falls back to EMBEDDING_MODEL)

    Returns:
        WatsonxEmbeddings object
    """
    # Use the configured constant rather than repeating the model name inline,
    # so the model and the printed summary cannot drift apart.
    embedding_model = model_id or EMBEDDING_MODEL

    # Initialize watsonx embeddings
    # Note: Adjust parameters based on your watsonx.ai setup
    embeddings = WatsonxEmbeddings(
        model_id=embedding_model,
        apikey=api_key or WATSONX_APIKEY,
        project_id=project_id or WATSONX_PROJECT_ID,
        url=endpoint or WATSONX_ENDPOINT
    )

    print("✓ Embeddings model initialized successfully")
    print(f"✓ Model: {embedding_model}")

    return embeddings

# Alternative: If watsonx embeddings are not available, you can use HuggingFace embeddings
# from langchain_community.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize embeddings
# embeddings = create_embeddings()

# Status symbols were stored as mis-decoded UTF-8 bytes; emit the real characters.
print("✓ Task 3: Embedding function created")
print("\n📸 Please take a screenshot of this code cell and save it as 'embedding.png'")


---
# Task 4: Create and Configure Vector Database (10 points)

This task demonstrates creating a Chroma vector database to store document embeddings.


In [None]:
# Task 4: Create and configure Chroma vector database to store embeddings

def create_vector_database(split_documents, embeddings, persist_directory="./chroma_db"):
    """
    Create a Chroma vector database from split documents and embeddings.

    Args:
        split_documents: List of split Document objects
        embeddings: Embeddings model
        persist_directory: Directory to persist the vector database
            (None keeps the store in memory only)

    Returns:
        Chroma vector store object
    """
    # Create Chroma vector database
    vectorstore = Chroma.from_documents(
        documents=split_documents,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    # Chroma >= 0.4 writes to persist_directory automatically and LangChain has
    # deprecated the manual persist() call. Only invoke it when the method still
    # exists and there is a directory to write to; without one it raises.
    if persist_directory is not None and hasattr(vectorstore, "persist"):
        vectorstore.persist()

    print("✓ Vector database created successfully")
    print(f"✓ Number of documents stored: {len(split_documents)}")
    print(f"✓ Persist directory: {persist_directory}")

    return vectorstore

# Example usage
# vectorstore = create_vector_database(split_documents_list, embeddings)

# Status symbols were stored as mis-decoded UTF-8 bytes; emit the real characters.
print("✓ Task 4: Vector database creation function created")
print("\n📸 Please take a screenshot of this code cell and save it as 'vectordb.png'")


---
# Task 5: Develop a Retriever (10 points)

This task demonstrates developing a retriever to fetch document segments based on queries.


In [None]:
# Task 5: Develop a retriever to fetch document segments based on queries

def create_retriever(vectorstore, k=4, search_type="similarity"):
    """
    Create a retriever from the vector database.

    Args:
        vectorstore: Chroma vector store object
        k: Number of documents to retrieve
        search_type: Type of search ("similarity" or "mmr")

    Returns:
        Retriever object produced by ``vectorstore.as_retriever``
    """
    # k is forwarded through search_kwargs, which is how as_retriever receives it.
    retriever = vectorstore.as_retriever(
        search_type=search_type,
        search_kwargs={"k": k}
    )

    print("✓ Retriever created successfully")
    print(f"✓ Retrieval type: {search_type}")
    print(f"✓ Number of documents to retrieve: {k}")

    return retriever

# Example usage
# retriever = create_retriever(vectorstore, k=4)

# Test retrieval with a sample query
# (invoke() is used here, matching the chain calls elsewhere in this notebook;
# get_relevant_documents() is deprecated in current LangChain releases)
# query = "What is the main topic of this paper?"
# retrieved_docs = retriever.invoke(query)
# print(f"\nRetrieved {len(retrieved_docs)} documents for query: '{query}'")

# Status symbols were stored as mis-decoded UTF-8 bytes; emit the real characters.
print("✓ Task 5: Retriever function created")
print("\n📸 Please take a screenshot of this code cell and save it as 'retriever.png'")


---
# Task 6: Construct QA Bot with Gradio Interface (10 points)

This task demonstrates constructing a QA Bot that leverages LangChain and LLM to answer questions from loaded documents.


In [None]:
# Task 6: Construct a QA Bot that leverages LangChain and LLM to answer questions

def create_qa_chain(retriever, llm_model=LLM_MODEL, api_key=None, project_id=None, endpoint=None):
    """
    Create a QA chain using RetrievalQA.

    Args:
        retriever: Retriever object
        llm_model: LLM model identifier
        api_key: Watsonx API key (falls back to WATSONX_APIKEY)
        project_id: Watsonx project ID (falls back to WATSONX_PROJECT_ID)
        endpoint: Watsonx API endpoint (falls back to WATSONX_ENDPOINT)

    Returns:
        RetrievalQA chain object; its results include the source documents
    """
    # Initialize the LLM.
    # The model is selected with `model_id`, the same keyword WatsonxEmbeddings
    # takes in this notebook; `model` is not the field that picks a watsonx.ai
    # foundation model.
    llm = ChatWatsonx(
        model_id=llm_model,
        apikey=api_key or WATSONX_APIKEY,
        project_id=project_id or WATSONX_PROJECT_ID,
        url=endpoint or WATSONX_ENDPOINT,
        params={
            "decoding_method": "sample",
            "max_new_tokens": 1000,
            "temperature": 0.7,
            "top_p": 0.9
        }
    )

    # Create prompt template
    prompt_template = """Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    
    Context: {context}
    
    Question: {question}
    
    Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create QA chain ("stuff" places all retrieved chunks into a single prompt)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True
    )

    print("✓ QA chain created successfully")
    print(f"✓ LLM model: {llm_model}")

    return qa_chain

def qa_function(question, qa_chain):
    """
    Answer a question using the QA chain.

    Args:
        question: User's question
        qa_chain: QA chain object exposing ``invoke({"query": ...})``

    Returns:
        Markdown-formatted answer string. A prompt to enter a question is
        returned for blank input, and an "Error: ..." string if the chain raises.
    """
    # Reject blank input before calling the LLM, as the Gradio handlers in this
    # notebook do.
    if not question or not question.strip():
        return "Please enter a question."

    # This is a UI boundary: report failures as text instead of raising.
    try:
        result = qa_chain.invoke({"query": question})
        answer = result["result"]
        sources = result.get("source_documents", [])
    except Exception as e:
        return f"Error: {str(e)}"

    # Format the response
    response = f"**Answer:**\n{answer}\n\n"
    if sources:
        response += f"**Sources:** {len(sources)} document(s) retrieved\n"

    return response

# Status symbols were stored as mis-decoded UTF-8 bytes; emit the real characters.
print("✓ Task 6: QA chain functions created")
print("\n📸 Please take a screenshot of this code cell and save it as 'QA_bot.png'")


In [None]:
# Create Gradio interface for the QA Bot

def create_gradio_interface(qa_chain):
    """
    Create a Gradio interface for the QA bot.

    Args:
        qa_chain: QA chain object

    Returns:
        Gradio interface object
    """
    def chat_interface(question, history=None):
        """Answer one question; `history` is accepted but unused.

        The Interface below has a single input, so Gradio calls this with one
        argument. `history` must therefore be optional or every call fails.
        """
        if not question or not question.strip():
            return "Please enter a question."

        # UI boundary: show failures in the answer box instead of raising.
        try:
            result = qa_chain.invoke({"query": question})
            return result["result"]
        except Exception as e:
            return f"Error processing your question: {str(e)}"

    # Create Gradio interface
    interface = gr.Interface(
        fn=chat_interface,
        inputs=gr.Textbox(
            label="Question",
            placeholder="Ask a question about the document...",
            lines=3
        ),
        outputs=gr.Textbox(
            label="Answer",
            lines=10
        ),
        title="Quest Analytics - AI RAG Assistant",
        description="Ask questions about the loaded documents. Upload a PDF and ask questions!",
        theme="default"
    )

    return interface

# The check mark was stored as mis-decoded UTF-8 bytes; emit the real character.
print("✓ Gradio interface function created")


## Complete Implementation with File Upload

This cell contains the complete implementation that allows uploading PDFs through the Gradio interface.


In [None]:
# Complete implementation with file upload capability in Gradio
# This allows users to upload PDFs directly through the interface

def create_full_gradio_interface():
    """
    Create a complete Gradio interface with PDF upload and question answering.

    Returns:
        Gradio Blocks app; call ``.launch()`` on it to start serving
    """
    import tempfile

    # Holds the most recently built chain. This dict is shared by every user of
    # the returned app (in a real app, you'd use session state).
    qa_chain_store = {"chain": None, "vectorstore": None}

    def upload_and_process(pdf_file):
        """Build the QA chain for an uploaded PDF; returns (status, cleared answer)."""
        if pdf_file is None:
            return "Please upload a PDF file.", ""

        try:
            # Gradio may hand over a path string or an object with a .name path,
            # depending on version and configuration.
            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

            # Step 1: Load PDF
            documents = load_pdf_document(pdf_path)

            # Step 2: Split documents
            split_docs = split_documents(documents)

            # Step 3: Create embeddings
            embeddings = create_embeddings()

            # Step 4: Create vector database.
            # Each upload gets its own directory. Reusing one fixed directory
            # appends to the existing collection, so chunks from earlier PDFs
            # would be retrieved for the new document.
            persist_directory = tempfile.mkdtemp(prefix="temp_chroma_db_")
            vectorstore = create_vector_database(split_docs, embeddings, persist_directory=persist_directory)

            # Step 5: Create retriever
            retriever = create_retriever(vectorstore, k=4)

            # Step 6: Create QA chain
            qa_chain = create_qa_chain(retriever)

            # Store for later use
            qa_chain_store["chain"] = qa_chain
            qa_chain_store["vectorstore"] = vectorstore

            return f"✓ PDF processed successfully! {len(documents)} pages loaded, {len(split_docs)} chunks created.", ""
        except Exception as e:
            # UI boundary: show the failure in the status box instead of raising.
            return f"Error processing PDF: {str(e)}", ""

    def ask_question(question):
        """Ask a question about the uploaded document"""
        if qa_chain_store["chain"] is None:
            return "Please upload and process a PDF file first."

        if not question or not question.strip():
            return "Please enter a question."

        try:
            result = qa_chain_store["chain"].invoke({"query": question})
            return result["result"]
        except Exception as e:
            return f"Error: {str(e)}"

    with gr.Blocks(title="Quest Analytics - AI RAG Assistant") as demo:
        gr.Markdown("# Quest Analytics - AI RAG Assistant")
        gr.Markdown("Upload a PDF document and ask questions about it.")

        with gr.Row():
            with gr.Column():
                pdf_upload = gr.File(
                    label="Upload PDF Document",
                    file_types=[".pdf"]
                )
                upload_btn = gr.Button("Process PDF", variant="primary")
                upload_status = gr.Textbox(label="Status", interactive=False)

            with gr.Column():
                question_input = gr.Textbox(
                    label="Ask a Question",
                    placeholder="What is this paper talking about?",
                    lines=3
                )
                ask_btn = gr.Button("Ask Question", variant="primary")
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    interactive=False
                )

        # Processing a new PDF also clears any answer left from the previous one.
        upload_btn.click(
            fn=upload_and_process,
            inputs=pdf_upload,
            outputs=[upload_status, answer_output]
        )

        ask_btn.click(
            fn=ask_question,
            inputs=question_input,
            outputs=answer_output
        )

        # Allow Enter key to submit question
        question_input.submit(
            fn=ask_question,
            inputs=question_input,
            outputs=answer_output
        )

    return demo

# The check mark was stored as mis-decoded UTF-8 bytes; emit the real character.
print("✓ Complete Gradio interface with file upload created")
print("\nTo launch the interface, run:")
print("demo = create_full_gradio_interface()")
print("demo.launch(share=True)")


## Complete Workflow - Run All Tasks Together

Run this cell to execute the complete workflow with a PDF file.


In [None]:
# Complete workflow: Run all tasks together
# Uncomment and modify the paths/credentials as needed

# Step 1: Load PDF document
# pdf_path = "your_document.pdf"  # Replace with your PDF path
# documents = load_pdf_document(pdf_path)

# Step 2: Split documents
# split_documents_list = split_documents(documents, chunk_size=1000, chunk_overlap=200)

# Step 3: Create embeddings
# embeddings = create_embeddings()

# Step 4: Create vector database
# vectorstore = create_vector_database(split_documents_list, embeddings)

# Step 5: Create retriever
# retriever = create_retriever(vectorstore, k=4)

# Step 6: Create QA chain
# qa_chain = create_qa_chain(retriever)

# Step 7: Test with a query
# query = "What is this paper talking about?"
# result = qa_chain.invoke({"query": query})
# print("Question:", query)
# print("Answer:", result["result"])

# The check mark was stored as mis-decoded UTF-8 bytes; emit the real character.
print("✓ Complete workflow code prepared")
print("\nUncomment the code above and provide your PDF path to run the complete workflow")


## Launch the Gradio Interface

Run this cell to launch the interactive Gradio interface for Task 6.


In [None]:
# Launch the Gradio interface
# Uncomment to run
# NOTE: share=True requests a public Gradio link and server_name="0.0.0.0"
# listens on all network interfaces; drop both to keep the app local-only.

# demo = create_full_gradio_interface()
# demo.launch(share=True, server_name="0.0.0.0", server_port=7860)

# Status symbols were stored as mis-decoded UTF-8 bytes; emit the real characters.
print("✓ Ready to launch interface")
print("\n📸 For Task 6, take a screenshot of the Gradio interface showing:")
print("  1. A PDF file uploaded")
print("  2. The query 'What is this paper talking about?' entered")
print("  3. The answer displayed")
print("\nSave the screenshot as 'QA_bot.png'")
