1. Setting Up the Document Processing Pipeline
In this step, we will load documents (PDFs, text files, and web pages), chunk the text, clean it, and prepare it for embedding.

Import Required Libraries

In [1]:
!pip install openai
!pip install chromadb
!pip install PyMuPDF
!pip install requests
!pip install beautifulsoup4
!pip install transformers
!pip install gradio
!pip install nltk



In [2]:
pip install openai chromadb PyMuPDF requests beautifulsoup4 transformers gradio nltk


Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anaha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import openai
import chromadb
import fitz  # PyMuPDF
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModel
import gradio as gr
import re
import nltk

nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anaha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Document Loaders
Let's implement loaders for different document types (PDF, web scraping, and text files).

In [5]:
# Load PDF
def load_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages, concatenated in page order.
    """
    # Context manager guarantees the document handle is released even when
    # text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )

# Load from URL (Web scraping)
def load_url(url, timeout=10):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on an unresponsive host.

    Returns:
        str: The text extracted from the HTML, or "" when the request fails
        (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error loading URL: {str(e)}")
        return ""

    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text()

# Load text files
def load_text(file_path, encoding="utf-8"):
    """Read a whole text file into a string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        str: The file contents.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()


Text Chunking
To manage long texts, we split documents into chunks with overlap to preserve context.

In [6]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into sentence-aligned chunks that share a small overlap.

    Sentences are accumulated until their combined length reaches
    ``chunk_size`` characters; the chunk is then emitted and the trailing
    sentences that fit within ``overlap`` characters are carried into the next
    chunk to preserve context.

    Args:
        text: The text to split.
        chunk_size: Minimum number of characters (sum of sentence lengths)
            that closes a chunk.
        overlap: Maximum number of characters carried over from the end of one
            chunk into the next. Measured in characters, the same unit as
            ``chunk_size``; values of ``chunk_size`` or more are capped so the
            window always advances.

    Returns:
        list[str]: The chunks, each one a space-joined run of sentences.
    """
    sentences = sent_tokenize(text)
    # An overlap as large as the chunk itself would stop the window advancing.
    overlap = max(0, min(overlap, chunk_size - 1))

    chunks = []
    current_chunk = []
    current_length = 0
    has_new_sentence = False  # True once the buffer holds more than carried-over text

    for sentence in sentences:
        current_chunk.append(sentence)
        current_length += len(sentence)
        has_new_sentence = True

        if current_length >= chunk_size:
            chunks.append(" ".join(current_chunk))

            # Walk backwards collecting whole sentences until the character
            # budget for the overlap is used up.
            carried = []
            carried_length = 0
            for previous in reversed(current_chunk):
                if carried_length + len(previous) > overlap:
                    break
                carried.append(previous)
                carried_length += len(previous)
            carried.reverse()

            current_chunk = carried
            current_length = carried_length
            has_new_sentence = False

    # Do not emit a tail that only repeats the overlap of the previous chunk.
    if current_chunk and has_new_sentence:
        chunks.append(" ".join(current_chunk))
    return chunks


Text Cleaning


In [7]:
def clean_text(text):
    """Normalize text for embedding.

    Every run of whitespace becomes a single space, then every run of
    non-ASCII characters is deleted, and finally leading/trailing whitespace is
    trimmed. Because non-ASCII removal happens after whitespace collapsing, a
    removed character that sat between two spaces leaves both spaces behind.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', single_spaced)
    return ascii_only.strip()


# 2. Embedding System
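This part converts text into vectors with a pre-trained embedding model. Either an API model (OpenAI) or a local transformer model (such as MPNet or BERT) can be used. This section loads a local MPNet model through the `transformers` library; a later cell shows the OpenAI embedding API as an alternative.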

Load the Embedding Model

In [8]:
# Local embedding model, loaded through the transformers library.
# `model_name` is the checkpoint identifier handed to both `from_pretrained`
# calls so the tokenizer and the encoder come from the same checkpoint.
# `tokenizer` and `model` are module-level names read by `embed_text` below.
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to embed text using transformers model
import torch

def embed_text(text):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The token embeddings of the last hidden layer are averaged, weighting each
    position by the attention mask so padding tokens do not dilute the result.

    Args:
        text: The string to embed (anything the tokenizer accepts).

    Returns:
        numpy.ndarray: The pooled embedding flattened to one dimension.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    # clamp guards against division by zero for an all-padding row.
    pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.detach().numpy().flatten()


# Vector Database Setup
We’ll use chromadb as the vector database for storing and retrieving document embeddings.

In [9]:
# Initialize Chroma client
client = chromadb.Client()

# get_or_create_collection returns the existing collection when the cell is
# re-run, so no exception type has to be caught to make the cell idempotent.
collection = client.get_or_create_collection("financial_docs")

# Add documents to Chroma
def add_to_chroma(text, doc_id, metadata=None):
    """Embed a piece of text and store it in the module-level ``collection``.

    Args:
        text: The document (or chunk) to store.
        doc_id: Unique identifier for the entry.
        metadata: Optional dict stored alongside the document so it can be
            used for filtering at retrieval time.
    """
    embedding = embed_text(text)
    # Store a plain list of floats rather than a numpy array.
    vector = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)

    record = {
        "documents": [text],
        "embeddings": [vector],
        "ids": [doc_id],
    }
    if metadata:
        record["metadatas"] = [metadata]
    collection.add(**record)


# 3. Retrieval and Chatbot Interface

Finally, we will set up a Gradio interface that allows users to upload documents, query them, and receive responses generated from a RAG-based model.

# Retrieval System Code
Import Libraries

In [10]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


# Query Processing
- Generate Query Embedding: Converts the user’s query into an embedding for
similarity comparisons.
- Query Expansion (Optional): Expands the query by adding relevant terms to increase search robustness.

In [11]:
# Function to clean and embed the query
def generate_query_embedding(query):
    """Return the embedding of ``query`` after normalizing it with ``clean_text``."""
    return embed_text(clean_text(query))

# Optional query expansion
def expand_query(query, expanded_terms=None):
    """Append related terms to a query to broaden the search.

    Args:
        query: The original query string.
        expanded_terms: Iterable of terms to append. Defaults to the example
            finance terms ``["investment", "stock", "market"]``.

    Returns:
        str: The query followed by the terms, space separated; the query
        unchanged when there are no terms to add.
    """
    if expanded_terms is None:
        expanded_terms = ["investment", "stock", "market"]  # Example expansions
    expanded_terms = list(expanded_terms)
    if not expanded_terms:
        return query
    return query + " " + " ".join(expanded_terms)


# Similarity Search
Uses cosine similarity (or optionally other distance metrics) to find documents similar to the query based on embeddings.

In [12]:
# Function to calculate similarity scores
def similarity_search(query_embedding, document_embeddings, method="cosine"):
    """Score every document embedding against the query embedding.

    Higher scores mean "more similar" for every method: cosine similarity,
    negated Euclidean distance, or raw dot product.

    Raises:
        ValueError: If ``method`` is not "cosine", "euclidean" or "dot".
    """
    if method == "dot":
        return np.dot(document_embeddings, query_embedding)

    if method == "euclidean":
        distances = euclidean_distances([query_embedding], document_embeddings)
        return -distances[0]  # negate so that closer documents rank higher

    if method == "cosine":
        scores = cosine_similarity([query_embedding], document_embeddings)
        return scores[0]

    raise ValueError("Unsupported similarity method.")


# Filtering & Ranking
- Metadata Filtering: Filters results based on metadata (e.g., type, date).
- Ranking: Sorts results by similarity score to return the top-k relevant documents

In [13]:
# Filter based on metadata (e.g., type, author)
def filter_documents(documents, metadata, filter_criteria):
    """Keep the documents whose metadata matches every filter criterion.

    ``documents`` and ``metadata`` are walked in parallel; a document is kept
    when, for each key/value pair in ``filter_criteria``, its metadata dict
    holds an equal value under that key. An empty criteria dict keeps all.
    """
    def satisfies(meta):
        for key, wanted in filter_criteria.items():
            if not (meta.get(key) == wanted):
                return False
        return True

    return [doc for doc, meta in zip(documents, metadata) if satisfies(meta)]

# Rank documents based on similarity score
def rank_documents(documents, similarities, top_k=3):
    """Return the ``top_k`` documents with the highest scores, best first.

    Returns:
        tuple: ``(ranked_docs, ranked_similarities)``, two parallel lists.
    """
    best_first = np.argsort(similarities)[::-1][:top_k]

    ranked_docs = []
    ranked_similarities = []
    for position in best_first:
        ranked_docs.append(documents[position])
        ranked_similarities.append(similarities[position])
    return ranked_docs, ranked_similarities


# Retrieval and Ranking Pipeline



In [14]:
# Main retrieval function
def retrieve_and_rank_documents(query, top_k=3, similarity_method="cosine", filter_criteria=None):
    """Retrieve the stored documents most similar to a query.

    Args:
        query: The user's question.
        top_k: Maximum number of documents to return.
        similarity_method: Metric name forwarded to ``similarity_search``.
        filter_criteria: Optional dict; only documents whose metadata matches
            every key/value pair are considered.

    Returns:
        dict: ``{"documents": [...], "scores": [...]}``, best match first.
        Both lists are empty when nothing is stored or nothing passes the
        filter.
    """
    # Step 1: Embed the query
    query_embedding = generate_query_embedding(query)

    # Step 2: Fetch documents, embeddings and metadata in one call so the
    # three sequences are guaranteed to be aligned.
    stored = collection.get(include=["embeddings", "documents", "metadatas"])
    documents = stored["documents"] or []
    embeddings = stored["embeddings"]
    if embeddings is None:
        embeddings = []
    metadata = stored["metadatas"] or [None] * len(documents)

    # Step 3: Filter BEFORE scoring. Filtering only the documents afterwards
    # would leave them out of step with the similarity scores.
    candidates = [
        (doc, emb)
        for doc, emb, meta in zip(documents, embeddings, metadata)
        if not filter_criteria
        or all((meta or {}).get(k) == v for k, v in filter_criteria.items())
    ]
    if not candidates:
        return {"documents": [], "scores": []}

    candidate_docs = [doc for doc, _ in candidates]
    candidate_embeddings = np.asarray([emb for _, emb in candidates])

    # Step 4: Score and rank the remaining candidates
    similarities = similarity_search(query_embedding, candidate_embeddings, method=similarity_method)
    ranked_docs, ranked_similarities = rank_documents(candidate_docs, similarities, top_k=top_k)

    # Step 5: Format results
    return {
        "documents": ranked_docs,
        "scores": ranked_similarities
    }


# 4 Generation System

Import Libraries



In [15]:
import openai


Set up the OpenAI API Key


In [16]:
import os

# Read the key from the environment so it is never saved in the notebook.
# Falls back to the empty string (the previous value) when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")


# Context Construction
1. Prompt Templates: Define templates to structure prompts consistently.

2. Context Management: Manage prompt length and context to fit within token limits.

3. History Handling: Use chat history for continuity in multi-turn conversations.

In [17]:
# Prompt skeleton shared by every request; the `{context}` and `{query}`
# placeholders are filled in by construct_prompt.
BASE_PROMPT_TEMPLATE = """
You are a financial assistant chatbot helping with investment-related questions. Use the following context to answer:

Context:
{context}

Question:
{query}

Response:
"""

# Function to construct the prompt with context and query
def construct_prompt(context, query):
    """Fill BASE_PROMPT_TEMPLATE with the given context and question."""
    fields = {"context": context, "query": query}
    return BASE_PROMPT_TEMPLATE.format(**fields)

# History management (optional for multi-turn conversations)
def update_context(history, max_tokens=200):
    """Join the chat history and keep only its most recent part.

    The budget is counted in whitespace-separated words, a rough stand-in for
    model tokens, so the result never starts in the middle of a word.

    Args:
        history: Sequence of strings, oldest first.
        max_tokens: Maximum number of words to keep from the end of the joined
            history. Zero or less yields an empty string.

    Returns:
        str: The tail of the joined history, original spacing preserved.
    """
    full_context = " ".join(history)
    if max_tokens <= 0:
        return ""

    words = list(re.finditer(r"\S+", full_context))
    if len(words) <= max_tokens:
        return full_context
    # Cut at the start of the first word that still fits in the budget.
    return full_context[words[-max_tokens].start():]


# Language Model Setup
API-based models: Here, GPT-based models are used to generate responses based on context and query.

In [18]:
# Function to generate a response using GPT-3.5 or GPT-4
def generate_response(prompt, model="gpt-3.5-turbo", temperature=0.7, top_p=0.9, max_tokens=150):
    """Send a single-turn prompt to the chat completions API.

    Uses the openai>=1.0 interface; the legacy ``openai.ChatCompletion`` class
    was removed in that release.

    Args:
        prompt: Full prompt text, sent as one user message.
        model: Chat model name.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        str: The text of the first completion choice.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )
    return response.choices[0].message.content


# Response Generation
Controls for temperature, sampling, and beam search:

- Temperature: Adjusts randomness in responses (0.0 = deterministic, 1.0 = creative).
- Top-k/Top-p Sampling: For diverse and meaningful outputs (the code below exposes `top_p`).
- Beam Search: Generally more complex and used in models that support it, here replaced with sampling.

In [19]:
def get_response(context, query, history=None, temperature=0.7, top_p=0.9):
    """Build a prompt from context (plus optional history) and generate a reply.

    Args:
        context: Background text for the model, e.g. retrieved documents.
        query: The user's question.
        history: Optional list of earlier exchanges. When given, its recent
            part is placed in front of ``context`` and the new exchange is
            appended to it in place.
        temperature: Sampling temperature forwarded to ``generate_response``.
        top_p: Nucleus-sampling cutoff forwarded to ``generate_response``.

    Returns:
        str: The generated reply.
    """
    # Step 1: Manage Context Window. History is added to the supplied context;
    # replacing it would throw away the retrieved documents.
    if history:
        history_context = update_context(history)
        context = f"{history_context}\n\n{context}" if context else history_context

    # Step 2: Construct the Prompt
    prompt = construct_prompt(context, query)

    # Step 3: Generate the Response
    response = generate_response(prompt, temperature=temperature, top_p=top_p)

    # Record the exchange whenever the caller supplied a history list,
    # including an empty one.
    if history is not None:
        history.append(f"User: {query}\nBot: {response}")

    return response


# Full Retrieval and Generation Pipeline Integration
Combine the retrieval and generation stages to build a cohesive chatbot function.

In [20]:
# Main chatbot function that integrates retrieval and generation
def chatbot_pipeline(query, top_k=3, similarity_method="cosine", filter_criteria=None, temperature=0.7, top_p=0.9):
    """Answer a query from the top retrieved documents.

    The documents returned by ``retrieve_and_rank_documents`` are joined with
    spaces and passed to ``get_response`` as the context.
    """
    retrieved = retrieve_and_rank_documents(
        query,
        top_k=top_k,
        similarity_method=similarity_method,
        filter_criteria=filter_criteria,
    )
    joined_context = " ".join(retrieved["documents"])
    return get_response(joined_context, query, temperature=temperature, top_p=top_p)


# 5 Additional Components Code
**Memory Systems**
A memory system allows the chatbot to retain useful information across conversations or provide context based on past interactions:

**Short-term Memory:** Used to retain recent conversation history for coherent multi-turn interactions.

**Long-term Memory**: Stores knowledge across sessions, helping the bot recall important details or user preferences.

In [21]:
# Memory to store short-term conversation history
class MemorySystem:
    """Conversation memory with a bounded short-term and an unbounded long-term list."""

    def __init__(self):
        self.short_term_memory = []  # most recent exchanges, oldest first
        self.long_term_memory = []   # everything passed to store_long_term

    def update_short_term(self, conversation, max_length=5):
        """Append an exchange and keep only the newest ``max_length`` entries.

        All surplus entries are dropped, not just one, so the bound also holds
        when ``max_length`` is smaller than on an earlier call.
        """
        self.short_term_memory.append(conversation)
        excess = len(self.short_term_memory) - max(max_length, 0)
        if excess > 0:
            del self.short_term_memory[:excess]  # remove the oldest entries

    def store_long_term(self, info):
        """Append ``info`` to the long-term list."""
        self.long_term_memory.append(info)

    def get_short_term_context(self):
        """Return the short-term entries joined with single spaces."""
        return " ".join(self.short_term_memory)

    def clear_short_term_memory(self):
        """Forget all short-term entries."""
        self.short_term_memory = []


# Evaluation Metrics
Tracking the chatbot's performance by evaluating response relevance, accuracy, and retrieval precision can help fine-tune the model. Here, a simplified logging system can track the quality of interactions.

In [22]:
# Function to evaluate responses
class EvaluationMetrics:
    """In-memory log of scored interactions with simple averaging."""

    def __init__(self):
        self.logs = []

    def log_interaction(self, query, response, relevance_score, correctness_score):
        """Record one interaction together with its two scores."""
        entry = dict(
            query=query,
            response=response,
            relevance=relevance_score,
            correctness=correctness_score,
        )
        self.logs.append(entry)

    def calculate_average_scores(self):
        """Return the mean relevance and correctness; both are 0 when nothing is logged."""
        if not self.logs:
            return {"avg_relevance": 0, "avg_correctness": 0}

        count = len(self.logs)
        relevance_total = sum(entry['relevance'] for entry in self.logs)
        correctness_total = sum(entry['correctness'] for entry in self.logs)
        return {
            "avg_relevance": relevance_total / count,
            "avg_correctness": correctness_total / count,
        }


**Relevance:** Assesses how well the response aligns with the query.

---

**Correctness**:  Scores the factual accuracy of responses.
This scoring can be done manually for initial interactions or automated with user feedback later.

# Monitoring & Logging
Logging the chatbot’s interactions and performance helps with troubleshooting and improving response quality.

In [23]:
import logging

# Configure the logging system: records of level INFO and above are written to
# 'chatbot.log' (relative to the working directory), prefixed with a timestamp.
# NOTE(review): later cells call logging.basicConfig again with other settings.
# basicConfig does nothing once the root logger has handlers, so in a single
# kernel session this file configuration presumably stays in force - confirm
# that is intended.
logging.basicConfig(filename='chatbot.log', level=logging.INFO, format='%(asctime)s - %(message)s')

class MonitoringSystem:
    """Thin wrapper that writes chatbot events to the root logger."""

    def log_performance(self, query, response, relevance, correctness):
        """Log a scored interaction at INFO level."""
        message = f"Query: {query}, Response: {response}, Relevance: {relevance}, Correctness: {correctness}"
        logging.log(logging.INFO, message)

    def log_error(self, error_msg):
        """Log an error message at ERROR level."""
        message = f"Error: {error_msg}"
        logging.log(logging.ERROR, message)

    def log_usage(self, query, user_info):
        """Log which user asked which query at INFO level."""
        message = f"User: {user_info}, Query: {query}"
        logging.log(logging.INFO, message)


- **Performance Tracking:** Logs relevance and correctness for model improvement.
- **Error Handling:** Captures errors to help refine error-handling mechanisms.
- **Usage Analytics:** Tracks user queries and metadata for usage trends.

# Integrating Additional Components
Now, integrate these components into the chatbot_pipeline:

In [24]:
# Instantiate the memory, evaluation, and monitoring systems.
# These are module-level singletons: chatbot_pipeline below reads and updates
# them, so their state persists across calls until the cell is re-run.
memory_system = MemorySystem()
evaluation_metrics = EvaluationMetrics()
monitoring_system = MonitoringSystem()

def chatbot_pipeline(query, user_info=None, top_k=3, similarity_method="cosine", filter_criteria=None, temperature=0.7, top_p=0.9):
    """Answer a query using retrieval, short-term memory, evaluation and logging.

    The reply is generated from the short-term memory text followed by the
    retrieved documents. The interaction is then scored (placeholder scores),
    logged, and stored in short-term memory. Any exception is logged through
    ``monitoring_system`` and turned into a fixed apology string.
    """
    try:
        # Retrieval
        retrieved = retrieve_and_rank_documents(
            query,
            top_k=top_k,
            similarity_method=similarity_method,
            filter_criteria=filter_criteria,
        )
        retrieved_context = " ".join(retrieved["documents"])

        # Generation: recent conversation first, retrieved text second
        recent_turns = memory_system.get_short_term_context()
        response = get_response(
            f"{recent_turns} {retrieved_context}",
            query,
            temperature=temperature,
            top_p=top_p,
        )

        # Evaluation and logging
        relevance_score = correctness_score = 1.0  # Placeholder values; replace with actual scoring logic
        evaluation_metrics.log_interaction(query, response, relevance_score, correctness_score)
        monitoring_system.log_performance(query, response, relevance_score, correctness_score)
        if user_info:
            monitoring_system.log_usage(query, user_info)

        # Remember this exchange for the next call
        memory_system.update_short_term(f"User: {query}\nBot: {response}")
        return response

    except Exception as e:
        monitoring_system.log_error(str(e))
        return "I'm sorry, but there was an error processing your request."


# **6 - Full Gradio Interface Code**
The interface will allow users to interact with the chatbot, with Gradio managing the user input and displaying the generated responses.

In [25]:
import traceback

def chatbot_interface(input_text):
    """Run the chatbot on ``input_text`` and never raise.

    Delegates to ``chatbot_pipeline``. The placeholder name used before was
    never defined, so every call ended in the error branch.

    Returns:
        str: The chatbot's reply, or a fixed apology when an exception occurs
        (the traceback is printed).
    """
    try:
        return chatbot_pipeline(input_text)
    except Exception as e:
        error_message = f"An error occurred: {str(e)}\n{traceback.format_exc()}"
        print(error_message)  # Logs the detailed error traceback
        return "I'm sorry, but there was an error processing your request."



In [26]:
import gradio as gr

# Define Gradio interface function
def gradio_chatbot_interface(user_query):
    """Gradio callback: forward the text box content to ``chatbot_pipeline``."""
    # Replace "User123" with actual user info if available
    return chatbot_pipeline(query=user_query, user_info="User123")

# Define the Gradio interface layout: a two-column row with the query box and
# submit button on the left and a read-only response box on the right.
with gr.Blocks() as demo:
    gr.Markdown("## RAG-Based Financial Chatbot")

    with gr.Row():
        with gr.Column():
            user_query = gr.Textbox(label="Your Query", placeholder="Ask a question about financial data...")
            submit_button = gr.Button("Submit")

        with gr.Column():
            # interactive=False: the user cannot type into the response box
            bot_response = gr.Textbox(label="Chatbot Response", placeholder="The bot's response will appear here...", interactive=False)

    # Define button interaction with the function: clicking Submit passes the
    # query text to gradio_chatbot_interface and shows its return value.
    submit_button.click(fn=gradio_chatbot_interface, inputs=user_query, outputs=bot_response)

# Launch the interface
# NOTE(review): share=True asks Gradio for a share link in addition to the
# local URL (the recorded output shows that attempt failing) - confirm that
# exposing this app outside the machine is intended.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [27]:
!pip uninstall -y huggingface_hub sentence-transformers


Found existing installation: huggingface-hub 0.26.3
Uninstalling huggingface-hub-0.26.3:
  Successfully uninstalled huggingface-hub-0.26.3
Found existing installation: sentence-transformers 3.3.1
Uninstalling sentence-transformers-3.3.1:
  Successfully uninstalled sentence-transformers-3.3.1


In [28]:
  pip install openai chromadb PyMuPDF requests beautifulsoup4 transformers gradio nltk torch scikit-learn sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Using cached huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Using cached sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Using cached huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
Installing collected packages: huggingface-hub, sentence-transformers
Successfully installed huggingface-hub-0.26.3 sentence-transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [29]:
pip install openai==1.12.0 gradio

Note: you may need to restart the kernel to use updated packages.


In [30]:
import gradio as gr
from openai import OpenAI
import logging
import traceback

# Configure logging: DEBUG and above, with timestamp and level in each line.
# NOTE(review): an earlier cell already calls logging.basicConfig; when both
# run in the same kernel this call is presumably a no-op because the root
# logger already has a handler - confirm, or pass force=True if this
# configuration should win.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize OpenAI client
import os

# The key is read from the environment so it is never saved in the notebook;
# an unset variable yields the empty string, as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

class ChatHistory:
    """Bounded list of (query, response) pairs, oldest first."""

    def __init__(self, max_history=10):
        self.max_history = max_history
        self.history = []

    def add_interaction(self, query, response):
        """Append a pair; drop the oldest one when the limit is exceeded."""
        self.history.append((query, response))
        if len(self.history) > self.max_history:
            del self.history[0]

    def get_history(self):
        """Return the underlying list itself (not a copy)."""
        return self.history

    def clear_history(self):
        """Start a fresh, empty history list."""
        self.history = []

def generate_response(prompt, model="gpt-3.5-turbo", temperature=0.7):
    """Ask the chat model a single question as a financial assistant.

    Sends a fixed system message followed by ``prompt`` as the user message
    and returns the text of the first choice. Any exception is logged and
    re-raised to the caller.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful financial assistant. Provide clear and concise answers to financial questions.",
    }
    user_message = {"role": "user", "content": prompt}

    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[system_message, user_message],
            temperature=temperature,
            max_tokens=150
        )
        return completion.choices[0].message.content
    except Exception as e:
        logging.error(f"Error generating response: {str(e)}")
        raise

# Initialize chat history: a single module-level ChatHistory (default limit of
# 10 exchanges) that the handlers inside build_gradio_interface read and update.
chat_history = ChatHistory()

def build_gradio_interface():
    """Build the chat UI, wire up its handlers and launch it.

    Layout: question box with Submit/Clear buttons on the left; conversation
    view and a read-only status box on the right. The conversation shown is
    the module-level ``chat_history``.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Financial Assistant")

        with gr.Row():
            with gr.Column(scale=2):
                query = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask any financial question...",
                    lines=3
                )

                submit_btn = gr.Button("Submit", variant="primary")
                clear_btn = gr.Button("Clear")

            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Conversation")
                response_status = gr.Textbox(
                    label="Status",
                    placeholder="Ready for your questions...",
                    interactive=False
                )

        def process_query(user_query):
            """Answer one question; returns (conversation, status text)."""
            # Do not spend an API call on an empty question.
            if not user_query or not user_query.strip():
                return chat_history.get_history(), "Please enter a question."

            try:
                logging.debug(f"Processing query: {user_query}")

                # Generate response
                response = generate_response(user_query)

                # Update chat history
                chat_history.add_interaction(user_query, response)
                return chat_history.get_history(), "Response generated successfully"

            except Exception as e:
                # Full traceback goes to the log only; the UI gets a short message.
                logging.error(f"An error occurred: {str(e)}\n{traceback.format_exc()}")
                failed_turn = (
                    user_query,
                    "I apologize, but I encountered an error processing your request. Please try again."
                )
                # Keep the earlier conversation visible; the failed turn is
                # shown but not stored in the history.
                return chat_history.get_history() + [failed_turn], f"An error occurred: {str(e)}"

        def clear_outputs():
            """Forget the conversation and reset both output widgets."""
            chat_history.clear_history()
            return None, "Ready for your questions..."

        submit_btn.click(
            process_query,
            inputs=[query],
            outputs=[chatbot, response_status]
        )

        clear_btn.click(
            clear_outputs,
            outputs=[chatbot, response_status]
        )

    demo.launch(share=True)

# Main execution: build and launch the UI when this code runs as the main
# module (the recorded output below shows it running when the cell executes).
if __name__ == "__main__":
    build_gradio_interface()



* Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


In [31]:
pip install openai==1.12.0 gradio PyMuPDF nltk

Note: you may need to restart the kernel to use updated packages.


In [32]:
# Instead of the GUI downloader, directly download only the required package
import nltk
import ssl

# Download only the punkt package, and only when it is not installed yet
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # First try a normal download with certificate verification.
    if not nltk.download('punkt', quiet=True):
        # Fallback for machines with a broken certificate store: disable
        # verification for this one download only, then restore the default so
        # other HTTPS traffic in the process stays verified.
        _unverified_context = getattr(ssl, "_create_unverified_context", None)
        if _unverified_context is not None:
            _default_https_context = ssl._create_default_https_context
            ssl._create_default_https_context = _unverified_context
            try:
                nltk.download('punkt', quiet=True)
            finally:
                ssl._create_default_https_context = _default_https_context

WITH IMAGE AND WEBSITE

In [33]:
pip install Pillow pytesseract beautifulsoup4 validators

Note: you may need to restart the kernel to use updated packages.


In [34]:
pip install openai==1.12.0 gradio PyMuPDF nltk Pillow pytesseract beautifulsoup4 validators

Note: you may need to restart the kernel to use updated packages.


PDF - TEXT - URL --> INPUT 

OpenAI ADA embeddings

In [None]:
import gradio as gr 
from openai import OpenAI
import logging
import traceback
import fitz  # PyMuPDF
import re
import nltk
import ssl
from nltk.tokenize import sent_tokenize
import requests
from bs4 import BeautifulSoup
import validators
import chromadb
from chromadb.config import Settings

# Download required NLTK data (only when it is not installed yet)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # First try a normal download with certificate verification.
    if not nltk.download('punkt'):
        # SSL fallback for NLTK downloads on machines with a broken certificate
        # store: disable verification for this one download only, then restore
        # the default so other HTTPS traffic in the process stays verified.
        _unverified_context = getattr(ssl, "_create_unverified_context", None)
        if _unverified_context is not None:
            _default_https_context = ssl._create_default_https_context
            ssl._create_default_https_context = _unverified_context
            try:
                nltk.download('punkt')
            finally:
                ssl._create_default_https_context = _default_https_context

# Configure logging: DEBUG and above, with timestamp and level in each line.
# NOTE(review): if an earlier cell in the same kernel already configured the
# root logger, this call is presumably ignored - confirm, or pass force=True.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize OpenAI client
import os

# The key is read from the environment so it is never saved in the notebook;
# an unset variable yields the empty string, as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# Singleton pattern for ChromaDB client
chroma_client = None
collection = None

def get_chroma_client():
    """Return the shared ChromaDB client and collection, creating them on first use.

    The vectors are persisted under ``./chroma_db`` and stored in the
    "financial_data" collection.

    Returns:
        tuple: ``(chroma_client, collection)``.

    Raises:
        Exception: Whatever ChromaDB raises during initialization; it is
            logged and re-raised.
    """
    global chroma_client, collection
    if chroma_client is None or collection is None:
        try:
            # PersistentClient replaces the legacy
            # Settings(chroma_db_impl="duckdb+parquet", ...) configuration,
            # which current chromadb releases reject.
            new_client = chromadb.PersistentClient(path="./chroma_db")  # Path to store vectors
            new_collection = new_client.get_or_create_collection(name="financial_data")
        except Exception as e:
            logging.error(f"Error initializing ChromaDB: {str(e)}")
            raise  # Raise the error to stop execution if ChromaDB fails to initialize
        # Publish both globals together so a failure never leaves a client
        # cached without its collection.
        chroma_client, collection = new_client, new_collection
    return chroma_client, collection

class ChatHistory:
    """Bounded list of (query, response) pairs plus a free-form context string."""

    def __init__(self, max_history=10):
        self.current_context = ""
        self.max_history = max_history
        self.history = []

    def add_interaction(self, query, response):
        """Append a pair; drop the oldest one when the limit is exceeded."""
        self.history.append((query, response))
        if len(self.history) > self.max_history:
            del self.history[0]

    def get_history(self):
        """Return the underlying list itself (not a copy)."""
        return self.history

    def clear_history(self):
        """Start a fresh, empty history and reset the context string."""
        self.history = []
        self.current_context = ""

    def set_context(self, context):
        """Replace the stored context string."""
        self.current_context = context

def clean_text(text):
    """Clean and normalize text.

    Empty or missing input yields "". Otherwise every whitespace run becomes a
    single space, every run of non-ASCII characters is removed, and the ends
    are trimmed.
    """
    if text:
        single_spaced = re.sub(r'\s+', ' ', text)
        ascii_only = re.sub(r'[^\x00-\x7F]+', '', single_spaced)
        return ascii_only.strip()
    return ""

def _pack_by_length(units, chunk_size):
    """Group text units into space-joined chunks of at least ``chunk_size`` characters."""
    chunks = []
    current = []
    current_length = 0
    for unit in units:
        current.append(unit)
        current_length += len(unit)
        if current_length >= chunk_size:
            chunks.append(" ".join(current))
            current = []
            current_length = 0
    if current:
        chunks.append(" ".join(current))
    return chunks


def chunk_text(text, chunk_size=1000):
    """Split text into chunks.

    Sentences are accumulated until their combined length reaches
    ``chunk_size`` characters. If sentence tokenization fails, the text is
    split on whitespace instead and the words are packed with the same
    character budget, so ``chunk_size`` means characters on both paths.

    Args:
        text: The text to split; empty or missing input yields ``[]``.
        chunk_size: Minimum number of characters that closes a chunk.

    Returns:
        list[str]: The chunks in document order.
    """
    if not text:
        return []
    try:
        units = sent_tokenize(text)
    except Exception as e:
        logging.error(f"Error in chunk_text: {str(e)}")
        units = text.split()
    return _pack_by_length(units, chunk_size)

def extract_text_from_pdf(file_path):
    """Extract text from PDF file.

    Args:
        file_path: Path of the PDF; empty or missing input yields "".

    Returns:
        str: The cleaned text of all pages in order, or "" when the file
        cannot be processed (the error is logged).
    """
    if not file_path:
        return ""
    try:
        # Context manager closes the document even when a page fails to
        # extract; a plain close() after the loop would be skipped.
        with fitz.open(file_path) as doc:
            text = "".join(
                doc.load_page(page_num).get_text()
                for page_num in range(doc.page_count)
            )
        return clean_text(text)
    except Exception as e:
        logging.error(f"Error processing PDF: {str(e)}")
        return ""

def extract_text_from_url(url, timeout=10):
    """Extract text from website URL.

    Args:
        url: Address of the page; empty, blank or missing input yields "".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The cleaned page text with script, style and page-chrome elements
        removed, or "" when the URL is invalid or the request fails (the error
        is logged).
    """
    if not url or not url.strip():
        return ""
    # Strip before validating so a URL pasted with surrounding spaces is accepted.
    url = url.strip()
    try:
        # Validate URL
        if not validators.url(url):
            raise ValueError("Invalid URL format")

        # Add headers to mimic a browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Fetch webpage content with timeout and headers
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()

        # Get text content
        return clean_text(soup.get_text())
    except Exception as e:
        logging.error(f"Error processing URL: {str(e)}")
        return ""

def generate_embeddings(text):
    """Generate embeddings using OpenAI ADA model.

    Args:
        text: The text to embed.

    Returns:
        list[float] | None: The embedding vector, or None when ``text`` is
        empty or the API call fails (the error is logged).
    """
    if not text:
        logging.error("Error generating embeddings: empty input")
        return None
    try:
        response = client.embeddings.create(model="text-embedding-ada-002", input=text)
        # The v1 client returns typed objects, not dicts; subscripting them
        # raised and made every call return None.
        return response.data[0].embedding
    except Exception as e:
        logging.error(f"Error generating embeddings: {str(e)}")
        return None

def store_embeddings(text, source):
    """Store embeddings in ChromaDB.

    Args:
        text: The document (or chunk) to store.
        source: Label saved as the ``source`` metadata field and used as the
            prefix of the generated id.
    """
    embedding = generate_embeddings(text)
    if embedding is None:
        logging.error("Failed to store embedding")
        return

    chroma_client, collection = get_chroma_client()  # Ensure client is initialized
    collection.add(
        documents=[text],
        # Pass the ADA vector explicitly; otherwise the collection would embed
        # the text itself with a different model than the one used for queries.
        embeddings=[embedding],
        metadatas=[{"source": source}],
        # count() gives the number of stored entries; collections do not support len().
        ids=[f"{source}_{collection.count()}"]
    )

def similarity_search(query, top_k=3):
    """Perform similarity search using ChromaDB.

    Args:
        query: The question to search for.
        top_k: Maximum number of documents to return.

    Returns:
        list[str]: The matching documents, best match first; empty when the
        query could not be embedded or nothing matched.
    """
    embedding = generate_embeddings(query)
    if embedding is None:
        return []

    chroma_client, collection = get_chroma_client()  # Ensure client is initialized
    results = collection.query(
        query_embeddings=[embedding],
        n_results=top_k
    )
    # query() returns one list of documents per query embedding; only one
    # embedding was sent, so unwrap its list.
    per_query_documents = results.get('documents') or []
    return per_query_documents[0] if per_query_documents else []

def generate_response(prompt, context="", model="gpt-3.5-turbo", temperature=0.7):
    """Generate response using OpenAI API.

    The conversation sent consists of a fixed system message, an optional
    second system message carrying ``context``, and ``prompt`` as the user
    message. On any exception the error is logged and a fixed apology string
    is returned instead of raising.
    """
    try:
        persona = {
            "role": "system",
            "content": "You are a helpful financial assistant. Provide clear and concise answers to financial questions."
        }
        grounding = (
            [{"role": "system", "content": f"Use this context for your response: {context}"}]
            if context
            else []
        )
        question = {"role": "user", "content": prompt}

        completion = client.chat.completions.create(
            model=model,
            messages=[persona, *grounding, question],
            temperature=temperature,
            max_tokens=500
        )
        return completion.choices[0].message.content
    except Exception as e:
        logging.error(f"Error generating response: {str(e)}")
        return "I apologize, but I encountered an error generating the response. Please try again."

# Main Gradio interface setup
def build_gradio_interface():
    """Build and launch the Gradio interface.

    Layout: question box, optional PDF upload, optional URL box and
    Submit/Clear buttons on the left; conversation view and a read-only status
    box on the right.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Financial Assistant with PDF and URL Support")

        with gr.Row():
            with gr.Column(scale=2):
                query = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask any financial question...",
                    lines=3
                )

                file_upload = gr.File(
                    label="Upload PDF Document (Optional)",
                    file_types=[".pdf"],
                    type="filepath"
                )

                url_input = gr.Textbox(
                    label="Enter URL (Optional)",
                    placeholder="Paste a URL to extract text...",
                    lines=1
                )

                submit_btn = gr.Button("Submit", variant="primary")
                clear_btn = gr.Button("Clear")

            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Conversation")
                response_status = gr.Textbox(
                    label="Status",
                    placeholder="Ready for your questions...",
                    interactive=False
                )

        def process_query(user_query, file_path, url, history):
            """Process user query and generate response.

            ``history`` is the conversation currently shown in the chat
            widget; the new exchange is appended to it so earlier turns stay
            visible. Returns (conversation, status text).
            """
            conversation = list(history or [])

            # Do not spend an API call on an empty question.
            if not user_query or not user_query.strip():
                return conversation, "Please enter a question."

            try:
                # Both sources contribute when both are supplied; the first
                # three chunks of each are used as context.
                context_parts = []
                if file_path:
                    logging.debug(f"Processing PDF file: {file_path}")
                    pdf_text = extract_text_from_pdf(file_path)
                    context_parts.extend(chunk_text(pdf_text)[:3])
                if url and url.strip():
                    logging.debug(f"Processing URL: {url}")
                    url_text = extract_text_from_url(url)
                    context_parts.extend(chunk_text(url_text)[:3])
                context = " ".join(context_parts)

                logging.debug(f"Generating response for query: {user_query}")
                response = generate_response(user_query, context=context)

                conversation.append((user_query, response))
                return conversation, "Response generated successfully"

            except Exception as e:
                error_message = f"An error occurred: {str(e)}"
                logging.error(f"{error_message}\n{traceback.format_exc()}")
                conversation.append(
                    (user_query, "I apologize, but I encountered an error processing your request. Please try again.")
                )
                return conversation, error_message

        # Bind functions to Gradio interface. The chat widget is also an input
        # so the handler receives the conversation displayed so far.
        submit_btn.click(
            process_query,
            inputs=[query, file_upload, url_input, chatbot],
            outputs=[chatbot, response_status]
        )
        clear_btn.click(lambda: ([], ""), None, [chatbot, response_status])

    demo.launch(debug=True, share=True)

# Start the Gradio interface: executing this cell builds the UI and calls
# demo.launch(debug=True, share=True) inside build_gradio_interface.
build_gradio_interface()




* Running on local URL:  http://127.0.0.1:7862

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.
