# RAG-Based Implementation

## Pip Installs and Import Statements

In [None]:
# Install and upgrade necessary libraries for RAG Pipeline and Topic Modeling

# Install core libraries for RAG pipeline
!pip install langchain-huggingface langchain chromadb pypdf sentence-transformers accelerate langchain-community pypdf2 torch

# Upgrade libraries to the latest versions
!pip install --upgrade langchain transformers sentence-transformers langchain-huggingface chromadb langchain_chroma

# Install and upgrade additional libraries
!pip install tiktoken
!pip install --upgrade torch torchvision torchaudio transformers sentence-transformers
!pip install -U langchain_chroma
!pip install -U langchain_huggingface langchain_chroma

# Install libraries for topic modeling
!pip install bertopic
!pip install nltk bertopic
!pip install pyLDAvis

# Download Spacy English model for NER
!python -m spacy download en_core_web_md


In [None]:
# =====================================================================
# 1. STANDARD LIBRARIES
# =====================================================================
import logging
import os
import re
import sys
from typing import Dict, List, Set, Tuple

# =====================================================================
# 2. THIRD-PARTY LIBRARIES
# =====================================================================
# 2.1 Data Analysis
import pandas as pd

# 2.2 Machine Learning and Deep Learning
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# 2.3 Natural Language Processing (NLP)
from nltk.corpus import stopwords
from nltk import download
import spacy

# 2.4 Visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# 2.5 Topic Modeling
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# 2.6 Gensim Topic Modeling
from gensim.models import LdaModel
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary

# =====================================================================
# 3. LANGCHAIN MODULES
# =====================================================================
## 3.1 Document Management
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

## 3.2 Prompts and Chains
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import LLMChain

## 3.3 Embeddings and Vector Store
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# =====================================================================
# 4. HUGGING FACE INTEGRATION
# =====================================================================
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings
from transformers import pipeline as hf_pipeline


In [None]:
import warnings
# Suppress DeprecationWarning messages so they do not clutter the cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Log configurations

In [None]:
# Create (or fetch, if this cell has already been run) the notebook's logger.
logger = logging.getLogger("RAGRetriever")

# Detach AND close handlers left over from a previous run of this cell.
# Simply clearing the list would drop the references but leave the old
# FileHandler's file descriptor open. Iterate over a copy because
# removeHandler() mutates logger.handlers.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Disable propagation to ancestor loggers so records are not emitted twice.
logger.propagate = False

# Record INFO and above.
logger.setLevel(logging.INFO)

# Shared format: timestamp - level - message.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# File handler: persists every record to a log file in the working directory.
file_handler = logging.FileHandler("pdf_processing_errors.log")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Console handler: mirrors the same records to stdout (the cell output).
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# Smoke test of the configuration.
logger.info("Logging system is configured.")


### Preprocessing Utilities

Purpose: These functions handle text preprocessing for cleaning, tokenizing, and preparing text for embedding, summarization, or topic modeling.

**Functions:**

- normalize_metadata
- normalize_metadata_values
- preprocess_text
- preprocess_text_gensim
- preprocess_corpus_gensim
- preprocess_corpus_bertopic
- remove_repetitions
- get_page_counts

In [None]:
# Fetch the NLTK stopword corpus (via nltk.download) and build the shared
# English stopword set used by the preprocessing functions below.
download("stopwords")
DEFAULT_STOPWORDS = set(stopwords.words("english"))

# Load the spaCy English pipeline; the preprocessing functions use its NER
# output to drop PERSON entities.
nlp_model = spacy.load("en_core_web_md")

In [None]:
def normalize_metadata_values(metadata: Dict, required_fields: Set[str], case: str = "lower") -> Dict:
    """
    Normalizes metadata fields and ensures required fields are present.

    Only the keys listed in ``required_fields`` appear in the result; any
    other key in ``metadata`` is dropped.

    Args:
        metadata (dict): Metadata dictionary.
        required_fields (set): Set of required metadata fields.
        case (str): Case normalization ('lower', 'upper', or 'title').
            Any other value falls back to upper-casing, which is what
            earlier versions of this function did.
    Returns:
        dict: Normalized metadata. Every value is a stripped, case-normalized
        string, or "N/A" when the field is absent, None, or NaN/NA.
    """
    case_transforms = {"lower": str.lower, "title": str.title}
    transform = case_transforms.get(case, str.upper)

    normalized_metadata = {}

    for key in required_fields:
        value = metadata.get(key)
        # pandas represents empty cells as NaN/NA rather than None; treat
        # those as missing too so they do not leak through as the text "nan".
        # is_scalar() guards pd.isna(), which returns an array for containers.
        is_missing = value is None or (pd.api.types.is_scalar(value) and pd.isna(value))
        normalized_metadata[key] = "N/A" if is_missing else transform(str(value).strip())

    return normalized_metadata

def normalize_metadata(metadata_df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizes the metadata DataFrame by standardizing text fields and handling missing values.

    The result contains only the required columns (Year, Quarter, Bank,
    Designation, Name, Source), in alphabetical order, with a fresh
    0..n-1 index. Values are normalized by `normalize_metadata_values`
    using lower-casing.

    Args:
        metadata_df (pd.DataFrame): The metadata DataFrame to normalize.
    Returns:
        pd.DataFrame: The normalized metadata DataFrame. An empty input
        yields an empty frame that still carries the required columns.
    Raises:
        KeyError: If any required column is missing from ``metadata_df``.
    """
    try:
        logger.info("Starting metadata normalization...")

        required_fields = {"Year", "Quarter", "Bank", "Designation", "Name", "Source"}

        # Check for required columns
        missing_columns = required_fields - set(metadata_df.columns)
        if missing_columns:
            raise KeyError(f"Metadata DataFrame is missing required columns: {missing_columns}")

        # Normalize row by row. Iterating over records (instead of
        # DataFrame.apply) keeps the empty-frame case well defined: apply()
        # on an empty frame returns a DataFrame, not a Series of dicts.
        normalized_rows = [
            normalize_metadata_values(row, required_fields, case="lower")
            for row in metadata_df.to_dict(orient="records")
        ]
        # Explicit, sorted column list: stable column order across runs and
        # correct columns even when there are no rows.
        normalized_df = pd.DataFrame(normalized_rows, columns=sorted(required_fields))

        logger.info(f"Metadata normalization completed successfully for {len(metadata_df)} rows.")
        return normalized_df

    except KeyError as ke:
        logger.error(f"Metadata normalization error: {ke}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error during metadata normalization: {e}")
        raise


In [None]:
def preprocess_text(
    text: str,
    nlp_model=None,
    custom_stopwords=None,
    redundant_terms=None
) -> list:
    """
    Preprocess text for BERTopic.

    Steps, in order: strip URLs and e-mail addresses, drop PERSON entities
    (when an NER model is given), lower-case, remove redundant phrases,
    remove every non-letter character, and filter stopwords.

    Args:
        text (str): Raw text input.
        nlp_model (spacy.Language): SpaCy model for NER. If None, names are
            not removed.
        custom_stopwords (list): Additional stopwords.
        redundant_terms (list): Terms to explicitly remove (in addition to
            the built-in defaults). Matched case-insensitively.

    Returns:
        list: Tokenized and cleaned words. An empty list is returned (and
        the error logged) if preprocessing fails for any reason.
    """
    try:
        # Remove URLs and emails first. IGNORECASE because the text has not
        # been lower-cased yet at this point.
        text = re.sub(r"https?://\S+|www\.\S+", "", text, flags=re.IGNORECASE)
        text = re.sub(r"\S+@\S+", "", text)

        # Run NER on the ORIGINAL casing: spaCy's statistical entity
        # recognizer uses capitalization as a cue, so lower-casing first
        # makes it miss many PERSON entities.
        if nlp_model:
            doc = nlp_model(text)
            text = " ".join(token.text for token in doc if token.ent_type_ != "PERSON")

        # Normalize case only after entities have been removed.
        text = text.lower()

        # Default redundant terms
        default_redundant_terms = [
            "thank you", "good morning", "good afternoon", "earnings call",
            "conference call", "slide", "question", "operator"
        ]
        terms_to_remove = set(redundant_terms or []).union(default_redundant_terms)

        # Remove redundant terms as whole words/phrases. The text is
        # lower-case here, so lower-case the term as well.
        for term in terms_to_remove:
            text = re.sub(rf"\b{re.escape(term.lower())}\b", "", text)

        # Remove special characters and normalize whitespace
        text = re.sub(r"[^a-z\s]", "", text).strip()
        text = re.sub(r"\s+", " ", text)

        # Tokenize and remove stopwords. union() builds a new set, so the
        # module-level DEFAULT_STOPWORDS is left untouched.
        stop_words = DEFAULT_STOPWORDS.union(custom_stopwords or [])
        return [word for word in text.split() if word not in stop_words]
    except Exception as e:
        # Best effort: one bad document must not abort a whole corpus run.
        logger.error(f"Error during text preprocessing: {e}")
        return []

def preprocess_corpus_bertopic(
    corpus, nlp_model=None, custom_stopwords=None, redundant_terms=None, min_count=2, threshold=5
):
    """
    Clean a corpus for BERTopic and merge frequent word pairs/triples into phrases.

    Each document is stripped of URLs, tokenized with `preprocess_text`,
    passed through a bigram and then a trigram gensim phrase model trained
    on this same corpus, and finally re-joined into a single string.

    Args:
        corpus (list of str): List of raw text documents.
        nlp_model (spacy.Language): Spacy NER model for removing names (PERSON entities).
        custom_stopwords (list): Additional stopwords to remove.
        redundant_terms (list): Terms to explicitly remove from text.
        min_count (int): Minimum count for bigram detection (not passed to
            the trigram model).
        threshold (int): Phrase scoring threshold, used for both models.

    Returns:
        list of str: One space-joined string of tokens/phrases per input document.
    """
    logger.info("Starting corpus preprocessing for BERTopic...")

    url_pattern = r"https?://\S+|www\.\S+"

    # Clean and tokenize every document.
    logger.info("Preprocessing individual documents...")
    tokenized_corpus = [
        preprocess_text(re.sub(url_pattern, "", raw_doc), nlp_model, custom_stopwords, redundant_terms)
        for raw_doc in corpus
    ]

    logger.info("Sample preprocessed tokens:")
    logger.info(tokenized_corpus[:2])

    # Train the phrase detectors: bigrams on the tokens, trigrams on the
    # bigram-transformed tokens.
    logger.info("Building bigram and trigram models...")
    bigram_model = Phrases(tokenized_corpus, min_count=min_count, threshold=threshold)
    trigram_model = Phrases(bigram_model[tokenized_corpus], threshold=threshold)
    bigram_phraser = Phraser(bigram_model)
    trigram_phraser = Phraser(trigram_model)

    # Apply both detectors and join the tokens back into strings for BERTopic.
    logger.info("Applying bigram and trigram models...")
    processed_corpus = [
        " ".join(trigram_phraser[bigram_phraser[tokens]])
        for tokens in tokenized_corpus
    ]

    logger.info("Sample processed documents for BERTopic:")
    logger.info(processed_corpus[:2])

    return processed_corpus


def preprocess_text_gensim(text, nlp_model=None, custom_stopwords=None, redundant_terms=None):
    """
    Preprocess text for LDA topic modeling, focusing on bigrams and trigrams.

    Steps, in order: strip URLs, drop PERSON entities (when an NER model is
    given), lower-case, remove redundant terms, remove every non-letter
    character, and filter stopwords. Neither the arguments nor the
    module-level stopword set are modified.

    Args:
        text (str): Raw text to preprocess.
        nlp_model (spacy.Language): Spacy NER model for removing names (PERSON entities).
        custom_stopwords (list): Additional stopwords to remove.
        redundant_terms (list): Terms to explicitly remove from text, in
            addition to the built-in defaults. Matched case-insensitively.

    Returns:
        list: Tokenized and preprocessed words.
    """
    # Remove URLs. IGNORECASE because the text has not been lower-cased yet.
    text = re.sub(r"https?://\S+|www\.\S+", "", text, flags=re.IGNORECASE)

    # Remove names using Spacy NER if a model is provided. This runs on the
    # ORIGINAL casing: spaCy's statistical entity recognizer uses
    # capitalization as a cue, so lower-casing first hides many PERSON entities.
    if nlp_model:
        doc = nlp_model(text)
        text = " ".join(token.text for token in doc if token.ent_type_ != "PERSON")

    # Normalize text to lowercase
    text = text.lower()

    # Default redundant terms to remove.
    # NOTE(review): several of these use underscores (the form gensim Phrases
    # produces AFTER this function has run). On raw text they can only match
    # if the source literally contains underscores - confirm intent.
    default_redundant_terms = [
        "seeking_alpha", "thank_much", "group_ag_cs_results", "transcript_seeking_alpha",
        "good_morning", "good_afternoon", "earnings_call", "company", "presentation",
        "analyst", "operator", "thomas", "gottstein"
    ]

    # Merge custom and default terms into a NEW list. Extending the caller's
    # list in place would make it grow by the defaults on every call (once
    # per document when driven from a corpus loop).
    terms_to_remove = list(redundant_terms or []) + default_redundant_terms

    # Remove redundant terms using regex (whole-word match)
    for term in terms_to_remove:
        text = re.sub(rf"\b{re.escape(term.lower())}\b", "", text)

    # Remove special characters, digits, and extra whitespace
    text = re.sub(r"[^a-z\s]", "", text)  # Remove non-alphabetic characters
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace

    # Tokenize and remove stopwords. union() returns a new set; updating
    # DEFAULT_STOPWORDS in place would leak custom stopwords into every later
    # call in the session.
    stop_words = DEFAULT_STOPWORDS.union(custom_stopwords or [])

    return [word for word in text.split() if word not in stop_words]

def preprocess_corpus_gensim(
    corpus, nlp_model=None, custom_stopwords=None, redundant_terms=None, extra_redundant_terms=None, min_count=2, threshold=5, keep_unigrams=False
):
    """
    Preprocess a corpus for LDA topic modeling and generate bigrams and trigrams.

    Args:
        corpus (list of str): List of raw text documents.
        nlp_model (spacy.Language): Spacy NER model for removing names (PERSON entities).
        custom_stopwords (list): Additional stopwords to remove.
        redundant_terms (list): Terms to explicitly remove from text.
        extra_redundant_terms (list): Further terms to remove; merged with
            ``redundant_terms`` before preprocessing.
        min_count (int): Minimum count for bigram detection. It is not
            passed to the trigram model, which therefore uses gensim's
            own default.
        threshold (int): Phrase scoring threshold for bigram/trigram detection.
        keep_unigrams (bool): If True, include unigrams alongside bigrams/trigrams.

    Returns:
        list of list: Preprocessed tokenized documents with bigrams/trigrams (and optionally unigrams).
    """
    logger.info("Starting corpus preprocessing for bigrams and trigrams...")

    def remove_urls(text):
        """Remove URLs from the text."""
        return re.sub(r"https?://\S+|www\.\S+", "", text)

    # Combine both term lists once. Building a new list leaves the caller's
    # lists untouched.
    merged_terms = list(redundant_terms or []) + list(extra_redundant_terms or [])

    # Preprocess each document in the corpus
    logger.info("Preprocessing individual documents...")
    tokenized_corpus = []
    for doc in corpus:
        # Remove URLs before tokenization
        doc = remove_urls(doc)

        # Tokenize, filter stopwords, and remove redundant terms. A fresh
        # copy (or None when empty) is passed per document so that a callee
        # which modifies its argument cannot accumulate terms across documents.
        tokens = preprocess_text_gensim(
            doc, nlp_model, custom_stopwords, list(merged_terms) if merged_terms else None
        )
        tokenized_corpus.append(tokens)

    # Log tokenized sample
    logger.info("Sample preprocessed tokens:")
    logger.info(tokenized_corpus[:2])

    # Build bigram and trigram models with adjusted parameters
    logger.info("Building bigram and trigram models...")
    bigram_model = Phrases(tokenized_corpus, min_count=min_count, threshold=threshold)
    trigram_model = Phrases(bigram_model[tokenized_corpus], threshold=threshold)
    bigram_phraser = Phraser(bigram_model)
    trigram_phraser = Phraser(trigram_model)

    # Apply bigram and trigram models to the corpus
    logger.info("Applying bigram and trigram models...")
    processed_corpus = []
    for doc in tokenized_corpus:
        phrases = trigram_phraser[bigram_phraser[doc]]
        if keep_unigrams:
            # Include unigrams alongside bigrams/trigrams
            processed_corpus.append(list(phrases))
        else:
            # Keep only bigrams/trigrams: merged phrases are the tokens
            # that contain the "_" joiner.
            processed_corpus.append([token for token in phrases if "_" in token])

    # Log sample processed corpus
    logger.info("Sample processed documents with bigrams/trigrams:")
    logger.info(processed_corpus[:2])

    return processed_corpus


### Pipeline Configuration

Purpose: These functions set up the overall Retrieval-Augmented Generation (RAG) pipeline, including metadata handling, document loading, embedding generation, and retriever initialization.

**Functions:**
- load_metadata
- load_pdf_file
- load_documents
- attach_metadata_to_documents
- propagate_metadata_to_chunks
- validate_metadata
- initialize_embeddings_and_vector_store
- setup_retriever
- load_model
- configure_rag_pipeline

In [None]:
def load_metadata(metadata_path: str) -> pd.DataFrame:
    """
    Read the metadata CSV and return it in normalized form.

    A column named 'File' is renamed to 'Source' before the frame is handed
    to `normalize_metadata`.

    Args:
        metadata_path (str): Path to the metadata CSV file.
    Returns:
        pd.DataFrame: The normalized metadata DataFrame.
    Raises:
        ValueError: If pandas reports the CSV as empty.
        KeyError: Re-raised when a KeyError occurs during normalization.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        logger.info(f"Loading metadata from: {metadata_path}")

        raw_df = pd.read_csv(metadata_path)
        logger.info(f"Loaded {len(raw_df)} rows of metadata.")

        # The CSV may label the file-name column 'File'; downstream code
        # expects 'Source'.
        if "File" in raw_df.columns:
            raw_df = raw_df.rename(columns={"File": "Source"})

        normalized_df = normalize_metadata(raw_df)

        logger.info(f"Loaded data with meta data completed successfully for {len(normalized_df)} rows.")
        return normalized_df

    except pd.errors.EmptyDataError:
        logger.error("The metadata file is empty or improperly formatted.")
        raise ValueError("The metadata file is empty or improperly formatted.")
    except KeyError as ke:
        logger.error(f"Metadata normalization error: {ke}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error during metadata loading: {e}")
        raise


def load_pdf_file(file_path: str) -> List[Document]:
    """
    Load one PDF via PyPDFLoader and tag every resulting document with its source.

    The metadata produced by the loader is replaced by a single 'source'
    entry holding the lower-cased, stripped file name.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        List[Document]: The loaded documents, or an empty list if loading
        failed (the error is logged, not raised).
    """
    try:
        pdf_documents = PyPDFLoader(file_path).load_and_split()
        logger.info(f"Loaded {len(pdf_documents)} pages from {file_path}")

        # Every document gets its own fresh metadata dict with the
        # normalized file name as its only key.
        normalized_source = os.path.basename(file_path).strip().lower()
        for pdf_doc in pdf_documents:
            pdf_doc.metadata = {"source": normalized_source}

        return pdf_documents
    except Exception as e:
        logger.error(f"Error loading or processing PDF '{file_path}': {e}")
        return []

def load_documents(folder_path: str) -> List[Document]:
    """
    Loads PDF documents from the specified folder.

    Files are processed in sorted name order so that the order of the
    returned documents is the same on every run and platform.

    Args:
        folder_path (str): Path to the folder containing PDF files.
    Returns:
        List[Document]: A list of document objects with metadata. An empty
        list is returned when the folder does not exist, contains no
        loadable PDFs, or any other error occurs; the problem is logged and
        no exception reaches the caller.
    """
    try:
        if not os.path.exists(folder_path):
            # Caught by the handler below: reported via the log and an
            # empty result rather than propagated.
            raise FileNotFoundError(f"Specified folder path does not exist: {folder_path}")

        logger.info(f"Loading PDF files from folder: {folder_path}")

        # os.listdir() returns entries in arbitrary order; sort so the
        # result (and anything derived from document positions) is
        # reproducible.
        pdf_filenames = sorted(
            filename for filename in os.listdir(folder_path)
            if filename.lower().endswith(".pdf")
        )

        documents = []
        for filename in pdf_filenames:
            documents.extend(load_pdf_file(os.path.join(folder_path, filename)))

        if not documents:
            logger.warning("No valid PDF documents were loaded from the folder.")
        else:
            logger.info(f"Successfully loaded {len(documents)} documents from {folder_path}.")

        return documents
    except Exception as e:
        logger.error(f"An error occurred while loading documents: {e}")
        return []

In [None]:
def attach_metadata_to_documents(documents: List[Document], metadata_df: pd.DataFrame) -> List[Document]:
    """
    Attaches metadata to documents based on their source.

    Each document's 'source' metadata (reduced to a lower-cased, stripped
    base name) is looked up in the 'Source' column of ``metadata_df``. On a
    match the document's metadata is REPLACED by that row, with column names
    lower-cased and values converted to stripped strings ("n/a" for missing
    values). Documents without a match are kept unchanged and a warning is
    logged. Documents are modified in place.

    Args:
        documents (List[Document]): List of document objects.
        metadata_df (pd.DataFrame): Normalized metadata DataFrame.
    Returns:
        List[Document]: List of documents with enriched metadata.
    Raises:
        KeyError: If ``metadata_df`` has no 'Source' column.
    """
    try:
        if "Source" not in metadata_df.columns:
            raise KeyError("The metadata DataFrame is missing the 'Source' column.")

        # Build the source -> row lookup once instead of scanning the whole
        # DataFrame for every document. setdefault() keeps the FIRST row for
        # a duplicated source.
        metadata_by_source = {}
        for record in metadata_df.to_dict(orient="records"):
            metadata_by_source.setdefault(record["Source"], record)

        enriched_documents = []

        for doc in documents:
            source_file = os.path.basename(doc.metadata.get("source", "")).lower().strip()
            logger.debug(f"Processing document with source: {source_file}")

            matched_record = metadata_by_source.get(source_file)

            if matched_record is not None:
                doc.metadata = {
                    key.lower(): str(value).strip() if pd.notna(value) else "n/a"
                    for key, value in matched_record.items()
                }
            else:
                logger.warning(f"No metadata match found for source: {source_file}")

            enriched_documents.append(doc)

        return enriched_documents
    except Exception as e:
        logger.error(f"Error attaching metadata to documents: {e}")
        raise

def propagate_metadata_to_chunks(
    documents: List[Document],
    chunk_size=512,
    chunk_overlap=50,
    tokenizer_name: str = "microsoft/Phi-3-mini-4k-instruct",
    token_warning_threshold: int = 1024,
) -> List[Document]:
    """
    Splits documents into chunks and propagates the parent metadata to each chunk.

    Splitting uses RecursiveCharacterTextSplitter with no custom length
    function, so ``chunk_size`` and ``chunk_overlap`` are measured in
    CHARACTERS (the splitter's default), not tokens. The tokenizer is used
    only to count tokens per chunk for logging; it never influences where
    the text is split.

    Args:
        documents (list): List of documents with metadata.
        chunk_size (int): Maximum chunk size handed to the splitter.
        chunk_overlap (int): Overlap between consecutive chunks.
        tokenizer_name (str): Hugging Face tokenizer used for the token count.
        token_warning_threshold (int): A warning is logged for every chunk
            whose token count exceeds this value.

    Returns:
        List[Document]: All chunks, each carrying its parent's metadata plus
        a 'chunk_id' of the form '<source>_<doc index>_<chunk index>'.
    """
    # Character-based splitter (default length function)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Tokenizer for the diagnostic token count only
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    enriched_chunks = []

    for doc_idx, doc in enumerate(documents):
        chunks = text_splitter.split_documents([doc])

        for chunk_idx, chunk in enumerate(chunks):
            # Propagate metadata to the chunk
            chunk.metadata.update(doc.metadata)
            chunk.metadata["chunk_id"] = f"{doc.metadata.get('source', 'unknown')}_{doc_idx}_{chunk_idx}"

            # Count tokens and log; truncation=False so long chunks report
            # their real length.
            token_count = len(tokenizer.encode(chunk.page_content, truncation=False))
            logger.debug(f"Chunk {chunk_idx} from Document {doc_idx} has {token_count} tokens.")
            if token_count > token_warning_threshold:
                logger.warning(f"Chunk {chunk_idx} exceeds token limit with {token_count} tokens.")

            enriched_chunks.append(chunk)

    logger.info(f"Generated {len(enriched_chunks)} enriched chunks.")
    return enriched_chunks

def validate_metadata(documents: List[Document], required_fields: Set[str] = None) -> List[Document]:
    """
    Fill in missing metadata fields on each document.

    Every required field that is absent or falsy in a document's metadata
    is set to "Unknown" (in place). Documents whose ``metadata`` attribute
    is missing or is not a dict are left out of the result.

    Args:
        documents (List[Document]): Documents to check.
        required_fields (Set[str]): Field names that must be populated.
            Defaults to year, quarter, bank, designation, name and source.

    Returns:
        List[Document]: The documents that have a metadata dict. If an
        unexpected error occurs it is logged and the original list is
        returned instead.
    """
    fields = (
        {"year", "quarter", "bank", "designation", "name", "source"}
        if required_fields is None
        else required_fields
    )

    accepted = []
    try:
        for document in documents:
            meta = getattr(document, "metadata", None)
            if not isinstance(meta, dict):
                # No usable metadata container: skip this document.
                continue
            for name in fields:
                # .get() covers both "key absent" and "value falsy".
                if not meta.get(name):
                    meta[name] = "Unknown"
            accepted.append(document)
        return accepted
    except Exception as e:
        logger.error(f"Error in validate_metadata: {e}")
        return documents

In [None]:
def initialize_embeddings_and_vector_store(
    chunks,
    embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2",
    persist_directory="qa_index",
    use_existing=True,
):
    """
    Return a Chroma vector store, reusing a persisted one when possible.

    If ``use_existing`` is set and ``persist_directory`` already contains
    files, the store is opened from disk; an opened store that holds no
    embeddings is rebuilt from ``chunks``. If opening fails, or no
    persisted data is used, a new store is built from ``chunks``.

    Args:
        chunks (list): Document chunks to embed. Must be non-empty even
            when an existing store ends up being loaded.
        embeddings_model_name (str): Hugging Face model used for embeddings.
        persist_directory (str): Directory the store is persisted to / loaded from.
        use_existing (bool): Whether to try the persisted store first.

    Returns:
        Chroma: Initialized or loaded vector store instance.

    Raises:
        ValueError: If ``chunks`` is empty.
        Exception: Any other failure is logged with a traceback and re-raised.
    """
    try:
        if not chunks:
            raise ValueError("No chunks provided for vector store initialization.")

        # Make sure the target directory is there before probing it.
        os.makedirs(persist_directory, exist_ok=True)
        logger.info(f"Using embedding model: {embeddings_model_name}")

        embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

        def build_from_chunks():
            """Embed ``chunks`` into a store persisted at ``persist_directory``."""
            return Chroma.from_documents(
                documents=chunks, embedding=embeddings, persist_directory=persist_directory
            )

        vector_store = None
        has_persisted_data = (
            use_existing and os.path.exists(persist_directory) and os.listdir(persist_directory)
        )
        if has_persisted_data:
            try:
                logger.info(f"Loading existing vector store from: {persist_directory}")
                vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

                if vector_store._collection.count() != 0:
                    logger.info(f"Loaded vector store with {vector_store._collection.count()} embeddings.")
                else:
                    # Opened fine but holds nothing: fill it from the chunks.
                    logger.warning("Existing vector store is empty. Rebuilding with provided chunks...")
                    vector_store = build_from_chunks()
                    logger.info(f"Rebuilt vector store with {len(chunks)} document chunks.")
            except Exception as e:
                logger.warning(f"Failed to load existing vector store: {e}. Rebuilding...")

        # Nothing usable so far: build a brand-new store.
        if not vector_store:
            logger.info(f"Creating a new vector store with {len(chunks)} document chunks.")
            vector_store = build_from_chunks()

        logger.info("Vector store initialized successfully.")
        return vector_store

    except Exception as e:
        logger.exception("Error initializing vector store.")
        raise


def setup_retriever(vector_store: Chroma, top_k=3):
    """
    Wrap a vector store in a retriever that returns the ``top_k`` best matches.

    Args:
        vector_store (Chroma): The vector database instance.
        top_k (int): Number of results to fetch per query (passed to the
            retriever as search keyword ``k``).

    Returns:
        Retriever: The object returned by ``vector_store.as_retriever``.

    Raises:
        ValueError: If ``vector_store`` is falsy.
        Exception: Any failure is logged with a traceback and re-raised.
    """
    try:
        if not vector_store:
            raise ValueError("Vector store is empty or not initialized.")

        logger.info(f"Setting up retriever with top_k={top_k}...")
        search_options = {"k": top_k}
        configured_retriever = vector_store.as_retriever(search_kwargs=search_options)
        logger.info("Retriever configured successfully.")

        return configured_retriever

    except Exception as e:
        logger.exception("Error setting up retriever.")
        raise


In [None]:
def load_model(
    model_name="microsoft/Phi-3-mini-4k-instruct",
    max_new_tokens=300,
    device="cuda",
    precision="float16",
    trust_remote_code=True
):
    """
    Load a causal language model and wrap it in a text-generation pipeline.

    Args:
        model_name (str): Name of the pre-trained model to load.
        max_new_tokens (int): Tokens to generate per response; must be an
            integer in 1..1024.
        device (str): "cuda" or "cpu". A "cuda" request silently becomes
            "cpu" (with float32) when CUDA is unavailable.
        precision (str): "float16" or "float32". float16 is only used on
            CUDA; every other combination loads float32 weights.
        trust_remote_code (bool): Passed through to the model and tokenizer loaders.

    Returns:
        transformers.pipelines.Pipeline: A configured pipeline ready for text generation.

    Raises:
        RuntimeError: For any failure, including an invalid ``max_new_tokens``.
    """
    # Fixed seed for reproducibility
    torch.manual_seed(0)

    try:
        tokens_ok = isinstance(max_new_tokens, int) and 0 < max_new_tokens <= 1024
        if not tokens_ok:
            raise ValueError("max_new_tokens must be a positive integer and less than or equal to 1024.")

        # Fall back to CPU / full precision when CUDA was requested but is absent.
        if device == "cuda" and not torch.cuda.is_available():
            logger.warning("CUDA is not available. Falling back to CPU.")
            device, precision = "cpu", "float32"

        # Half precision only makes sense on the GPU here.
        wants_half = precision == "float16" and device == "cuda"
        dtype = torch.float16 if wants_half else torch.float32

        logger.info(f"Loading model '{model_name}' on {device} with precision '{precision}' and max_new_tokens={max_new_tokens}.")

        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=dtype, trust_remote_code=trust_remote_code
        ).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
        # Use the end-of-sequence token for padding.
        tokenizer.pad_token = tokenizer.eos_token

        # Pipeline device index: 0 for CUDA, -1 for CPU.
        pipeline_device = 0 if device == "cuda" else -1
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=pipeline_device,
            max_new_tokens=max_new_tokens,
        )

        model_params = sum(p.numel() for p in model.parameters())
        logger.info(f"Model '{model_name}' loaded successfully with {model_params:,} parameters.")

        return pipe

    except Exception as e:
        logger.exception(f"Failed to load model '{model_name}': {e}")
        raise RuntimeError(f"Failed to load model: {e}")


In [None]:
def load_and_preprocess_data(
    folder_path: str,
    metadata_path: str,
    chunk_size: int,
    chunk_overlap: int,
    required_metadata_fields: Set[str]
) -> Tuple[List[Document], List[Document]]:
    """
    Streamlines the loading, metadata attachment, validation, and chunking process.

    Args:
        folder_path (str): Path to the folder containing documents.
        metadata_path (str): Path to the metadata CSV file.
        chunk_size (int): Maximum chunk size passed to the chunking step.
        chunk_overlap (int): Overlap between chunks passed to the chunking step.
        required_metadata_fields (Set[str]): Required metadata fields for validation.

    Returns:
        Tuple[List[Document], List[Document]]: ``(enriched_documents,
        document_chunks)`` - the documents after metadata attachment, and
        the chunks produced from the validated documents.

    Raises:
        ValueError: If no documents could be loaded from ``folder_path``.
        FileNotFoundError: Re-raised if one of the steps raises it.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        logger.info("Starting the preprocessing pipeline...")

        # Step 1: Load metadata from the provided CSV file
        logger.info("Step 1: Loading metadata...")
        metadata_df = load_metadata(metadata_path)
        logger.info(f"Metadata loaded successfully with {len(metadata_df)} rows.")

        # Step 2: Load PDF documents from the specified folder
        logger.info("Step 2: Loading documents from the folder...")
        documents = load_documents(folder_path)
        if not documents:
            raise ValueError("No documents found in the specified folder.")
        logger.info(f"Loaded {len(documents)} documents from {folder_path}.")

        # Step 3: Attach metadata to each document
        logger.info("Step 3: Attaching metadata to documents...")
        enriched_documents = attach_metadata_to_documents(documents, metadata_df)
        logger.info(f"Metadata successfully attached to {len(enriched_documents)} documents.")

        # Step 4: Validate the metadata against required fields
        logger.info("Step 4: Validating metadata for enriched documents...")
        validated_documents = validate_metadata(enriched_documents, required_fields=required_metadata_fields)
        logger.info(f"Metadata validation completed for {len(validated_documents)} documents.")

        # Step 5: Split documents into chunks and propagate metadata
        logger.info("Step 5: Splitting documents into chunks...")
        document_chunks = propagate_metadata_to_chunks(
            validated_documents,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        logger.info(f"Generated {len(document_chunks)} document chunks.")

        # Final Step: Log completion and return results
        logger.info("Preprocessing pipeline completed successfully.")
        return enriched_documents, document_chunks

    except FileNotFoundError as fnf_error:
        logger.error(f"File not found: {fnf_error}")
        raise  # Specific actionable error, such as missing folder or file.
    except Exception as e:
        logger.error(f"An error occurred during preprocessing: {e}")
        raise

In [None]:
def get_model_token_limit(model_name: str) -> int:
    """
    Retrieves the token limit of a model from its configuration.

    Only the model configuration is loaded (via ``AutoConfig``); the model
    weights are never instantiated, so the lookup does not pay the memory and
    time cost of loading the full model.

    Args:
        model_name (str): The name or path of the pre-trained model.

    Returns:
        int: The ``max_position_embeddings`` value from the model configuration.

    Raises:
        RuntimeError: If the configuration cannot be loaded or does not define
            ``max_position_embeddings``. The underlying error is chained as
            the cause.
    """
    try:
        # Imported here so the function does not rely on a file-level AutoConfig import.
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained(model_name)
        token_limit = getattr(config, "max_position_embeddings", None)
        if token_limit is None:
            raise ValueError(f"Model {model_name} does not specify 'max_position_embeddings'.")
        return token_limit
    except Exception as e:
        # Every failure is reported as RuntimeError, as before; `from e` keeps the root cause.
        raise RuntimeError(f"Failed to retrieve token limit for model '{model_name}': {e}") from e


### RAG Pipeline Implementation

The Retrieval-Augmented Generation (RAG) pipeline is an advanced framework designed to enhance natural language processing by combining retrieval-based document search with generative language models. This approach provides highly accurate and context-aware responses to user queries by integrating the following components:

- Document Retrieval
- Generative Language Model
- Metadata and Context Management
- Sentiment Analysis
- Summarisation
- Topic Modeling

**configure_rag_pipeline**: this function serves as the backbone for applications requiring accurate document retrieval, summarization, sentiment classification, and topic analysis, making it an all-in-one solution for retrieval-augmented generation tasks.

Workflow
- Document Preprocessing: Load and enrich documents, then split them into manageable chunks.
- Embedding Generation: Convert document chunks into vector embeddings and store them in a ChromaDB vector store.
- Retriever Setup: Configure a retriever to fetch the most relevant chunks for user queries.
- Generative Responses: Use retrieved chunks to generate responses via a language model.
- Auxiliary Insights: Apply summarization, sentiment analysis, and topic modeling for deeper understanding of the data.

In [None]:
def configure_rag_pipeline(
    folder_path: str,
    metadata_path: str,
    model_name: str = "microsoft/Phi-3-mini-4k-instruct",
    embeddings_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    model_token_limit: int = 4096,
    top_k: int = 3,
    max_new_tokens: int = 500,
    chunk_size: int = None,  # Dynamically calculated based on model token limit
    chunk_overlap: int = 200,
    required_metadata_fields: Set[str] = None,
    persist_directory: str = "qa_index",
    device: str = "cuda",
) -> dict:
    """
    Configures a Retrieval-Augmented Generation (RAG) pipeline.

    The function preprocesses the documents, builds (or reuses) the vector store,
    sets up the retriever, loads the language model, initializes the QA chains and
    creates the summarization, sentiment-analysis and BERTopic components.

    Args:
        folder_path (str): Path to the folder containing documents.
        metadata_path (str): Path to the metadata CSV file.
        model_name (str): Default language model to load.
        embeddings_model_name (str): The embeddings model to use for vector store creation.
        model_token_limit (int): Fallback token limit. The limit is first read from the
            model configuration; this value is used only when that lookup fails.
        top_k (int): Number of top results to retrieve.
        max_new_tokens (int): Maximum number of tokens to generate.
        chunk_size (int): Maximum size of each document chunk, passed to the preprocessing
            step. When None, it is derived from the token budget: half of what remains
            after reserving prompt and generation tokens, capped at 2000.
        chunk_overlap (int): Overlap between consecutive chunks, passed to the preprocessing step.
        required_metadata_fields (Set[str]): Set of required metadata fields for validation.
            Defaults to {"year", "quarter", "bank", "source", "designation", "name"}.
        persist_directory (str): Directory for the vector store.
        device (str): Device for model loading (e.g., "cuda" or "cpu").

    Returns:
        dict: A dictionary containing configured pipeline components, or None if any
        step fails (the failure is logged rather than raised).
    """
    try:
        # Prefer the limit declared by the model itself; fall back to the caller-supplied
        # value so that the `model_token_limit` argument is not silently ignored.
        try:
            model_token_limit = get_model_token_limit(model_name)
            logger.info(f"Model token limit for '{model_name}': {model_token_limit}")
        except RuntimeError as limit_error:
            if model_token_limit is None or model_token_limit <= 0:
                raise
            logger.warning(
                f"Could not determine token limit for '{model_name}' ({limit_error}); "
                f"falling back to model_token_limit={model_token_limit}."
            )

        # Set default required metadata fields if not provided
        if required_metadata_fields is None:
            required_metadata_fields = {"year", "quarter", "bank", "source", "designation", "name"}

        # Calculate dynamic chunk size if not specified
        if chunk_size is None:
            prompt_tokens = 200  # Reserve tokens for the prompt
            token_budget = model_token_limit - prompt_tokens - max_new_tokens
            chunk_size = min(2000, token_budget // 2)  # Use half the remaining tokens per chunk
            if chunk_size <= 0:
                # A non-positive size cannot produce chunks; fail with a clear message
                # instead of passing it on to the splitter.
                raise ValueError(
                    f"Token budget too small to derive a chunk size "
                    f"(model_token_limit={model_token_limit}, prompt_tokens={prompt_tokens}, "
                    f"max_new_tokens={max_new_tokens})."
                )
            logger.info(
                f"Dynamic chunk size calculated based on model_token_limit={model_token_limit}, "
                f"prompt_tokens={prompt_tokens}, max_new_tokens={max_new_tokens}: {chunk_size}"
            )

        # Step 1: Load and preprocess documents
        logger.info("Starting document preprocessing...")
        enriched_documents, document_chunks = load_and_preprocess_data(
            folder_path=folder_path,
            metadata_path=metadata_path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            required_metadata_fields=required_metadata_fields,
        )
        if not enriched_documents:
            logger.error("No documents were enriched. Check metadata and folder path.")
            return None

        if not document_chunks:
            logger.error("No document chunks generated. Check text splitting or preprocessing logic.")
            return None

        # Log enriched document metadata for verification
        logger.info("Enriched document metadata (preview):")
        for i, doc in enumerate(enriched_documents[:3]):  # Preview the first 3 documents
            logger.info(f"Document {i + 1}: {doc.metadata}")

        # Step 2: Initialize embeddings and vector store
        logger.info("Initializing embeddings and vector store...")
        vector_store = initialize_embeddings_and_vector_store(
            chunks=document_chunks,
            embeddings_model_name=embeddings_model_name,
            persist_directory=persist_directory,
            use_existing=True,
        )

        if not vector_store or vector_store._collection.count() == 0:
            logger.warning("Vector store is empty. Consider rebuilding with valid document chunks.")
            return None

        # Step 3: Set up retriever
        logger.info("Setting up retriever...")
        retriever = setup_retriever(vector_store, top_k=top_k)
        if not retriever:
            logger.error("Failed to configure retriever. Exiting pipeline configuration.")
            return None

        # Step 4: Load the language model
        logger.info("Loading the language model...")
        llm_pipeline = load_model(
            model_name=model_name,
            max_new_tokens=max_new_tokens,
            device=device,
        )

        if not llm_pipeline:
            logger.error("Failed to load the language model. Exiting pipeline configuration.")
            return None

        # Step 5: Initialize QA chains
        logger.info("Initializing QA chains...")
        qa_chains = initialize_chains(llm_pipeline, "stuff")

        if not qa_chains:
            logger.error("Failed to initialize QA chains. Exiting pipeline configuration.")
            return None

        # Step 6: Load metadata for display and options
        logger.info("Loading metadata...")
        metadata_df = pd.read_csv(metadata_path)
        if metadata_df.empty:
            logger.warning("Metadata file is empty. Ensure the metadata CSV is correctly populated.")
        else:
            logger.info(f"Loaded metadata with {len(metadata_df)} rows.")

        # Step 7: Set up the summarization pipeline
        logger.info("Initializing summarization pipeline...")
        summarization_pipeline = hf_pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-6-6",  # Or any suitable summarization model
            device=0 if device == "cuda" else -1  # Use GPU if available
        )

        # Step 8: Set up the sentiment analysis pipeline
        logger.info("Initializing sentiment analysis pipeline...")
        sentiment_pipeline = hf_pipeline(
            "sentiment-analysis",
            model="yiyanghkust/finbert-tone",  # Example sentiment analysis model
            device=0 if device == "cuda" else -1  # Use GPU if available
        )

        # Step 9: Initialize BERTopic for topic modeling
        logger.info("Initializing BERTopic model...")
        topic_model = BERTopic(language="english", verbose=True)
        logger.info("BERTopic model initialized successfully.")

        # Return all components in a dictionary
        logger.info("RAG pipeline configured successfully.")
        return {
            "retriever": retriever,
            "vector_store": vector_store,
            "llm_pipeline": llm_pipeline,
            "qa_chains": qa_chains,
            "metadata_df": metadata_df,
            "documents": enriched_documents,
            "summarization_pipeline": summarization_pipeline,
            "sentiment_pipeline": sentiment_pipeline,
            "topic_model": topic_model,
        }

    except Exception as e:
        # Top-level boundary: log with traceback and signal failure to the caller with None.
        logger.exception(f"An unexpected error occurred during pipeline configuration: {e}")
        return None


This pipeline can be enhanced by integrating the topics and sentiments extracted by the topic-modelling and sentiment-analysis pipelines. That data can be added to the context to produce better and more accurate results.

### RAG Pipeline and Other Helper Functions

Purpose: General-purpose functions to support RAG workflows, such as extracting responses, displaying text, and dynamic user interactions.
**Functions:**
- select_model
- extract_answer
- initialize_chains
- display_wrapped_output
- display_title
- display_query_menu
- get_filters

In [None]:
def select_model(model_name="microsoft/Phi-3-mini-4k-instruct"):
    """
    Allows the user to select a language model dynamically.

    The supported models are printed as a numbered menu and the user is prompted
    until a valid number is entered. An empty answer selects the default.

    Args:
        model_name (str): Default model name to use if the user skips selection.

    Returns:
        str: Name of the selected model, or ``model_name`` when the user presses
        Enter or when no input can be read.
    """

    # Define supported models
    supported_models = {
        "1": "microsoft/Phi-3-mini-4k-instruct",
        "2": "microsoft/Phi-3.5-MoE-instruct",
        "3": "gemini-1.5-flash-8b",
        "4": "openai/gpt-4"
    }

    # Display available models. The loop variable must not be called `model_name`:
    # that would overwrite the default passed in by the caller.
    print("\nAvailable Language Models:")
    for key, supported_name in supported_models.items():
        print(f"{key}. {supported_name}")

    # Prompt user for selection
    while True:
        try:
            choice = input(
                f"\nSelect a model (1-{len(supported_models)}) or press Enter for default [{model_name}]: "
            ).strip()
        except Exception as e:
            # input() failing (e.g. EOF, no interactive stdin) will not fix itself on
            # retry, so fall back to the default instead of looping forever.
            logger.error(f"An error occurred during model selection: {e}")
            print("[ERROR] Could not read a selection. Falling back to the default model.")
            choice = ""

        # Use default model if input is empty
        if not choice:
            logger.info(f"Default model selected: {model_name}")
            print(f"[INFO] Using default model: {model_name}")
            return model_name

        # Validate and return the selected model
        if choice in supported_models:
            selected_model = supported_models[choice]
            logger.info(f"User selected model: {selected_model}")
            print(f"[INFO] Selected model: {selected_model}")
            return selected_model

        # Handle invalid input and prompt again
        logger.warning(f"Invalid choice entered: {choice}")
        print("[ERROR] Invalid choice. Please select a valid option.")

In [None]:
def extract_answer(response: dict, tag: str = "<|assistant|>") -> str:
    """
    Pulls the assistant's answer out of a QA-chain response.

    Args:
        response (dict | str): Either the raw response text, or a dictionary whose
            "output_text" entry holds it.
        tag (str): Marker that precedes the assistant's reply inside the text.

    Returns:
        str: The stripped text following the last occurrence of ``tag``; the whole
        stripped text when the tag is absent; or a fixed error message if the
        response cannot be processed.
    """
    try:
        if isinstance(response, dict):
            raw_text = response.get("output_text", "No response generated.")
        elif isinstance(response, str):
            raw_text = response
        else:
            raise ValueError("Response must be a dictionary or a string.")

        # Keep only what follows the final tag; without a tag the full text is the answer.
        answer_part = raw_text.split(tag)[-1] if tag in raw_text else raw_text
        return answer_part.strip()
    except Exception as e:
        logger.error(f"Error extracting answer: {e}")
        return "Error extracting the answer."


## Prompt Template

The `initialize_chains` function defines a dictionary of prompt templates, one per menu option, and builds a QA chain from each. The menu handlers use these chains to carry out the selected option.

In [None]:
def initialize_chains(llm_pipeline, chain_type="stuff"):
    """
    Initializes QA chains using predefined prompt templates for different menu options.

    One chain is created per template key (e.g. "generic_question",
    "year_comparison", "trend_analysis"). Before the chains are built, each
    template is checked for mismatches between its declared input variables and
    the placeholders in its text; mismatches are only logged, they do not stop
    chain creation.

    Args:
        llm_pipeline: A Hugging Face pipeline wrapped for LangChain.
        chain_type (str): The chain type to use for initializing the QA chains.
            Must be one of "stuff", "map_reduce" or "refine".

    Returns:
        dict: Dictionary of QA chains, keyed by chain purpose.

    Raises:
        RuntimeError: If the chain initialization fails, including when
            ``chain_type`` is not one of the valid options.
    """
    try:
        # Wrap the LLM pipeline for LangChain
        llm = HuggingFacePipeline(pipeline=llm_pipeline)

        # Define prompt templates
        # NOTE: the template bodies are runtime strings; their indentation and blank
        # lines are part of the prompt text sent to the model.
        prompt_templates = {
            "generic_question": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Your expertise lies in analyzing financial institutions with a focus on European banks.
                You have 20+ years of experience in financial modeling and due diligence.
                Only use the provided context and metadata to answer the question. Avoid any assumptions or unsupported claims.
                If information is missing or incomplete, state it clearly and suggest what additional data would help.

                Metadata:
                {metadata}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["metadata", "context", "question"]
            ),
            "compare_two_quarters_same_year": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Your goal is to identify key highlights, challenges, and strategic shifts between two quarters in same year.
                Use only the given context and metadata. If data is unavailable, state this explicitly.


                Based on the provided data:
                1. Summarize the key highlights of the company’s performance.
                2. Identify challenges or risks mentioned.
                3. Provide insights into future strategies or guidance.

                Provide the output as:
                1. Key Highlights:
                  - [Performance overview]
                  - [Growth areas]
                2. Challenges/Risks:
                  - [Key risks]
                3. Future Strategies:
                  - [Summary of plans or initiatives]

                Metadata:
                - Year: {year}
                - Quarter 1: {quarter1}
                - Quarter 2: {quarter2}
                - Bank: {bank}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["context", "question", "year", "quarter1", "quarter2", "bank"]
            ),
            "compare_two_quarters_diff_years": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Your goal is to identify key highlights, challenges, and strategic shifts across two quarters in different years.
                Use only the given context and metadata. If data is unavailable, state this explicitly.


                Based on the provided data:
                1. Summarize the key highlights of the company’s performance.
                2. Identify challenges or risks mentioned.
                3. Provide insights into future strategies or guidance.

                Provide the output as:
                1. Key Highlights:
                  - [Performance overview]
                  - [Growth areas]
                2. Challenges/Risks:
                  - [Key risks]
                3. Future Strategies:
                  - [Summary of plans or initiatives]

                Metadata:
                - Quarter 1: {quarter1} ({year1})
                - Quarter 2: {quarter2} ({year2})
                - Bank: {bank}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["context", "question", "year1", "year2", "quarter1", "quarter2", "bank"]
            ),
            "year_comparison": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Your goal is to identify key highlights, challenges, and strategic shifts across years.
                Use only the given context and metadata. If data is unavailable, state this explicitly.


                Based on the provided data:
                1. Summarize the key highlights of the company’s performance.
                2. Identify challenges or risks mentioned.
                3. Provide insights into future strategies or guidance.

                Provide the output as:
                1. Key Highlights:
                  - [Performance overview]
                  - [Growth areas]
                2. Challenges/Risks:
                  - [Key risks]
                3. Future Strategies:
                  - [Summary of plans or initiatives]

                Metadata:
                - Year 1: {year1}
                - Year 2: {year2}
                - Bank: {bank}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["context", "question", "year1", "year2", "bank"]
            ),
            "all_quarters_same_year": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Your goal is to identify key highlights, challenges, and strategic shifts across all quarters in a year.
                Use only the given context and metadata. If data is unavailable, state this explicitly.


                Based on the provided data:
                1. Summarize the key highlights of the company’s performance.
                2. Identify challenges or risks mentioned.
                3. Provide insights into future strategies or guidance.

                Provide the output as:
                1. Key Highlights:
                  - [Performance overview]
                  - [Growth areas]
                2. Challenges/Risks:
                  - [Key risks]
                3. Future Strategies:
                  - [Summary of plans or initiatives]

                Metadata:
                - Year: {year}
                - Bank: {bank}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["context", "question", "year", "bank"]
            ),
            "single_quarter_sentiment": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Determine and analyze sentiment for a single quarter.
                Provide insights into the sentiment, including whether it is positive, negative, or neutral, and why.

                Metadata:
                - Year: {year}
                - Quarter: {quarter}
                - Bank: {bank}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["context", "question", "year", "quarter", "bank"]
            ),
            "year_sentiment_trends": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Determine and analyze sentiment changes over a year.
                Provide insights into the sentiment, including whether it is positive, negative, or neutral, and why.
                Identify shifts in sentiment across quarters and provide possible reasons.

                Metadata:
                - Year: {year}
                - Bank: {bank}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["context", "question", "year", "bank"]
            ),
            "summarize_single_quarter": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Summarize the main points of a single quarter's earnings call data.
                Include highlights and key performance indicators.

                Metadata:
                - Year: {year}
                - Quarter: {quarter}
                - Bank: {bank}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["context", "question", "year", "quarter", "bank"]
            ),
            "aggregated_summaries": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Summarize the financial performance and sentiment.
                Include key trends, highlights, and areas of concern.

                Metadata:
                {metadata}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["metadata", "context", "question"]
            ),
            "trend_analysis": PromptTemplate(
                template="""<|system|>
                You are a senior investment analyst at a major hedge fund.
                Analyze trends in financial metrics over time.
                Highlight significant changes and provide actionable insights.

                Metadata:
                {metadata}

                Context:
                {context}

                Question:
                {question}

                <|assistant|>""",
                input_variables=["metadata", "context", "question"]
            ),
        }

        # Validate templates
        # This is a sanity check only: problems are logged and chain creation continues.
        for key, template in prompt_templates.items():
            # Declared input variables whose "{name}" placeholder never appears in the text.
            missing_vars = [var for var in template.input_variables if f"{{{var}}}" not in template.template]
            # Placeholders found in the text (whatever sits between "{" and the next "}")
            # that are not declared in input_variables; the log below calls these "unused".
            unused_vars = [
                var.split("}")[0]
                for var in template.template.split("{")
                if "}" in var and var.split("}")[0] not in template.input_variables
            ]

            if missing_vars:
                logger.warning(f"Template '{key}' is missing variables: {missing_vars}")
            if unused_vars:
                logger.warning(f"Template '{key}' contains unused variables: {unused_vars}")
            if not missing_vars and not unused_vars:
                logger.info(f"Template '{key}' validated successfully.")

        # Validate chain_type
        # Runs after the LLM has been wrapped and the templates checked, so an invalid
        # chain_type is only rejected at this point.
        valid_chain_types = ["stuff", "map_reduce", "refine"]
        if chain_type not in valid_chain_types:
            raise ValueError(f"Invalid chain_type '{chain_type}'. Valid options are: {valid_chain_types}")

        # Create chains dynamically
        # NOTE(review): the same `prompt=` keyword is passed for every chain type;
        # confirm that load_qa_chain accepts it for "map_reduce" and "refine".
        chains = {
            key: load_qa_chain(llm, chain_type=chain_type, prompt=prompt)
            for key, prompt in prompt_templates.items()
        }

        logger.info("QA chains initialized successfully.")
        return chains

    except ValueError as ve:
        # Covers the invalid chain_type error raised above (and any other ValueError
        # from the calls in the try block); surfaced to callers as RuntimeError.
        logger.error(f"Validation error: {ve}")
        raise RuntimeError(f"Failed to initialize QA chains: {ve}")
    except Exception as e:
        # Any other failure is logged with its traceback and re-raised as RuntimeError.
        logger.exception("An unexpected error occurred while initializing QA chains.")
        raise RuntimeError(f"Failed to initialize QA chains: {e}")


In [None]:
def get_filters():
    """
    Interactively collects optional document filters from the user.

    The user is first asked whether to filter by bank (only the answer "yes",
    case-insensitive, enables it), then for a year and a quarter. Blank answers
    for year or quarter leave that filter out.

    Returns:
        dict: Contains any of the keys "bank" (lower-cased), "year" and "quarter"
        that the user supplied; empty if none were given.
    """
    selected = {}

    wants_bank = input("Do you want to filter by a specific bank? (yes/no): ").strip().lower() == "yes"
    if wants_bank:
        selected["bank"] = input("Enter the bank name: ").strip().lower()

    # Year and quarter follow the same pattern: ask, strip, keep only non-empty answers.
    optional_prompts = (
        ("year", "Enter the year to filter (or press Enter to skip): "),
        ("quarter", "Enter the quarter to filter (e.g., Q1, or press Enter to skip): "),
    )
    for field, prompt_text in optional_prompts:
        answer = input(prompt_text).strip()
        if answer:
            selected[field] = answer

    return selected


## Context Preparation

Purpose: Functions to prepare text and metadata context dynamically based on token limits and requirements for answering queries.

**Functions:**
- prepare_context_with_dynamic_chunking
- prepare_context_with_metadata
- prepare_context_with_limit
- filter_documents

In [None]:
def prepare_context_with_dynamic_chunking(
    documents: List[Document],
    summarization_pipeline: pipeline,
    tokenizer: AutoTokenizer,
    model_token_limit: int,
    prompt_tokens: int,
    max_new_tokens: int,
    bank: str
) -> str:
    """
    Builds a context string from documents while staying inside a token budget.

    Each document is added verbatim if it still fits in the remaining budget.
    Otherwise it is summarized, and the summary is added if that fits; if not,
    the document is skipped. A document that raises during processing is logged
    and skipped.

    Args:
        documents (List[Document]): List of relevant documents.
        summarization_pipeline (pipeline): Summarization model pipeline.
        tokenizer (AutoTokenizer): Tokenizer for token estimation.
        model_token_limit (int): Maximum token limit for the model.
        prompt_tokens (int): Number of tokens reserved for the prompt.
        max_new_tokens (int): Number of tokens reserved for the model's response.
        bank (str): Bank name. Accepted by the signature but not used by the
            current implementation.

    Returns:
        str: The prepared context string (newline-separated pieces, stripped).
    """
    # Whatever is left after reserving room for the prompt and the generated answer.
    remaining_budget = model_token_limit - prompt_tokens - max_new_tokens
    logger.info(f"Context token budget: {remaining_budget}")

    pieces = []
    for document in documents:
        try:
            text = document.page_content
            n_tokens = len(tokenizer.encode(text, truncation=False))

            # Fits as-is: take the full text and move on.
            if n_tokens <= remaining_budget:
                pieces.append(text + "\n")
                remaining_budget -= n_tokens
                continue

            # Too large: try a summary instead.
            logger.info(f"Summarizing document with {n_tokens} tokens...")
            result = summarization_pipeline(
                text, max_length=200, min_length=50, do_sample=False
            )
            condensed = result[0]["summary_text"]
            condensed_len = len(tokenizer.encode(condensed, truncation=False))

            if condensed_len > remaining_budget:
                logger.warning("Summarized content still exceeds token budget. Skipping this document.")
                continue

            pieces.append(condensed + "\n")
            remaining_budget -= condensed_len
        except Exception as e:
            logger.error(f"Error processing document for context: {e}")
            continue

    return "".join(pieces).strip()

def prepare_context_with_metadata(documents, metadata_df, filters):
    """
    Builds a metadata context string from the metadata rows matching the filters.

    Matching is case-insensitive and done on string representations, so numeric
    columns (e.g. a ``Year`` column parsed as integers) are supported. Filters
    that are absent, None or empty are not applied.

    Args:
        documents: Not used by this function; kept for interface compatibility.
        metadata_df (pd.DataFrame): Metadata with the columns "Name",
            "Designation", "Bank", "Quarter" and "Year".
        filters (dict): Optional "bank", "year" and "quarter" values to match.

    Returns:
        str: One line per matching row in the form
        "<Name> (<Designation>), <Bank> <Quarter> <Year>", joined by newlines;
        an empty string when nothing matches.
    """
    # Start from "keep everything" and narrow down with each filter that is present.
    mask = pd.Series(True, index=metadata_df.index, dtype=bool)
    for filter_key, column in (("bank", "Bank"), ("year", "Year"), ("quarter", "Quarter")):
        wanted = filters.get(filter_key)
        if wanted is None or str(wanted) == "":
            continue
        # astype(str) avoids the `.str` accessor failing on non-string columns.
        column_values = metadata_df[column].astype(str).str.lower()
        mask &= (column_values == str(wanted).lower())

    filtered_metadata = metadata_df[mask]
    return "\n".join(
        f"{name} ({designation}), {bank} {quarter} {year}"
        for name, designation, bank, quarter, year in zip(
            filtered_metadata["Name"],
            filtered_metadata["Designation"],
            filtered_metadata["Bank"],
            filtered_metadata["Quarter"],
            filtered_metadata["Year"],
        )
    )


def prepare_context_with_limit(
    documents: List[Document],
    tokenizer,
    summarization_pipeline=None,
    token_limit: int = 4000,
    prompt_tokens: int = 200,
    max_new_tokens: int = 500
) -> str:
    """
    Assembles LLM context from documents without exceeding the token budget.

    A document is included in full when it fits in the remaining budget. When it
    does not, its summary is included instead, provided a summarization pipeline
    was given and the summary fits; otherwise the document is skipped. Documents
    that raise during processing are logged and skipped.

    Args:
        documents (List[Document]): List of Document objects.
        tokenizer: Tokenizer for token estimation.
        summarization_pipeline: Optional summarization pipeline for condensing long documents.
        token_limit (int): Maximum token limit for the model.
        prompt_tokens (int): Estimated token count for the prompt.
        max_new_tokens (int): Estimated token count for the model's response.

    Returns:
        str: Prepared context string within the token budget.
    """
    # Budget left for context once prompt and response tokens are reserved.
    remaining_budget = token_limit - prompt_tokens - max_new_tokens
    logger.info(f"Context token budget: {remaining_budget} tokens.")

    pieces = []
    for document in documents:
        try:
            text = document.page_content
            n_tokens = len(tokenizer.encode(text, truncation=False))
            logger.info(f"Document token count: {n_tokens}")

            # Fits as-is: include the whole document.
            if n_tokens <= remaining_budget:
                pieces.append(text + "\n")
                remaining_budget -= n_tokens
                continue

            # Too large and nothing to condense it with.
            if not summarization_pipeline:
                logger.warning("No summarization pipeline provided. Skipping document.")
                continue

            logger.info(f"Summarizing document with {n_tokens} tokens...")
            result = summarization_pipeline(
                text, max_length=200, min_length=50, do_sample=False
            )
            condensed = result[0]["summary_text"]
            condensed_len = len(tokenizer.encode(condensed, truncation=False))

            if condensed_len > remaining_budget:
                logger.warning("Summarized content still exceeds token limit. Skipping document.")
                continue

            pieces.append(condensed + "\n")
            remaining_budget -= condensed_len
        except Exception as e:
            logger.error(f"Error processing document for context: {e}")
            continue

    return "".join(pieces).strip()

def prepare_context_with_topics_and_sentiments(
    documents: List[Document],
    tokenizer,
    summarization_pipeline=None,
    sentiment_pipeline=None,
    lda_model=None,
    dictionary=None,
    bow_corpus=None,
    num_topics=3,
    token_limit: int = 4000,
    prompt_tokens: int = 200,
    max_new_tokens: int = 500
):
    """
    Build an LLM context string from documents, LDA topics and sentiment labels.

    This function is reserved for future use to integrate LDA and sentiment
    analysis. Documents are added in order while they fit in the remaining
    token budget; a document that does not fit is replaced by its summary when
    a summarization pipeline is available, otherwise it is skipped. The topic
    and sentiment sections are appended last, and only when they are non-empty
    and still fit.

    Args:
        documents (List[Document]): List of Document objects.
        tokenizer: Tokenizer used for token estimation (its `encode` method is called).
        summarization_pipeline: Optional summarization pipeline for condensing long documents.
        sentiment_pipeline: Optional Hugging Face sentiment-analysis pipeline.
        lda_model (LdaModel): Trained LDA model for topic extraction.
        dictionary (Dictionary): Gensim dictionary for the corpus.
        bow_corpus (list): Bag-of-words representation of the corpus.
        num_topics (int): Number of topics to extract for each document.
        token_limit (int): Maximum token limit for the model.
        prompt_tokens (int): Estimated token count for the prompt.
        max_new_tokens (int): Estimated token count for the model's response.

    Returns:
        str: Prepared context string within the token budget (may be empty).
    """
    context_token_limit = token_limit - prompt_tokens - max_new_tokens
    logger.info(f"Context token budget: {context_token_limit} tokens.")
    if context_token_limit <= 0:
        # Nothing can fit; keep going so the caller still gets a (possibly empty) string
        logger.warning("Context token budget is not positive; no content can be added.")

    context = ""
    topic_context = ""
    sentiment_context = ""

    # Step 1: Extract topics using LDA (all three LDA inputs are required)
    if lda_model and dictionary and bow_corpus:
        logger.info("Extracting topics from LDA model...")
        topics_for_documents = extract_topics_from_lda(lda_model, bow_corpus, dictionary, num_topics)
        topic_context = generate_topic_context(topics_for_documents)

    # Step 2: Perform sentiment analysis, one labelled line per document
    if sentiment_pipeline:
        logger.info("Analyzing sentiments...")
        sentiments = []
        for doc in documents:
            try:
                sentiment_result = sentiment_pipeline(doc.page_content)
                sentiment_label = sentiment_result[0]['label']
                sentiment_score = sentiment_result[0]['score']
                sentiments.append(f"{doc.metadata.get('source', 'Unknown')}: {sentiment_label} (Confidence: {sentiment_score:.2f})")
            except Exception as e:
                # A failure on one document must not discard the others
                logger.error(f"Error during sentiment analysis for document: {e}")
        sentiment_context = "\n".join(sentiments)

    # Step 3: Add document content or summaries
    for doc in documents:
        try:
            # Tokenize document content
            doc_tokens = len(tokenizer.encode(doc.page_content, truncation=False))
            logger.info(f"Document token count: {doc_tokens}")

            # Add document content if within token budget
            if doc_tokens <= context_token_limit:
                context += doc.page_content + "\n"
                context_token_limit -= doc_tokens
            elif summarization_pipeline:
                # Summarize if content exceeds token budget
                logger.info(f"Summarizing document with {doc_tokens} tokens...")
                summary = summarization_pipeline(
                    doc.page_content, max_length=200, min_length=50, do_sample=False
                )
                summary_text = summary[0]["summary_text"]
                summary_tokens = len(tokenizer.encode(summary_text, truncation=False))

                if summary_tokens <= context_token_limit:
                    context += summary_text + "\n"
                    context_token_limit -= summary_tokens
                else:
                    logger.warning("Summarized content still exceeds token limit. Skipping document.")
            else:
                logger.warning("No summarization pipeline provided. Skipping document.")
        except Exception as e:
            logger.error(f"Error processing document for context: {e}")
            continue

    # Step 4: Append topic and sentiment context if space allows.
    # Empty sections are skipped: tokenizing "" can still yield special tokens,
    # which would spend budget and could log a misleading "exceeds" warning.
    if topic_context:
        topic_tokens = len(tokenizer.encode(topic_context, truncation=False))
        if topic_tokens <= context_token_limit:
            context += "\n" + topic_context
            context_token_limit -= topic_tokens
        else:
            logger.warning("Topic context exceeds token limit. Skipping topic context.")

    if sentiment_context:
        sentiment_tokens = len(tokenizer.encode(sentiment_context, truncation=False))
        if sentiment_tokens <= context_token_limit:
            context += "\n" + sentiment_context
        else:
            logger.warning("Sentiment context exceeds token limit. Skipping sentiment context.")

    return context.strip()


### Filter function

This function filters the documents by their metadata (bank, year, quarter and, optionally, designation) so that only matching documents are used to prepare the context for the query.

In [None]:

def filter_documents(documents: List[Document], bank: str = None, year: str = None, quarter: str = None, designation: str = None) -> List[Document]:
    """
    Filters documents by bank, year, quarter, and optionally designation, ensuring metadata keys and values are normalized.

    Criteria and metadata values are compared after conversion to a stripped,
    lower-cased string, so non-string criteria (e.g. an integer year) are
    accepted. A criterion that is falsy (None, "") places no constraint.

    Args:
        documents (List[Document]): List of Document objects with metadata.
        bank (str, optional): Bank name to filter by.
        year (str, optional): Year to filter by.
        quarter (str, optional): Quarter to filter by (e.g., Q1, Q2).
        designation (str, optional): Designation to filter by (e.g., CFO). Default is None.

    Returns:
        List[Document]: Filtered list of documents, in their original order.

    Raises:
        Exception: Any error raised while filtering is logged and re-raised.
    """
    def _normalize(value) -> str:
        # One normalization for both criteria and metadata keeps comparisons symmetric
        return str(value).strip().lower()

    try:
        logger.info(f"Starting filtering with criteria bank: {bank}, year: {year}, quarter: {quarter}, designation: {designation}")

        # Normalize each criterion once instead of once per document;
        # None marks "no constraint on this field"
        wanted_bank = _normalize(bank) if bank else None
        wanted_year = _normalize(year) if year else None
        wanted_quarter = _normalize(quarter) if quarter else None
        wanted_designation = _normalize(designation) if designation else None

        filtered_docs = []
        for doc in documents:
            # Normalize metadata keys and values (keys may not be strings)
            metadata = {str(key).lower(): _normalize(value) for key, value in doc.metadata.items()}
            logger.debug(f"Document Metadata: {metadata}")

            # Apply strict matching criteria; a missing field compares as ""
            bank_match = wanted_bank is None or metadata.get("bank", "") == wanted_bank
            year_match = wanted_year is None or metadata.get("year", "") == wanted_year
            quarter_match = wanted_quarter is None or metadata.get("quarter", "") == wanted_quarter
            designation_match = wanted_designation is None or metadata.get("designation", "") == wanted_designation

            # Log the match results for debugging
            logger.debug(f"Bank Match: {bank_match}, Year Match: {year_match}, Quarter Match: {quarter_match}, Designation Match: {designation_match}")

            if bank_match and year_match and quarter_match and designation_match:
                filtered_docs.append(doc)

        logger.info(f"Filtered {len(filtered_docs)} documents for Bank: {bank}, Year: {year}, Quarter: {quarter}, Designation: {designation}")
        return filtered_docs

    except Exception as e:
        logger.error(f"Error filtering documents: {e}")
        raise

## RAG Query Handling

Purpose: These functions handle user queries by retrieving relevant documents, preparing context, and invoking the appropriate QA chain.

**Functions:**

- handle_generic_query
- handle_compare_quarters_same_year
- handle_compare_quarters_diff_years
- handle_year_comparison
- handle_sentiment_single_quarter
- handle_summarize_single_quarter

In [None]:
def handle_generic_query(pipeline, tokenizer, summarization_pipeline, model_token_limit, filters, query):
    """
    Handles generic queries with optional filters and retrieves relevant documents.

    Args:
        pipeline (dict): Must provide pipeline["retriever"] and
            pipeline["qa_chains"]["generic_question"].
        tokenizer: Tokenizer passed on to prepare_context_with_limit.
        summarization_pipeline: Optional summarization pipeline passed on to
            prepare_context_with_limit.
        model_token_limit (int): Token limit used as the context budget ceiling.
        filters (dict): Optional metadata filters. The keys "bank", "year",
            "quarter" and "designation" are applied when they hold a truthy
            value; None or an empty dict disables filtering.
        query (str): The user's question.

    Returns:
        The value returned by extract_answer for the chain response, or an
        "[ERROR] ..." string when no documents/context are available or an
        exception occurred.
    """
    logger.info("Handling a generic query.")

    try:
        # Step 1: Retrieve, then apply every filter the caller provided
        logger.info("Retrieving relevant documents with filters...")
        relevant_docs = pipeline["retriever"].get_relevant_documents(query)

        filters = filters or {}
        active_filters = {
            field: filters[field]
            for field in ("bank", "year", "quarter", "designation")
            if filters.get(field)
        }
        if active_filters:
            # filter_documents normalizes metadata and criteria before comparing
            relevant_docs = filter_documents(relevant_docs, **active_filters)

        if not relevant_docs:
            logger.warning("No documents found matching the query criteria.")
            return "[ERROR] No documents found for the query."

        logger.info(f"Filtered {len(relevant_docs)} documents for the generic query.")

        # Step 2: Prepare context
        logger.info("Preparing context for the query...")
        context = prepare_context_with_limit(
            documents=relevant_docs,
            tokenizer=tokenizer,
            summarization_pipeline=summarization_pipeline,
            token_limit=model_token_limit,
            prompt_tokens=200,
            max_new_tokens=500,
        )

        if not context.strip():
            logger.error("No relevant context could be prepared within the token limit.")
            return "[ERROR] Unable to prepare context."

        # Step 3: Ensure relevant_docs are in Document format
        input_documents = [
            doc if isinstance(doc, Document) else Document(page_content=doc["page_content"], metadata=doc["metadata"])
            for doc in relevant_docs
        ]

        # Step 4: Invoke the QA chain
        logger.info("Invoking the QA chain...")
        response = pipeline["qa_chains"]["generic_question"].invoke(
            {
                "input_documents": input_documents,
                "metadata": f"Filters: {filters}",
                "context": context,
                "question": query,
            },
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
        )

        # Extract and return the response
        return extract_answer(response)

    except Exception as e:
        logger.exception(f"Error handling the generic query: {e}")
        return "[ERROR] An unexpected error occurred while handling the query."


def handle_compare_quarters_same_year(pipeline, tokenizer, model_token_limit, year, quarter1, quarter2, bank, query):
    """
    Compare two quarters of the same year for one bank through the QA chain.

    Documents are retrieved separately for each quarter, concatenated (first
    quarter first) and handed to the "compare_two_quarters_same_year" chain.
    `tokenizer` and `model_token_limit` are accepted but not used here.

    Returns:
        The value returned by extract_answer for the chain response, or an
        "[ERROR] ..." string when nothing was retrieved or an exception occurred.
    """
    logger.info(f"Comparing {quarter1} and {quarter2} in {year} for {bank}.")
    try:
        bank_key = bank.lower()

        # NOTE(review): whether the retriever honors the `filters` keyword
        # depends on how it was built elsewhere - confirm.
        first_quarter_docs = pipeline["retriever"].get_relevant_documents(
            query, filters={"year": year, "quarter": quarter1, "bank": bank_key}
        )
        second_quarter_docs = pipeline["retriever"].get_relevant_documents(
            query, filters={"year": year, "quarter": quarter2, "bank": bank_key}
        )

        # First quarter's documents come first in the combined input
        input_documents = first_quarter_docs + second_quarter_docs
        if not input_documents:
            logger.warning("No relevant documents found for the comparison.")
            return "[ERROR] No relevant documents found for the comparison."

        # Everything the chain's prompt needs, including a readable metadata line
        chain_inputs = {
            "input_documents": input_documents,
            "metadata": f"Year: {year}, Bank: {bank}, Quarter 1: {quarter1}, Quarter 2: {quarter2}",
            "context": "N/A",
            "question": query,
            "year": year,
            "quarter1": quarter1,
            "quarter2": quarter2,
            "bank": bank,
        }
        qa_chain = pipeline["qa_chains"]["compare_two_quarters_same_year"]
        return extract_answer(qa_chain.invoke(chain_inputs))

    except Exception as e:
        logger.exception(f"Error comparing two quarters in the same year: {e}")
        return "[ERROR] An unexpected error occurred."



def handle_compare_quarters_diff_years(pipeline, tokenizer, model_token_limit, year1, year2, quarter1, quarter2, bank, query):
    """
    Compare a quarter of one year with a quarter of another year for one bank.

    Documents are retrieved for (quarter1, year1) and then (quarter2, year2),
    concatenated in that order and passed to the
    "compare_two_quarters_diff_years" chain. `tokenizer` and
    `model_token_limit` are accepted but not used here.

    Returns:
        The value returned by extract_answer for the chain response, or an
        "[ERROR] ..." string when nothing was retrieved or an exception occurred.
    """
    logger.info(f"Comparing {quarter1} ({year1}) and {quarter2} ({year2}) for {bank}.")

    try:
        bank_key = bank.lower()

        # First period: quarter1 of year1
        logger.info(f"Retrieving documents for {quarter1} {year1}...")
        first_period_docs = pipeline["retriever"].get_relevant_documents(
            query, filters={"year": year1, "quarter": quarter1, "bank": bank_key}
        )

        # Second period: quarter2 of year2
        logger.info(f"Retrieving documents for {quarter2} {year2}...")
        second_period_docs = pipeline["retriever"].get_relevant_documents(
            query, filters={"year": year2, "quarter": quarter2, "bank": bank_key}
        )

        input_documents = first_period_docs + second_period_docs
        if not input_documents:
            logger.warning("No relevant documents found for the comparison.")
            return "[ERROR] No relevant documents found for the comparison."

        # Inputs for the chain, with a human-readable summary of the comparison
        chain_inputs = {
            "input_documents": input_documents,
            "metadata": f"Bank: {bank}, Quarter 1: {quarter1} ({year1}), Quarter 2: {quarter2} ({year2})",
            "context": "N/A",
            "question": query,
            "year1": year1,
            "year2": year2,
            "quarter1": quarter1,
            "quarter2": quarter2,
            "bank": bank,
        }
        qa_chain = pipeline["qa_chains"]["compare_two_quarters_diff_years"]
        return extract_answer(qa_chain.invoke(chain_inputs))

    except Exception as e:
        logger.exception(f"Error comparing two quarters across different years: {e}")
        return "[ERROR] An unexpected error occurred."



def handle_year_comparison(pipeline, tokenizer, model_token_limit, year1, year2, bank, query):
    """
    Run a year-over-year comparison for one bank through the QA chain.

    Documents are retrieved for year1 and then year2, concatenated in that
    order and passed to the "year_comparison" chain. `tokenizer` and
    `model_token_limit` are accepted but not used here.

    Returns:
        The value returned by extract_answer for the chain response, or an
        "[ERROR] ..." string when nothing was retrieved or an exception occurred.
    """
    logger.info(f"Comparing performance for {year1} and {year2} for {bank}.")

    try:
        bank_key = bank.lower()

        # Earlier argument first: year1
        logger.info(f"Retrieving documents for {year1}...")
        first_year_docs = pipeline["retriever"].get_relevant_documents(
            query, filters={"year": year1, "bank": bank_key}
        )

        # Then year2
        logger.info(f"Retrieving documents for {year2}...")
        second_year_docs = pipeline["retriever"].get_relevant_documents(
            query, filters={"year": year2, "bank": bank_key}
        )

        input_documents = first_year_docs + second_year_docs
        if not input_documents:
            logger.warning("No relevant documents found for the comparison.")
            return "[ERROR] No relevant documents found for the comparison."

        # Inputs for the chain, with a human-readable summary of the comparison
        chain_inputs = {
            "input_documents": input_documents,
            "metadata": f"Year 1: {year1}, Year 2: {year2}, Bank: {bank}",
            "context": "N/A",
            "question": query,
            "year1": year1,
            "year2": year2,
            "bank": bank,
        }
        qa_chain = pipeline["qa_chains"]["year_comparison"]
        return extract_answer(qa_chain.invoke(chain_inputs))

    except Exception as e:
        logger.exception(f"Error performing year-over-year comparison: {e}")
        return "[ERROR] An unexpected error occurred."


def handle_sentiment_single_quarter(pipeline, tokenizer, model_token_limit, year, quarter, bank, query):
    """
    Ask the "single_quarter_sentiment" QA chain about one bank's quarter.

    Documents are retrieved for the given year, quarter and lower-cased bank
    and passed to the chain together with the question. `tokenizer` and
    `model_token_limit` are accepted but not used here.

    Returns:
        The value returned by extract_answer for the chain response, or an
        "[ERROR] ..." string when nothing was retrieved or an exception occurred.
    """
    logger.info(f"Analyzing sentiment for {bank} in {quarter} {year}.")

    try:
        quarter_filters = {"year": year, "quarter": quarter, "bank": bank.lower()}
        logger.info(f"Retrieving documents for {quarter} {year}...")
        input_documents = pipeline["retriever"].get_relevant_documents(query, filters=quarter_filters)

        # Without documents there is nothing for the chain to analyze
        if not input_documents:
            logger.warning("No relevant documents found for sentiment analysis.")
            return "[ERROR] No relevant documents found for sentiment analysis."

        chain_inputs = {
            "input_documents": input_documents,
            "metadata": f"Year: {year}, Quarter: {quarter}, Bank: {bank}",
            "context": "N/A",
            "question": query,
            "year": year,
            "quarter": quarter,
            "bank": bank,
        }
        qa_chain = pipeline["qa_chains"]["single_quarter_sentiment"]
        return extract_answer(qa_chain.invoke(chain_inputs))

    except Exception as e:
        logger.exception(f"Error analyzing sentiment for a single quarter: {e}")
        return "[ERROR] An unexpected error occurred."


def handle_summarize_single_quarter(pipeline, tokenizer, model_token_limit, year, quarter, bank, query):
    """
    Ask the "summarize_single_quarter" QA chain to summarize one bank's quarter.

    Documents are retrieved for the given year, quarter and lower-cased bank
    and passed to the chain together with the question. `tokenizer` and
    `model_token_limit` are accepted but not used here.

    Returns:
        The value returned by extract_answer for the chain response, or an
        "[ERROR] ..." string when nothing was retrieved or an exception occurred.
    """
    logger.info(f"Summarizing performance for {bank} in {quarter} {year}.")

    try:
        quarter_filters = {"year": year, "quarter": quarter, "bank": bank.lower()}
        logger.info(f"Retrieving documents for {quarter} {year}...")
        input_documents = pipeline["retriever"].get_relevant_documents(query, filters=quarter_filters)

        # Without documents there is nothing for the chain to summarize
        if not input_documents:
            logger.warning("No relevant documents found for summarization.")
            return "[ERROR] No relevant documents found for summarization."

        chain_inputs = {
            "input_documents": input_documents,
            "metadata": f"Year: {year}, Quarter: {quarter}, Bank: {bank}",
            "context": "N/A",
            "question": query,
            "year": year,
            "quarter": quarter,
            "bank": bank,
        }
        qa_chain = pipeline["qa_chains"]["summarize_single_quarter"]
        return extract_answer(qa_chain.invoke(chain_inputs))

    except Exception as e:
        logger.exception(f"Error summarizing the single quarter: {e}")
        return "[ERROR] An unexpected error occurred while summarizing the quarter."


# ==============================Topic Modeling=======================================

def handle_bertopic(documents, nlp_model=None):
    """
    Run BERTopic over earnings call transcripts and report the topics found.

    The non-empty document texts are cleaned with preprocess_corpus_bertopic,
    a BERTopic model with a bigram/trigram CountVectorizer is fitted, the first
    six rows of the topic table are printed, a bar chart is attempted, and the
    model is saved as "bertopic_model_earnings_calls". Nothing is returned;
    problems are logged and printed.

    Args:
        documents (List[Document]): A list of filtered documents to perform topic modeling on.
        nlp_model (spacy.Language): Spacy NER model passed to the preprocessing step.
    """
    logger.info("Starting BERTopic modeling for earnings call transcripts...")

    try:
        # Keep only documents that actually carry text
        texts = [document.page_content for document in documents if document.page_content]
        if not texts:
            logger.warning("No valid content found in documents for BERTopic.")
            print("[ERROR] No valid content to analyze with BERTopic.")
            return

        # Terms that appear in nearly every transcript and would dominate topics
        logger.info("Preprocessing the corpus for earnings calls...")
        boilerplate_terms = [
            "credit suisse", "q1", "q2", "q3", "q4",
            "first quarter", "second quarter", "third quarter", "fourth quarter",
            "earnings call", "company", "presentation", "analyst", "operator", "seeking_alpha"
        ]
        cleaned_corpus = preprocess_corpus_bertopic(
            texts,
            nlp_model,
            custom_stopwords=boilerplate_terms,
            redundant_terms=boilerplate_terms,
        )

        # Topic representations are built from bigrams and trigrams only
        ngram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words="english")
        topic_model = BERTopic(vectorizer_model=ngram_vectorizer)

        logger.info("Fitting BERTopic model to the cleaned corpus...")
        _topics, _probs = topic_model.fit_transform(cleaned_corpus)

        # Bail out when the fitted model reports no topics at all
        topic_info = topic_model.get_topic_info()
        nothing_found = topic_info.empty or topic_info["Count"].sum() == 0
        if nothing_found:
            logger.warning("No meaningful topics were identified by BERTopic.")
            print("[ERROR] BERTopic did not identify any meaningful topics. Check the input data.")
            return

        # Diagnostics: word counts per cleaned document and the topic table
        logger.info("Diagnostics:")
        word_counts = [len(text.split()) for text in cleaned_corpus]
        mean_length = sum(word_counts) / len(word_counts)
        logger.info(f"Average document length: {mean_length:.2f}")
        logger.info(f"Total number of documents: {len(cleaned_corpus)}")
        logger.info(f"Topic Info: \n{topic_info.head(6)}")

        # Show the first six rows to keep the output readable
        logger.info("Displaying top topics for earnings calls...")
        print("\nTop Topics Identified by BERTopic:")
        print(topic_info.head(6))

        # The chart is optional; a failure here must not abort the run
        try:
            topic_model.visualize_barchart(top_n_topics=6).show()
        except Exception as e:
            logger.warning(f"Visualization failed: {e}")
            print("[WARNING] Unable to generate visualization. Skipping...")

        # Persist the fitted model for later use
        topic_model.save("bertopic_model_earnings_calls")
        logger.info("BERTopic model saved successfully.")

        print("[INFO] BERTopic modeling for earnings calls completed successfully.")
    except Exception as e:
        logger.exception(f"Error during BERTopic modeling: {e}")
        print(f"[ERROR] An error occurred during BERTopic modeling: {e}")


def handle_gensim_lda(documents, nlp_model=None, num_topics=3, extra_redundant_terms=None):
    """
    Perform LDA topic modeling on earnings call transcripts using Gensim, focusing on bigrams and trigrams.

    The non-empty document texts are preprocessed with preprocess_corpus_gensim,
    turned into a dictionary and bag-of-words corpus, and used to train an
    LdaModel. Topics are printed, and a pyLDAvis visualization is displayed
    inline in Colab/Jupyter or saved to "gensim_lda_topics.html" otherwise.

    Args:
        documents (List[Document]): A list of filtered documents with a `page_content` attribute.
        nlp_model (spacy.Language): Spacy NER model passed to the preprocessing step.
        num_topics (int): Number of topics to extract.
        extra_redundant_terms (list): Additional terms to remove during preprocessing.

    Returns:
        LdaModel, Dictionary, List: Trained LDA model, dictionary, and bow_corpus.
        None is returned instead when there is no content to analyze or when
        modeling fails, so callers should check before unpacking.
    """
    logger.info("Starting Gensim LDA modeling for earnings call transcripts...")

    try:
        # Combine all document content into a single corpus
        corpus = [doc.page_content for doc in documents if doc.page_content]

        if not corpus:
            logger.warning("No valid content found in documents for Gensim LDA.")
            print("[ERROR] No valid content to analyze with Gensim LDA.")
            return None

        # Preprocess the corpus for earnings call transcripts
        logger.info("Preprocessing the corpus for earnings calls...")
        redundant_terms = [
            "credit suisse", "q1", "q2", "q3", "q4",
            "first quarter", "second quarter", "third quarter", "fourth quarter",
            "earnings call", "company", "presentation", "analyst", "operator", "thomas", "gottstein", "transcript_seeking alpha", "question", "next slide", "thank you",
            "take question", "transcript seeking alpha", "group ag cs results", "morning", "Transcript _ Seeking Alpha", "Group AG", "Results"
        ]

        # Add any extra redundant terms (the caller's list is left untouched)
        if extra_redundant_terms:
            redundant_terms.extend(extra_redundant_terms)

        processed_corpus = preprocess_corpus_gensim(
            corpus,
            nlp_model,
            custom_stopwords=redundant_terms,
            redundant_terms=redundant_terms,
        )

        # Create a dictionary and a bag-of-words representation of the corpus
        logger.info("Creating dictionary and bag-of-words representation...")
        dictionary = Dictionary(processed_corpus)
        bow_corpus = [dictionary.doc2bow(doc) for doc in processed_corpus]

        # Log dictionary size
        logger.info(f"Vocabulary size after preprocessing: {len(dictionary)}")

        # Train LDA model (fixed random_state keeps runs reproducible)
        logger.info(f"Training Gensim LDA model with {num_topics} topics...")
        lda_model = LdaModel(
            corpus=bow_corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=42,
            passes=10,
            iterations=50,
        )

        # Display topics, numbered from 1 for readability
        logger.info("Displaying top topics for earnings calls...")
        print("\nTop Topics Identified by Gensim LDA:")
        for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
            print(f"Topic #{idx + 1}: {topic}")

        # Visualize topics inline when a notebook front end is available
        try:
            logger.info("Preparing visualization for LDA topics...")
            vis_data = gensimvis.prepare(lda_model, bow_corpus, dictionary)

            # Without IPython there is no notebook; fall through to the HTML export
            try:
                from IPython import get_ipython
                from IPython.display import display
                shell = get_ipython()
            except ImportError:
                shell = None

            shell_type = type(shell)
            in_colab = "google.colab" in str(shell)
            # Jupyter kernels use ipykernel's ZMQInteractiveShell, whose module
            # path does not contain "IPython.core.interactiveshell"; the older
            # string check is kept for backward compatibility.
            in_jupyter = (
                shell_type.__name__ == "ZMQInteractiveShell"
                or "IPython.core.interactiveshell" in str(shell_type)
            )

            if in_colab or in_jupyter:
                pyLDAvis.enable_notebook()
                display(vis_data)
            else:
                logger.warning("Visualization not supported in this environment. Saving as HTML.")
                pyLDAvis.save_html(vis_data, "gensim_lda_topics.html")

            logger.info("LDA visualization successfully displayed or saved.")
        except ImportError as e:
            logger.warning(f"pyLDAvis not installed or visualization failed: {e}")
            print("[WARNING] Unable to generate visualization. Skipping...")
        except Exception as e:
            # Visualization is optional; the trained model is still returned
            logger.exception(f"Unexpected error during visualization: {e}")
            print("[ERROR] Unable to display visualization.")

        print("[INFO] Gensim LDA modeling for earnings calls completed successfully.")
        return lda_model, dictionary, bow_corpus

    except Exception as e:
        logger.exception(f"Error during Gensim LDA modeling: {e}")
        print(f"[ERROR] An error occurred during Gensim LDA modeling: {e}")
        return None



## Display Functions

In [None]:
import textwrap

def display_wrapped_output(output_text, width=50):
    """
    Print the assistant's response under a result header, wrapped to `width`.

    Args:
        output_text (str): The text to display.
        width (int): The maximum width of each printed line.
    """
    print("[RESULT] Response from the AI Assistant:")
    # textwrap.fill wraps the text into a single newline-joined paragraph
    print(textwrap.fill(output_text, width=width))

def display_title():
    """Print the application banner: two blank lines, then a framed two-line title."""
    rule = "=" * 70
    banner_lines = (
        "\n\n" + rule,
        "   Welcome to the Earning Call Transcript-Based Risk Analyzer",
        "             An Early Warning System for Investors",
        rule,
    )
    for line in banner_lines:
        print(line)


def display_query_menu():
    """
    Print the query selection menu and read the user's choice from input.

    Returns:
        int: The selected option (1-10), or None if the entry is not one of
        the listed option numbers.
    """
    # Option labels in display order; option numbers start at 1
    labels = (
        "Ask a generic question",
        "Compare two quarters of the same year",
        "Compare two quarters of different years",
        "Year-over-year comparison",
        "Analyze all quarters of a year",
        "Analyze sentiment for a single quarter",
        "Summarize a single quarter",
        "Perform Topic Modeling with BERTopic",
        "Perform Topic Modeling with Gensim LDA",
        "Exit",
    )
    menu_options = {str(number): label for number, label in enumerate(labels, start=1)}

    # Render the menu between horizontal rules
    rule = "=" * 50
    print("\n" + rule)
    print("   Query Selection Menu")
    print(rule)
    for number, label in menu_options.items():
        print(f"{number}. {label}")
    print(rule)

    # Read the selection; surrounding whitespace is ignored
    selection = input("Select an option (1-10): ").strip()

    # Only the exact option numbers shown above are accepted
    if selection not in menu_options:
        print("[ERROR] Invalid choice. Please enter a number between 1 and 10.")
        return None
    return int(selection)

def remove_repetitions(text):
    """
    Remove repeated sentences from `text`, keeping the first occurrence of each.

    Sentences are the pieces obtained by splitting on ". ". The last piece
    still carries the text's closing period, so one trailing period is ignored
    when comparing; otherwise a repeated final sentence would never be
    recognized as a duplicate. If the text ended with a period and the final
    sentence was dropped, the closing period is restored.

    Args:
        text (str): The text to de-duplicate. Empty or None input is returned as is.

    Returns:
        str: The text with repeated sentences removed, original order preserved.
    """
    if not text:
        return text

    seen = set()
    unique_sentences = []
    for sentence in text.split('. '):
        # Compare without a single trailing period so "A" and "A." are the same sentence
        key = sentence[:-1] if sentence.endswith('.') else sentence
        if key not in seen:
            seen.add(key)
            unique_sentences.append(sentence)

    result = '. '.join(unique_sentences)
    # Dropping a duplicated final sentence would otherwise lose the closing period
    if text.endswith('.') and not result.endswith('.'):
        result += '.'
    return result

## 'main' function

**Flow of main function**

1. **Initialize Logging**
   - Start the logger to track the application's workflow.

2. **Initialize Pipeline Configuration**
   - **Set File Paths**: Specify paths for document folder and metadata CSV.
   - **Select Model**: Prompt user to select a language model.
   - **Set Token Limits**: Retrieve the token limit for the selected model.
   - **Configure Pipeline**: Call `configure_rag_pipeline` to set up the RAG pipeline with:
     - Document preprocessing
     - Embedding initialization
     - Vector store setup
     - Summarization and sentiment analysis pipelines
     - LDA topic modeling
   - **Setup Retriever**: Add a retriever to the pipeline for retrieving relevant documents.
   - **Load Tokenizer**: Initialize tokenizer for the selected model.

3. **Define Helper Functions**
   - **`get_filters`**: Collect filter criteria for queries (e.g., bank, year, quarter).

4. **Interactive Query Interface**
   - Display a menu for users to choose an analysis type:
     1. **Ask a Generic Question**: Retrieve and analyze documents based on a user-provided question and filters.
     2. **Compare Quarters (Same Year)**: Compare performance between two quarters in the same year.
     3. **Compare Quarters (Different Years)**: Compare performance across two years for specific quarters.
     4. **Year-over-Year Comparison**: Analyze year-on-year performance for a specific bank.
     5. **Analyze All Quarters in a Year**: Summarize and analyze performance for all quarters in a year.
     6. **Analyze Sentiment for a Quarter**: Perform sentiment analysis for a specific quarter and bank.
     7. **Summarize a Quarter**: Summarize financial performance for a given quarter and bank.
     8. **Topic Modeling (BERTopic)**: Perform topic modeling on filtered documents using BERTopic.
     9. **Topic Modeling (Gensim LDA)**: Perform topic modeling on filtered documents using Gensim LDA.
    10. **Exit**: Exit the application.

5. **Process User Selection**
   - For each choice:
     - Prompt user for relevant inputs (e.g., year, quarter, bank).
     - Call the corresponding handler function (e.g., `handle_generic_query`, `handle_sentiment_single_quarter`).
     - Display results using `display_wrapped_output`.

6. **Error Handling**
   - Log and display errors encountered during pipeline configuration or query handling.

7. **Exit Application**
   - Exit gracefully when the user selects the exit option.

In [None]:
def main():
    """
    Configure the RAG pipeline and run the interactive, menu-driven query loop.

    Phase 1 builds everything the menu needs: the user picks a model, the
    pipeline is configured, a retriever is attached to it and the tokenizer is
    loaded. If any of that raises, the error is logged and printed and the
    function returns without showing the menu.

    Phase 2 repeatedly shows the menu, collects the inputs for the selected
    analysis, calls the matching handler and passes the answer through
    `remove_repetitions` and `display_wrapped_output`. An error raised while
    serving one selection is logged and printed and the menu is shown again,
    so a single failing query does not end the session. The loop ends when the
    user selects option 10 or when the input stream is closed.

    Returns:
        None
    """
    logger = logging.getLogger("RAGApplication")
    logger.info("Starting Transcript-Based Risk Analyzer")

    # Step 1: Initialize pipeline, tokenizer, etc.
    try:
        # NOTE(review): both paths are empty strings here; presumably they have to be
        # filled in with real locations before running - confirm.
        folder_path = ""
        metadata_path = ""

        display_title()
        # Select model
        print("\nStep 1: Select a Language Model")
        selected_model = select_model()
        model_token_limit = get_model_token_limit(selected_model)

        # Configure pipeline
        print("\n[INFO] Configuring the RAG pipeline...")
        # Named `rag_pipeline` so the local does not shadow the `pipeline`
        # factory imported from transformers at the top of the file.
        rag_pipeline = configure_rag_pipeline(
            folder_path=folder_path,
            metadata_path=metadata_path,
            model_name=selected_model,
            embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_token_limit=model_token_limit,
            top_k=3,
            max_new_tokens=500,
            chunk_size=None,
            chunk_overlap=200,
            required_metadata_fields={"year", "quarter", "bank", "source", "designation", "name"},
            persist_directory="qa_index",
            device="cuda",
        )

        if rag_pipeline is None:
            raise ValueError("Pipeline configuration returned no results.")

        # Configure retriever and keep it on the pipeline dict for later use
        rag_pipeline["retriever"] = setup_retriever(rag_pipeline["vector_store"], top_k=3)

        logger.info("Pipeline configured successfully.")
        print("\n[INFO] Pipeline configured successfully.")

        tokenizer = AutoTokenizer.from_pretrained(selected_model)

    except Exception as e:
        logger.error(f"Pipeline configuration failed: {e}")
        print(f"[ERROR] Pipeline configuration failed: {e}")
        return

    def ask(prompt):
        """Prompt the user and return the reply with surrounding whitespace removed."""
        return input(prompt).strip()

    def ask_bank():
        """Prompt for a bank name and return it stripped and lower-cased."""
        return ask("Enter the bank name: ").lower()

    # Helper function to collect filter inputs
    def get_filters():
        """Interactively collect optional bank / year / quarter filters into a dict."""
        filters = {}
        if ask("Do you want to filter by a specific bank? (yes/no): ").lower() == "yes":
            filters["bank"] = ask_bank()
        year = ask("Enter the year to filter (or press Enter to skip): ")
        if year:
            filters["year"] = year
        quarter = ask("Enter the quarter to filter (e.g., Q1, or press Enter to skip): ")
        if quarter:
            filters["quarter"] = quarter
        return filters

    # Keyword arguments shared by every question-answering handler below.
    handler_kwargs = {
        "pipeline": rag_pipeline,
        "tokenizer": tokenizer,
        "model_token_limit": model_token_limit,
    }

    # Step 2: Interactive Query Interface
    while True:
        # The try block lives inside the loop so that a failure while serving one
        # selection returns the user to the menu instead of ending the session.
        try:
            print('\nRAG Based Analysis:')
            print("-------------------\n")
            print("\n--- Query Selection Menu ---")
            print("1. Ask a generic question")
            print("2. Compare two quarters of the same year")
            print("3. Compare two quarters of different years")
            print("4. Year-over-year comparison")
            print("5. Analyze all quarters of a year")
            print("6. Analyze sentiment for a single quarter")
            print("7. Summarize a single quarter")
            print("8. Perform Topic Modeling with BERTopic")
            print("9. Perform Topic Modeling with Gensim LDA")
            # Keep the exit number, the prompt and the range check below in sync
            # whenever menu entries are added or removed.
            print("10. Exit")

            choice = ask("\nSelect an option (1-10): ")

            if not choice.isdigit() or not (1 <= int(choice) <= 10):
                print("[ERROR] Invalid option. Please enter a number between 1 and 10.")
                continue

            choice = int(choice)

            if choice == 10:
                print("Exiting the application. Goodbye!")
                break

            if choice == 1:
                query = ask("Enter your question: ")
                filters = get_filters()
                answer = handle_generic_query(
                    summarization_pipeline=rag_pipeline["summarization_pipeline"],
                    filters=filters,
                    query=query,
                    **handler_kwargs,
                )
            elif choice == 2:
                year = ask("Enter the year (e.g., 2023): ")
                quarter1 = ask("Enter Quarter 1 (e.g., Q1): ")
                quarter2 = ask("Enter Quarter 2 (e.g., Q2): ")
                bank = ask_bank()
                answer = handle_compare_quarters_same_year(
                    year=year,
                    quarter1=quarter1,
                    quarter2=quarter2,
                    bank=bank,
                    query="Compare the financial performance of these quarters.",
                    **handler_kwargs,
                )
            elif choice == 3:
                year1 = ask("Enter Year 1 (e.g., 2022): ")
                quarter1 = ask("Enter Quarter 1 (e.g., Q1): ")
                year2 = ask("Enter Year 2 (e.g., 2023): ")
                quarter2 = ask("Enter Quarter 2 (e.g., Q2): ")
                bank = ask_bank()
                answer = handle_compare_quarters_diff_years(
                    year1=year1,
                    year2=year2,
                    quarter1=quarter1,
                    quarter2=quarter2,
                    bank=bank,
                    query="Compare the performance of these quarters across years.",
                    **handler_kwargs,
                )
            elif choice == 4:
                year1 = ask("Enter Year 1 (e.g., 2022): ")
                year2 = ask("Enter Year 2 (e.g., 2023): ")
                bank = ask_bank()
                answer = handle_year_comparison(
                    year1=year1,
                    year2=year2,
                    bank=bank,
                    query="Provide a year-over-year performance comparison.",
                    **handler_kwargs,
                )
            elif choice == 5:
                year = ask("Enter the year (e.g., 2023): ")
                bank = ask_bank()
                # NOTE(review): only Q1 and Q4 are handed to the same-year comparison
                # handler; whether Q2/Q3 are covered depends on that handler - confirm.
                answer = handle_compare_quarters_same_year(
                    year=year,
                    quarter1="Q1",
                    quarter2="Q4",  # Analyzing from Q1 to Q4
                    bank=bank,
                    query="Analyze all quarters of the given year.",
                    **handler_kwargs,
                )
            elif choice == 6:
                year = ask("Enter the year (e.g., 2023): ")
                quarter = ask("Enter the quarter (e.g., Q1): ")
                bank = ask_bank()
                answer = handle_sentiment_single_quarter(
                    year=year,
                    quarter=quarter,
                    bank=bank,
                    query="Analyze the sentiment for this quarter.",
                    **handler_kwargs,
                )
            elif choice == 7:
                year = ask("Enter the year (e.g., 2023): ")
                quarter = ask("Enter the quarter (e.g., Q1): ")
                bank = ask_bank()
                answer = handle_summarize_single_quarter(
                    year=year,
                    quarter=quarter,
                    bank=bank,
                    query="Summarize the performance for this quarter.",
                    **handler_kwargs,
                )
            elif choice == 8:
                print("\n[INFO] Performing Topic Modeling with BERTopic...")
                filters = get_filters()
                filtered_docs = filter_documents(rag_pipeline["documents"], **filters)
                if not filtered_docs:
                    print("[ERROR] No documents matched the specified filters for BERTopic.")
                    continue
                handle_bertopic(filtered_docs)
                answer = "BERTopic modeling completed successfully."
            else:  # choice == 9
                print("\n[INFO] Performing Topic Modeling with Gensim LDA...")
                filters = get_filters()
                filtered_docs = filter_documents(rag_pipeline["documents"], **filters)
                if not filtered_docs:
                    print("[ERROR] No documents matched the specified filters for Gensim LDA.")
                    continue
                # NOTE(review): `nlp_model` is not defined in this function; it is
                # expected to exist at module level - confirm.
                handle_gensim_lda(filtered_docs, nlp_model=nlp_model, num_topics=6)
                answer = "Gensim LDA modeling completed successfully."

            display_wrapped_output(remove_repetitions(answer), width=100)

        except EOFError:
            # stdin is closed: every further input() would fail as well, so
            # leave the loop instead of reporting the same error forever.
            logger.warning("Input stream closed; exiting the application.")
            print("[ERROR] Input stream closed. Exiting the application.")
            break
        except Exception as e:
            logger.exception(f"An unexpected error occurred in the main function: {e}")
            print(f"[ERROR] An unexpected error occurred: {e}")


In [None]:
# Upgrade the installed transformers package to the newest available version.
# NOTE(review): the bare `pip` form (no `!` / `%` prefix) presumably relies on
# IPython automagic; a kernel restart may be needed for an already-imported
# transformers module to pick up the upgrade - confirm.
pip install --upgrade transformers

## Main call

In [None]:
# Entry point: start the interactive analyzer only when this code is executed
# directly (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

# Enhanced RAG Pipeline for Future Implementation


In [None]:
def extract_lda_topics_for_document(row, dictionary, lda_model):
    """
    Extract LDA topics for the text stored under a row's "lda_topics" entry.

    Args:
        row: Mapping-like record (supports `in` and item access) expected to
            hold a non-empty string under the key "lda_topics".
        dictionary: Object providing `doc2bow(tokens)`.
        lda_model: Object providing
            `get_document_topics(bow, minimum_probability=...)`.

    Returns:
        list: Strings of the form "Topic <id>: <probability>" (two decimals),
        one per topic returned by the model with a 0.05 minimum probability.
        An empty list when the entry is missing, not a string, or empty.
    """
    has_text = (
        "lda_topics" in row
        and isinstance(row["lda_topics"], str)
        and bool(row["lda_topics"])
    )
    if not has_text:
        logger.warning(f"Skipping row with missing or empty 'lda_topics': {row}")
        return []

    # Whitespace tokenization of the stored text, then bag-of-words conversion.
    tokens = row["lda_topics"].split()
    bag_of_words = dictionary.doc2bow(tokens)

    formatted_topics = []
    for entry in lda_model.get_document_topics(bag_of_words, minimum_probability=0.05):
        topic_id, probability = entry[0], entry[1]
        formatted_topics.append(f"Topic {topic_id}: {probability:.2f}")
    return formatted_topics

def extract_lda_topics_for_context(context, dictionary, lda_model):
    """
    Return the LDA topics the model assigns to a piece of free text.

    Args:
        context (str): Text to analyze; it is tokenized on whitespace.
        dictionary (Dictionary): Gensim dictionary used to build the bag-of-words.
        lda_model (LdaModel): Trained Gensim LDA model queried for topics.

    Returns:
        list: Strings of the form "Topic <id>: <probability>" (two decimals)
        for every topic returned with a 0.05 minimum probability, or an empty
        list when `context` is not a string or contains only whitespace.
    """
    if isinstance(context, str) and context.strip():
        bag_of_words = dictionary.doc2bow(context.split())  # whitespace tokens
        scored_topics = lda_model.get_document_topics(bag_of_words, minimum_probability=0.05)
        formatted_topics = []
        for entry in scored_topics:
            formatted_topics.append(f"Topic {entry[0]}: {entry[1]:.2f}")
        return formatted_topics

    logger.warning(f"Invalid or empty context: {context}")
    return []


def configure_rag_pipeline_future(
    folder_path: str,
    metadata_path: str,
    context: str = None,
    model_name: str = "microsoft/Phi-3-mini-4k-instruct",
    embeddings_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    model_token_limit: int = 4096,
    top_k: int = 3,
    max_new_tokens: int = 500,
    chunk_size: int = None,
    chunk_overlap: int = 200,
    required_metadata_fields: Set[str] = None,
    persist_directory: str = "qa_index",
    device: str = "cuda",
    num_topics: int = 3,
    chain_type: str = "stuff",
) -> dict:
    """
    Configures a Retrieval-Augmented Generation (RAG) pipeline with QA chains, LDA, and other components.

    Args:
        folder_path (str): Folder with the source documents; must exist.
        metadata_path (str): Path to the metadata CSV; must exist and be non-empty.
        context (str, optional): Free text on which a separate LDA model is
            fitted. The context LDA step is skipped when this is None or empty.
        model_name (str): Model identifier forwarded to `load_model`.
        embeddings_model_name (str): Forwarded to
            `initialize_embeddings_and_vector_store`.
        model_token_limit (int): Used to derive the chunk size when
            `chunk_size` is None.
        top_k (int): Forwarded to `setup_retriever`.
        max_new_tokens (int): Forwarded to `load_model`; also subtracted from
            the token budget when deriving the chunk size.
        chunk_size (int, optional): Chunk size for preprocessing. When None it
            is set to min(2000, (model_token_limit - 200 - max_new_tokens) // 2).
        chunk_overlap (int, optional): Chunk overlap. Replaced by
            chunk_size // 5 when None or not smaller than the chunk size.
        required_metadata_fields (Set[str], optional): Forwarded to
            `load_and_preprocess_data`.
        persist_directory (str): Forwarded to
            `initialize_embeddings_and_vector_store`.
        device (str): Forwarded to `load_model`. "cuda" selects device 0 for
            the summarization and sentiment pipelines, any other value -1.
        num_topics (int): Number of topics for both LDA models.
        chain_type (str): Forwarded to `initialize_chains`.

    Returns:
        dict: A dictionary containing all pipeline components (keys:
        "retriever", "vector_store", "llm_pipeline", "metadata_df",
        "documents", "summarization_pipeline", "sentiment_pipeline",
        "lda_model_metadata", "lda_topics_metadata", "lda_model_context",
        "lda_topics_context", "qa_chains"). The LDA and QA-chain entries are
        None when their step failed or was skipped. None is returned instead
        of a dict when configuration as a whole fails; the error is logged.
    """
    try:
        # Step 1: Validate file paths
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"The specified folder path does not exist: {folder_path}")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(f"The metadata file does not exist: {metadata_path}")

        # Step 2: Calculate dynamic chunk size
        if chunk_size is None:
            prompt_tokens = 200  # tokens reserved for the prompt itself
            token_budget = model_token_limit - prompt_tokens - max_new_tokens
            chunk_size = min(2000, token_budget // 2)
            logger.info(f"Dynamic chunk size set to: {chunk_size}")

        # A small model_token_limit (or a large max_new_tokens) makes the derived
        # size zero or negative; fail here with a clear message instead of passing
        # a meaningless size on to the document splitter.
        if chunk_size <= 0:
            raise ValueError(
                f"Chunk size must be positive, got {chunk_size} "
                f"(model_token_limit={model_token_limit}, max_new_tokens={max_new_tokens})."
            )

        # Step 3: Adjust chunk overlap if necessary
        if chunk_overlap is None or chunk_overlap >= chunk_size:
            chunk_overlap = chunk_size // 5
            logger.info(f"Dynamic chunk overlap set to: {chunk_overlap}")

        # Step 4: Load and preprocess documents
        logger.info("Loading and preprocessing documents...")
        enriched_documents, document_chunks = load_and_preprocess_data(
            folder_path=folder_path,
            metadata_path=metadata_path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            required_metadata_fields=required_metadata_fields,
        )

        # Step 5: Initialize embeddings and vector store
        logger.info("Initializing embeddings and vector store...")
        vector_store = initialize_embeddings_and_vector_store(
            chunks=document_chunks,
            embeddings_model_name=embeddings_model_name,
            persist_directory=persist_directory,
            use_existing=True,
        )

        # Step 6: Setup retriever and language model
        retriever = setup_retriever(vector_store, top_k=top_k)
        llm_pipeline = load_model(model_name, max_new_tokens=max_new_tokens, device=device)

        # Step 7: Load and validate metadata
        metadata_df = pd.read_csv(metadata_path)
        if metadata_df.empty:
            raise ValueError("Metadata file is empty. Ensure the metadata CSV is correctly populated.")
        logger.info(f"Loaded metadata with {len(metadata_df)} entries.")

        # Step 8: Initialize summarization and sentiment pipelines
        hf_device = 0 if device == "cuda" else -1
        summarization_pipeline = hf_pipeline(
            "summarization", model="sshleifer/distilbart-cnn-6-6", device=hf_device
        )
        sentiment_pipeline = hf_pipeline(
            "sentiment-analysis", model="yiyanghkust/finbert-tone", device=hf_device
        )

        # Step 9: Perform LDA on metadata
        # The corpus is the page content of the enriched documents; the resulting
        # topics are stored in a new "lda_topics" column of the metadata frame.
        # This step is best-effort: on any failure the column is set to None.
        lda_model_metadata, dictionary_metadata, topics_metadata = None, None, None
        try:
            logger.info("Performing LDA on metadata...")
            corpus = [doc.page_content for doc in enriched_documents if doc.page_content]
            processed_corpus = preprocess_corpus_gensim(corpus, custom_stopwords=None, redundant_terms=None)
            dictionary_metadata = Dictionary(processed_corpus)
            bow_corpus_metadata = [dictionary_metadata.doc2bow(doc) for doc in processed_corpus]
            lda_model_metadata = LdaModel(
                corpus=bow_corpus_metadata,
                id2word=dictionary_metadata,
                num_topics=num_topics,
                random_state=42,
                passes=10,
                iterations=50,
            )
            topics_metadata = extract_topics_from_lda(lda_model_metadata, bow_corpus_metadata, dictionary_metadata)
            metadata_df["lda_topics"] = topics_metadata
        except Exception as e:
            logger.warning(f"Error performing LDA on metadata: {e}")
            metadata_df["lda_topics"] = None

        # Step 10: Perform LDA on context (best-effort, only when context is given)
        lda_model_context, dictionary_context, topics_context = None, None, None
        if context:
            try:
                logger.info("Performing LDA on context...")
                processed_context = preprocess_context(context, nlp_model=None)
                dictionary_context = Dictionary([processed_context])
                bow_corpus_context = [dictionary_context.doc2bow(processed_context)]
                lda_model_context = LdaModel(
                    corpus=bow_corpus_context,
                    id2word=dictionary_context,
                    num_topics=num_topics,
                    random_state=42,
                    passes=10,
                    iterations=50,
                )
                # Score the preprocessed tokens rather than the raw text: the
                # dictionary above is built from them, and doc2bow ignores tokens
                # it does not know, so raw words that differ from their processed
                # form would silently drop out of the topic estimate.
                topics_context = extract_lda_topics_for_context(
                    " ".join(processed_context), dictionary_context, lda_model_context
                )
            except Exception as e:
                logger.warning(f"Error performing LDA on context: {e}")

        # Step 11: Initialize QA Chains (best-effort; stays None on failure)
        qa_chains = None
        try:
            logger.info("Initializing QA chains...")
            qa_chains = initialize_chains(llm_pipeline, chain_type=chain_type)
        except Exception as e:
            logger.error(f"Error initializing QA chains: {e}")

        # Step 12: Return pipeline components
        logger.info("RAG pipeline configured successfully.")
        return {
            "retriever": retriever,
            "vector_store": vector_store,
            "llm_pipeline": llm_pipeline,
            "metadata_df": metadata_df,
            "documents": enriched_documents,
            "summarization_pipeline": summarization_pipeline,
            "sentiment_pipeline": sentiment_pipeline,
            "lda_model_metadata": lda_model_metadata,
            "lda_topics_metadata": topics_metadata,
            "lda_model_context": lda_model_context,
            "lda_topics_context": topics_context,
            "qa_chains": qa_chains,
        }

    except Exception as e:
        # Callers detect failure by checking for None.
        logger.exception(f"Pipeline configuration failed: {e}")
        return None
