In [1]:
from src.loaders.pdf_loader import PDFLoader
from src.processors.text_processor import TextProcessor
from src.embeddings.vector_store import VectorStore
from src.retrieval.rag_pipeline import RAGPipeline
from src.utils.exceptions import PDFLoadError, QueryError
from src.utils.logger import get_logger
from config.settings import (
    PDF_DIRECTORY, EMBEDDING_MODEL_NAME, LLM_MODEL_NAME, VECTOR_STORE_PATH,
    CHUNK_SIZE, CHUNK_OVERLAP
)

from dotenv import load_dotenv
load_dotenv()

True

In [6]:
import re
def remove_think_tags(text):
    """Removes text between <think>...</think> tags, including the tags themselves."""
    cleaned_text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    cleaned_text = cleaned_text.lstrip("\n")  # Remove leading newlines
    return cleaned_text

# Example usage
input_text = """<think> Okay, I need to figure out how to help the user using DesaIsiv's platform for insurance quotes. They provided a lot of information about the company and their features. Let me break this down step by step.
First, looking at the context, the user is asking if DesaIsiv offers quote comparison or request new quote features. The system message mentions that the platform provides these options because it's tailored for insurance processes like claims management and quotation.

In the feature usage section, there are mentions of Claims Experience and Quotation Comparison as key features. So yes, they do offer those.

Looking at the required files, I see Claims Experience is needed for historical data analysis, which supports quote comparison. The user might also need an Active List to generate accurate quotations, so that's another point to mention.

The contact information is provided in case the user has more questions after using these features.

So putting it all together, the response should confirm they have these features, explain how each works (like using historical claims for pricing), and list the necessary files. </think>

Yes, DesaIsiv's platform offers both quote comparison and request new quote features. Here's a detailed breakdown:

Quote Comparison:"""
output_text = remove_think_tags(input_text)


In [8]:
output_text

"Yes, DesaIsiv's platform offers both quote comparison and request new quote features. Here's a detailed breakdown:\n\nQuote Comparison:"

In [3]:
logger = get_logger(__name__)

if __name__ == "__main__":

    logger.info("Loading PDFs...")
    pdf_loader = PDFLoader(PDF_DIRECTORY)
    pdf_docs = pdf_loader.load_pdfs() 
    if not pdf_docs:
        raise PDFLoadError("No PDFs found in the directory.")

    # ✅ Step 2: Process Text
    logger.info("Splitting text into chunks...")
    text_processor = TextProcessor(CHUNK_SIZE, CHUNK_OVERLAP)
    documents = text_processor.split_documents(pdf_docs) 

    logger.info("Storing embeddings...")
    vector_store = VectorStore(EMBEDDING_MODEL_NAME)
    vector_store.build_vector_store(documents)
    vector_store.save_vector_store(VECTOR_STORE_PATH)
    
    logger.info("Loading vector store...")
    vector_store.load_vector_store(VECTOR_STORE_PATH) 
    rag = RAGPipeline(vector_store, model_name=LLM_MODEL_NAME)

    logger.info("Pipeline is Ready!")
    
    query = "What is the main topic of the PDFs?"
    logger.info(f"Processing query: {query}")

    response = rag.retrieve_and_generate(query)

    logger.info(f"Response {response}")

2025-03-11 21:45:28 - INFO - Loading PDFs...
2025-03-11 21:45:28 - INFO - Splitting text into chunks...
2025-03-11 21:45:28 - INFO - Storing embeddings...
  from .autonotebook import tqdm as notebook_tqdm
2025-03-11 21:45:35 - INFO - Loading vector store...
2025-03-11 21:45:36 - INFO - Pipeline is Ready!
2025-03-11 21:45:36 - INFO - Processing query: What is the main topic of the PDFs?
2025-03-11 21:45:44 - INFO - Response <think>
Okay, I need to figure out the main topic of the provided PDF content. Let me read through it carefully.

First, the context starts by mentioning that Desaisiv, an insurance company in Saudi Arabia, developed a platform to help insurers revise offers, negotiate terms, and finalize agreements with clients. It also talks about streamlining the quotation process and provider comparison. 

The required files section lists Claims Experience, Active List, Raw Data (optional), and Providers List, explaining their use.

The support contact details are given at the en

In [27]:
len(embeddings)

2

In [26]:
embeddings[0].shape

(1, 384)