# RAG SYSTEM

In [None]:
# Initialize the script environment
import sys


sys.path.append('./utils')

In [None]:
# General imports
import os
from typing import List, Dict, Optional, Any
from logging_config import logger
from io_docs import save_dict_list_as_json, load_json_as_dict_list, dict_list_to_text

## 1. Text Extraction

In [None]:
#imports
from utils.text_extractor import TextExtractor

In [None]:
# Extract text from every PDF found in SRC_FOLDER_PATH and save one
# "<name>_extracted.json" file per document in OUTPUT_FOLDER_PATH.
SRC_FOLDER_PATH = "../../src/first_batch/"
OUTPUT_FOLDER_PATH = "./output/extracted_text/"
# Maximum number of files to process in this run: -1 removes the limit,
# 0 processes nothing (the loop stops at the first entry).
LIMIT = 0
n_extracted = 0

# Create the output folder up front: it is listed below to detect already
# processed files, which would raise FileNotFoundError if it did not exist yet.
os.makedirs(OUTPUT_FOLDER_PATH, exist_ok=True)

# List the output folder once instead of once per source file; the set is
# kept up to date as new files are written.
extracted_outputs = set(os.listdir(OUTPUT_FOLDER_PATH))

for file_name in os.listdir(SRC_FOLDER_PATH):
    if file_name.endswith('.pdf') and (n_extracted < LIMIT or LIMIT == -1):
        file_path = os.path.join(SRC_FOLDER_PATH, file_name)
        # The output name keeps only the part of the file name before the first dot.
        output_file_name = f"{os.path.basename(file_path).split('.')[0]}_extracted.json"

        if output_file_name in extracted_outputs:
            logger.info(f"Skipping already processed file: {file_path}")
            continue

        logger.info(f"Processing file: {file_path}")

        # Initialize the TextExtractor with the file path and parameters
        text_extractor = TextExtractor(file_path, min_words=14)

        # Extract text from the PDF
        text_extractor.extract_text_advanced()

        # Clean the extracted text
        text_extractor.clean_text()

        # Saving extracted text to folder
        output_file_path = os.path.join(OUTPUT_FOLDER_PATH, output_file_name)
        save_dict_list_as_json(text_extractor.get_data(), output_file_path)
        extracted_outputs.add(output_file_name)

        n_extracted += 1

    # Checked on every entry (PDF or not) so the loop ends as soon as the limit is hit.
    if n_extracted >= LIMIT and LIMIT != -1:
        logger.info(f"Reached limit of {LIMIT} files processed. Stopping.")
        break

print()

## 2. Chunking

In [None]:
# Imports
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from embeddings import get_embedding_model

In [None]:
def chunk_text_semantic(pages: List[Dict], embeddings, breakpoint_threshold_type = "percentile", breakpoint_threshold_amount: float = 95, min_chunk_size: int = 100) -> List[Dict]:
    """
    Split the text of each page into chunks with LangChain's SemanticChunker.

    Usage notes for the chunker are in the LangChain documentation:
    https://python.langchain.com/docs/how_to/semantic-chunker/

    Parameters:
        pages: List of page dicts. The keys read here are "doc_name" (first page
            only), "page_number", "text", "extraction_method" and "table_bboxes";
            each one falls back to a default when missing.
        embeddings: The embeddings model handed to SemanticChunker.
        breakpoint_threshold_type: Forwarded to SemanticChunker (default "percentile").
        breakpoint_threshold_amount: Forwarded to SemanticChunker (default 95).
        min_chunk_size: Forwarded to SemanticChunker (default 100).

    Returns:
        One dict per chunk with keys doc_name, page_number, chunk_id, text,
        num_words, extraction_method and table_bboxes. Pages whose chunking
        raises are logged and contribute no further chunks.
    """
    splitter = SemanticChunker(  # type: ignore
        embeddings,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
        min_chunk_size=min_chunk_size,
    )

    # All chunks are attributed to the document named on the first page.
    doc_name = "unknown"
    if pages:
        doc_name = pages[0].get("doc_name", "unknown")

    collected = []
    for page in pages:
        page_num = page.get("page_number", -1)
        try:
            pieces = splitter.split_text(page.get("text", ""))
            for position, piece in enumerate(pieces):
                record = {
                    "doc_name": doc_name,
                    "page_number": page_num,
                    # chunk_id is unique per (document, page, position on the page)
                    "chunk_id": f"{doc_name}_p{page_num}_c{position}",
                    "text": piece,
                    "num_words": len(piece.split()),
                    "extraction_method": page.get("extraction_method", "unknown"),
                    "table_bboxes": page.get("table_bboxes", []),
                }
                collected.append(record)
        except Exception as e:
            # Best effort: a failing page is reported and the remaining pages are still processed.
            logger.error(f"⚠️ Semantic chunking failed on {doc_name} page {page_num}: {e}")

    logger.info(f"✅ Created {len(collected)} semantic chunks from {len(pages)} pages in {doc_name}.\n")
    return collected

In [None]:
# Embedding model shared by the chunking, indexing and retrieval cells below.
# NOTE(review): the meaning of the argument `1` is defined by
# embeddings.get_embedding_model — confirm which model it selects.
embeddings = get_embedding_model(1)

In [None]:
# Semantically chunk every extracted-text JSON file in SRC_FOLDER_PATH and save
# one "<name>_chunked.json" file per document in OUTPUT_FOLDER_PATH.
SRC_FOLDER_PATH = "./output/extracted_text/"
OUTPUT_FOLDER_PATH = "./output/chunked_text/"
# Maximum number of files to process in this run: -1 removes the limit,
# 0 processes nothing (the loop stops at the first entry).
LIMIT = 0
n_chunked = 0

os.makedirs(OUTPUT_FOLDER_PATH, exist_ok=True)

# List the output folder once instead of once per source file; the set is
# kept up to date as new files are written.
chunked_outputs = set(os.listdir(OUTPUT_FOLDER_PATH))

for doc in os.listdir(SRC_FOLDER_PATH):
    if doc.endswith('.json') and (n_chunked < LIMIT or LIMIT == -1):
        file_path = os.path.join(SRC_FOLDER_PATH, doc)
        # The output name keeps only the part of the file name before the first dot.
        output_file_name = f"{os.path.basename(file_path).split('.')[0]}_chunked.json"

        if output_file_name in chunked_outputs:
            logger.info(f"Skipping already processed file: {file_path}")
            continue

        logger.info(f"Processing file: {file_path}")

        # Load the extracted text data
        doc_pages = load_json_as_dict_list(file_path)

        # Chunk the text semantically
        chunks = chunk_text_semantic(doc_pages, embeddings, min_chunk_size=200, breakpoint_threshold_type="gradient")

        # Save the chunks to a new JSON file
        output_file_path = os.path.join(OUTPUT_FOLDER_PATH, output_file_name)
        save_dict_list_as_json(chunks, output_file_path)
        chunked_outputs.add(output_file_name)

        n_chunked += 1

    # Checked on every entry (JSON or not) so the loop ends as soon as the limit is hit.
    if n_chunked >= LIMIT and LIMIT != -1:
        logger.info(f"Reached limit of {LIMIT} files processed. Stopping.")
        break

## 3. Embedding

#### 3.1. Documents preparation

In [None]:
from langchain.schema import Document

In [None]:
# Converting chunks into LangChain Documents
def prepare_documents(chunks: List[Dict]) -> List[Document]:
    """
    Wrap each chunk dict in a LangChain Document.

    The chunk text becomes the page content. The metadata carries doc_name,
    page_number, chunk_id, text, num_words, extraction_method and
    table_bboxes, each with a default when the chunk lacks the key.

    Args:
        chunks: List of chunk dictionaries. Every chunk must contain "text".

    Returns:
        List of LangChain Document objects, in the same order as `chunks`.

    Raises:
        KeyError: If a chunk has no "text" key.
    """
    def _metadata_of(chunk: Dict) -> Dict:
        # Built per chunk so that mutable defaults are never shared between documents.
        return {
            "doc_name": chunk.get("doc_name", "unknown"),
            "page_number": chunk.get("page_number", -1),
            "chunk_id": chunk.get("chunk_id", "unknown"),
            "text": chunk.get("text", ""),
            "num_words": chunk.get("num_words", 0),
            "extraction_method": chunk.get("extraction_method", "unknown"),
            "table_bboxes": chunk.get("table_bboxes", []),
        }

    return [
        Document(metadata=_metadata_of(chunk), page_content=chunk["text"])
        for chunk in chunks
    ]

#### 3.2. Build the FAISS index from Documents using the embeddings model

In [None]:
from langchain.vectorstores import FAISS


# Directory (relative to the working directory) used as the default location
# for saving and reloading the FAISS index.
FAISS_PATH = "faiss_index"

In [None]:
# Build the FAISS index from Documents
def build_faiss_index(documents: List[Document], embeddings: HuggingFaceEmbeddings, persist_path: str = FAISS_PATH) -> FAISS:
    """
    Creates or updates a FAISS vector store from a list of LangChain Documents.

    If "<persist_path>/index.faiss" exists, the index is loaded and only the
    documents whose metadata "chunk_id" is not already stored are added.
    Otherwise a new index is built from all documents. The index is saved to
    disk whenever it is created or modified.

    Args:
        documents: List of LangChain Document objects
        embeddings: The embeddings model to use for FAISS indexing
        persist_path: Path to save the FAISS index

    Returns:
        FAISS vector store

    Raises:
        ValueError: If no index exists yet and `documents` is empty.
    """
    import os

    # Check if FAISS index already exists
    if os.path.exists(os.path.join(persist_path, "index.faiss")):
        print(f"🔄 Loading existing FAISS index from '{persist_path}'")
        # NOTE: allow_dangerous_deserialization unpickles the stored docstore;
        # only load index folders produced by this pipeline.
        vectorstore = FAISS.load_local(persist_path, embeddings, allow_dangerous_deserialization=True)

        # Docstore keys are IDs generated by the vector store, not chunk_ids, so
        # duplicates are detected through the chunk_id kept in each stored
        # document's metadata. The raw keys are included too, which keeps
        # indexes that were built with chunk_ids as explicit IDs working.
        stored_docs = vectorstore.docstore._dict
        known_chunk_ids = {stored.metadata.get("chunk_id") for stored in stored_docs.values()}
        known_chunk_ids.update(stored_docs.keys())

        new_docs = []
        n_without_id = 0
        for doc in documents:
            doc_id = doc.metadata.get("chunk_id", None)
            if not doc_id:
                # Without a chunk_id the document cannot be de-duplicated; it is skipped.
                n_without_id += 1
                continue
            if doc_id not in known_chunk_ids:
                new_docs.append(doc)
                # Also guards against duplicates inside the incoming batch.
                known_chunk_ids.add(doc_id)

        if n_without_id:
            print(f"⚠️ Skipped {n_without_id} documents without a chunk_id")

        if new_docs:
            print(f"➕ Adding {len(new_docs)} new documents to existing index")
            vectorstore.add_documents(new_docs)
            # Save the updated index
            vectorstore.save_local(persist_path)
            print(f"✅ FAISS index updated and saved to '{persist_path}'\n")
        else:
            print("ℹ️ No new documents to add to the index\n")
    else:
        if not documents:
            # An index cannot be built from nothing; fail with a clear message.
            raise ValueError("Cannot create a new FAISS index from an empty document list")

        print("🆕 Creating new FAISS index")
        # Build the index in memory with all documents
        vectorstore = FAISS.from_documents(documents, embedding=embeddings)
        # Save to disk
        vectorstore.save_local(persist_path)
        print(f"✅ FAISS index built and saved to '{persist_path}'\n")

    return vectorstore

In [None]:
# Embed every chunk file in SRC_FOLDER_PATH into the FAISS index, recording
# each indexed file name in "processed_files.txt" so reruns skip it.
SRC_FOLDER_PATH = "./output/chunked_text/"
OUTPUT_FOLDER_PATH = "./output/faiss_index/"
os.makedirs(os.path.abspath(OUTPUT_FOLDER_PATH), exist_ok=True)
# Maximum number of files to process in this run: -1 removes the limit,
# 0 processes nothing (the loop stops at the first entry).
LIMIT = 0
n_embedded = 0
# Number of chunks sent to build_faiss_index per call.
EMBEDDING_BATCH_SIZE = 100

PROCESSED_LOG_PATH = os.path.join(OUTPUT_FOLDER_PATH, "processed_files.txt")

already_processed = set()
# "a+" creates the log on the first run; seek(0) is needed because append
# mode may position the stream at the end of the file.
with open(PROCESSED_LOG_PATH, "a+") as f:
    f.seek(0)
    already_processed.update(line.strip() for line in f)

for file_name in os.listdir(SRC_FOLDER_PATH):
    if file_name.endswith('.json') and file_name not in already_processed and (n_embedded < LIMIT or LIMIT == -1):
        file_path = os.path.join(SRC_FOLDER_PATH, file_name)
        chunks = load_json_as_dict_list(file_path)

        logger.info(f"Loaded {len(chunks)} chunks from {file_path}")

        if not chunks:
            # Nothing to embed; avoid calling build_faiss_index with an empty list.
            logger.info(f"No chunks found in {file_path}; nothing to index.")

        # One loop covers both small and large files: a file with at most
        # EMBEDDING_BATCH_SIZE chunks yields a single batch.
        # NOTE(review): the index is written to build_faiss_index's default
        # location (FAISS_PATH), not to OUTPUT_FOLDER_PATH, which only holds
        # the processed-files log — confirm this is intended.
        for i in range(0, len(chunks), EMBEDDING_BATCH_SIZE):
            documents = prepare_documents(chunks[i:i + EMBEDDING_BATCH_SIZE])
            build_faiss_index(documents, embeddings)

        with open(PROCESSED_LOG_PATH, "a") as f:
            f.write(file_name + "\n")

        logger.info(f"Processed and indexed {file_name}")
        n_embedded += 1

    # Checked on every entry so the loop ends as soon as the limit is hit.
    if n_embedded >= LIMIT and LIMIT != -1:
        logger.info(f"Reached limit of {LIMIT} files processed. Stopping.")
        break

#documents = prepare_documents(chunks)
#build_faiss_index(documents, embeddings)

#### 3.3. Visualize the FAISS index

In [None]:
# Imports
from visualizer import visualize_2D_colored
from io_docs import load_vectorstore

In [None]:
# Reload the index persisted at FAISS_PATH; the call returns four values,
# unpacked here as (vectorstore, documents, all_embeddings, metadatas).
# NOTE(review): their exact types are defined by io_docs.load_vectorstore — confirm there.
vectorstore, documents, all_embeddings, metadatas = load_vectorstore(FAISS_PATH, embeddings)

In [None]:
# Plot the loaded embeddings together with their documents.
# NOTE(review): presumably a 2D projection colored per document, going by the
# helper's name — confirm in visualizer.visualize_2D_colored.
visualize_2D_colored(all_embeddings, documents)

In [None]:
# delete docs or ids...
def delete_from_faiss(doc_name:str):
    """
    Remove every chunk whose metadata "doc_name" equals `doc_name` from the
    FAISS index stored at FAISS_PATH, save the index, and redraw the plot.

    Prints the index size before and after, and the chunk_id of each removed
    chunk.

    Args:
        doc_name: Document name to match against each stored chunk's metadata.
    """
    # NOTE: allow_dangerous_deserialization unpickles the stored docstore;
    # only load index folders produced by this pipeline.
    vectorstore = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
    print("before: ", vectorstore.index.ntotal)

    # Collect the docstore keys of matching chunks. The keys are the IDs the
    # store expects in delete(); `Document.id` is not reliably populated.
    ids_to_delete = []
    for store_id, doc in vectorstore.docstore._dict.items():
        if doc.metadata.get("doc_name") == doc_name:
            print(doc.metadata.get("chunk_id"))
            ids_to_delete.append(store_id)

    if ids_to_delete:
        # Single batched deletion instead of one call per chunk.
        vectorstore.delete(ids=ids_to_delete)
        vectorstore.save_local(FAISS_PATH)

    print("after: ", vectorstore.index.ntotal)

    # Redraw from the saved index, mirroring how the visualization cell calls
    # these helpers (the plot helper is given embeddings and documents there).
    # NOTE(review): confirm load_vectorstore/visualize_2D_colored signatures.
    _, remaining_documents, remaining_embeddings, _ = load_vectorstore(FAISS_PATH, embeddings)
    visualize_2D_colored(remaining_embeddings, remaining_documents)

## 4. Retrieval

#### 4.1. Semantic Search

In [None]:
from io_docs import load_faiss_index

In [None]:
def search_similar_chunks(query: str, k: int = 10, vectorstore: Optional["FAISS"] = None) -> List[dict]:
    """
    Embeds the query and retrieves top-k semantically similar chunks from FAISS.

    Args:
        query (str): The search query string.
        k (int): The number of similar chunks to retrieve.
        vectorstore (FAISS, optional): The vector store to search. When omitted,
            the index at FAISS_PATH is loaded at call time, so the latest saved
            index is used and defining this function does not touch the disk.

    Returns:
        List of dicts with keys text, doc_name, page_num, chunk_id and n_words.
        Missing metadata falls back to '' for names/ids and -1 for numbers.
    """
    if vectorstore is None:
        vectorstore = load_faiss_index(FAISS_PATH, embeddings)

    results = vectorstore.similarity_search(query, k=k)

    return [
        {
            "text": doc.page_content,
            "doc_name": doc.metadata.get("doc_name", ''),
            # Chunks are indexed with the key "page_number"; "page_num" is
            # still accepted for metadata written under that name.
            "page_num": doc.metadata.get("page_number", doc.metadata.get("page_num", -1)),
            "chunk_id": doc.metadata.get("chunk_id", ''),
            "n_words": doc.metadata.get("num_words", -1)
        }
        for doc in results
    ]

In [None]:
# Try the retrieval step on a sample French query and print the raw results.
# `vectorstore`, `query` and `k` are reused by the answer-generation cell at the end of the notebook.
vectorstore = load_faiss_index(FAISS_PATH, embeddings)
query = "Définir une aiguille"
k = 5
res = search_similar_chunks(query, k, vectorstore)
print(res)

## 5. Answer generation using LM Studio

In [None]:
# Prompt template (French) for answer generation. It is a plain string: the
# single-brace {context} and {query} markers are literal placeholders that
# generate_answer_with_lmstudio fills in.
PROMPT1 = """
En tant qu'ancien collaborateur de la Compagnie des Signaux, spécialisé dans la signalisation ferroviaire, vous possédez une expertise complète des termes techniques et des documents internes du secteur.
En utilisant uniquement les informations contenues dans les documents internes ci-après, répondez concisement à la question suivante avec une réponse claire précise et technique.
Si les informations disponibles ne suffisent pas, indiquez-le clairement. N'inventez aucune réponse.

### Informations Références :
{context}

### Questions :
{query}

### Réponse :
"""

In [None]:
# Stricter prompt template (French) for answer generation. This is a plain
# string, so the placeholders use single braces: with doubled braces the
# "{context}" / "{query}" substitution would leave a stray "{" and "}" around
# the inserted text.
PROMPT2 = """
Vous êtes un ancien collaborateur de la Compagnie des Signaux, expert en signalisation ferroviaire. 
Votre rôle est d’analyser les documents internes fournis et de répondre de manière claire, précise et techniquement correcte.

Règles impératives :
1. Utilisez uniquement les informations figurant dans la section "Informations Références".
2. Ne déduisez ou n’inventez rien qui ne soit pas explicitement mentionné.
3. Si les informations sont insuffisantes pour répondre, écrivez uniquement : "Informations insuffisantes pour répondre."
4. La réponse doit être concise, technique, et directement liée à la question.
5. Ne fournissez aucune opinion personnelle ou information externe.
6. Respectez le vocabulaire technique du domaine ferroviaire.

### Informations Références :
{context}

### Question :
{query}

### Réponse :
"""


In [None]:
# Imports
import requests

In [None]:
def generate_answer_with_lmstudio(query: str, context_chunks: list, prompt:str = PROMPT1, max_tokens: int = 200, api_url: str = "http://localhost:1234/v1/completions", timeout: Optional[float] = None) -> str:
    """
    Sends a prompt to LM Studio's local API and returns the generated answer.

    Args:
        query (str): The search query string.
        context_chunks (list): List of context chunk dicts; the "text" value of
            each one is joined with blank lines to form the context.
        prompt (str): The prompt template with placeholders for context and
            query. Both "{context}"/"{query}" and the doubled form
            "{{context}}"/"{{query}}" are recognised.
        max_tokens (int): Maximum number of tokens to generate in the response.
        api_url (str): Completions endpoint to call.
        timeout (float, optional): Seconds to wait for the server before
            giving up; None (the default) waits indefinitely.

    Returns:
        str: The generated answer from LM Studio, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
    """
    import re

    # Build RAG-style prompt
    context_text = "\n\n".join(chunk.get("text", "") for chunk in context_chunks)
    substitutions = {"context": context_text, "query": query}

    # Fill both placeholders in a single pass so that placeholder-looking text
    # inside the retrieved context (e.g. a literal "{query}") is never
    # substituted a second time. A function is used as replacement so that
    # backslashes in the inserted text are kept verbatim.
    prompt = re.sub(
        r"\{\{(context|query)\}\}|\{(context|query)\}",
        lambda match: substitutions[match.group(1) or match.group(2)],
        prompt,
    )

    # Call LM Studio's local API
    response = requests.post(
        api_url,
        json={
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": 0.2,
            "stop": None,
        },
        timeout=timeout,
    )

    if response.status_code != 200:
        raise RuntimeError(f"Error: {response.status_code} {response.text}")

    return response.json()["choices"][0]["text"].strip()

In [None]:
# End-to-end check: retrieve the k most similar chunks for `query`, then ask
# LM Studio for an answer using the PROMPT2 template.
ctx = search_similar_chunks(query, k, vectorstore)
response = generate_answer_with_lmstudio(query, ctx, prompt=PROMPT2, max_tokens=200)
print(response)