# Contextual RAG

## 1 - Librerias

In [1]:
from typing import List, Tuple

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate



from langchain.retrievers import ContextualCompressionRetriever,BM25Retriever,EnsembleRetriever
from langchain_community.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import FlashrankRerank, DocumentCompressorPipeline


from langchain_community.document_loaders import PyPDFLoader

from tqdm import tqdm
import pickle
import time

import os
from dotenv import load_dotenv
load_dotenv()

True

## 2 - OpenAI API Key


In [2]:
# api key

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

## 3 - Crear Vector DB

In [3]:
class VectorDB:


    def __init__(self, collection_name: str):
        """
        Inicializar los atributos:
        
        + collection_name: str, nombre de la coleccion de la base de datos
        + text_splitter: RecursiveCharacterTextSplitter, objecto para crear chunks desde documento
        + embeddings: OpenAIEmbeddings, modelo para embeddings
        + llm: ChatOpenAI, modelo gpt-4o
        """
        
        self.collection_name = collection_name
        
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=800,
                                                            chunk_overlap=100)
        
        self.embeddings = OpenAIEmbeddings()

        
        self.llm = ChatOpenAI(model='gpt-4o', temperature=0)
        
        
    
    def process_document(self, file_paths: str) -> List[Document]:
        
        """
        Procesar cada documento dividiendolo en chunks y generando contexto para cada uno
        
        Params:
        file_paths: lista de strings, rutas a los archivos pdf o txt 
        
        Return:
        lista de chunks
        """
        
        contextualized_chunks = []
        
        
        for file_path in file_paths:
        
        
            # procesar pdf 
            if file_path.endswith('.pdf'):

                loader = PyPDFLoader(file_path)

                pages = loader.load()

                for i in tqdm(range(0, len(pages)-2, 1), leave=False, desc='Chunking PDF file'):

                    document = pages[i].page_content + pages[i+1].page_content + pages[i+2].page_content

                    chunks = self.text_splitter.create_documents([pages[i+1].page_content])

                    chunk_with_context = self._generate_contextualized_chunks(document, chunks, file_path)
                    
                    contextualized_chunks += chunk_with_context


            # procesar txt 
            elif file_path.endswith('.txt'):

                with open(file_path, 'r') as file:

                    document = file.read()

                    chunks = self.text_splitter.create_documents([document])

                    chunk_with_context = self._generate_contextualized_chunks(document, chunks, file_path)

                    contextualized_chunks += chunk_with_context

                
        return contextualized_chunks
    
    
    def _generate_contextualized_chunks(self, document: str, chunks: List[Document], file_path: str) -> List[Document]:
        
        """
        Genera versiones contextualizadas de los chunks dados.
        
        Params:
        document: str, documento completo para recuperar contexto
        chunks: lista de chunks sin contexto
        file_path: str, ruta del archivo original
        
        Return:
        lista de chunks contextualizados
        """
        
        contextualized_chunks = []
        
        for chunk in tqdm(chunks, leave=False, desc='Generating chunk context'):
            
            # crear contexto
            context = self._generate_context(document, chunk.page_content)
            
            contextualized_content = f'{context}\n\n{chunk.page_content}'
            
            # traducir al castellano el chunk 
            contextualized_content = self._translate_chunks(contextualized_content)
            
            # añadir fuente source
            source = file_path.split('/')[-1].split('.')[0].replace('_', ' ').title()
            contextualized_content = f'<documento> FUENTE: {source}. '+contextualized_content+'<documento>'

            
            contextualized_chunks.append(Document(page_content=contextualized_content, 
                                                  metadata=chunk.metadata))
        
        return contextualized_chunks
    
    
    def _generate_context(self, document: str, chunk: str) -> str:
        
        """
        Genera contexto para un chunk especifico usando un llm. 
        
        Params:
        document: str, documento complete para sacar contexto
        chunks: chunk sin contexto
        
        Return:
        str, chunk con contexto
        """
        
        system_prompt = '''You are an AI assistant specializing in design systems. 
                           Your task is to provide brief, relevant context for a chunk of text 
                           from the document provided.
                           Here is the document:
                           <document>
                           {document}
                           </document>

                           Here is the chunk we want to situate within the whole document::
                           <chunk>
                           {chunk}
                           </chunk>

                           Provide a concise context (2-3 sentences) for this chunk, considering the 
                           following guidelines:
                           
                           1. Do not use phrases like "This chunk discusses", "The chunk focuses"
                              ,"This section provides", or any other reference to summaring. 
                              Avoid any reference to summaring. Instead, directly state the context.
                              Just give the context.
                              Do not use phrases like "This chunk discusses" or "This section provides". 
                              Instead, directly state the context.

                           
                           2. Identify the main topic or metric discussed (e.g., archetypes, dynamics, 
                              hierarchy, system).
                           
                           3. Mention any relevant time periods or comparisons.
                           
                           4. If applicable, note how this information relates to design, strategy, 
                              or market position.
                           
                           5. Include any key figures or percentages that provide important context.
                           

                           Please give a short succinct context to situate this chunk within the overall 
                           document for the purposes of improving search retrieval of the chunk. 
                           Answer only with the succinct context and nothing else.

                           Context:
                           '''
        
        prompt = ChatPromptTemplate.from_template(system_prompt)
        
        messages = prompt.format_messages(document=document, chunk=chunk)
        
        response = self.llm.invoke(messages).content
        
        return response
    
    
    
    def _translate_chunks(self, chunk: str) -> str:
        
        """
        Traducir a castellano todos los chunks.
        
        Params:
        chunk: str, chunk sin traducir
        
        Return:
        str, chunk en castellano
        """
        
        system_prompt = '''You are a good translator to spanish.
                           Given the next chunk translate to spanish:
                           
                           <chunk>
                           {chunk}
                           </chunk>
                           
                           Just give the traslation, do not comment anything.
                           If the chunk is already in spanish, repeat the chunk.
                           '''
        
        prompt = ChatPromptTemplate.from_template(system_prompt)
        
        messages = prompt.format_messages(chunk=chunk)

        response = self.llm.invoke(messages)
            
        return response.content
    
    
    
    def create_vectorstore(self, chunks: List[Document]) -> None:
        
        """
        Crea una DB vectorial de Chroma para guardar los chunks.
        
        Params:
        chunks: lista de chunks para guardar
        
        Return:
        None
        """
        
        vectordb =  Chroma.from_documents(chunks, 
                                          self.embeddings, 
                                          persist_directory='../data/chroma_db',
                                          collection_name=self.collection_name)
    
       
    
    def create_bm25_retriever(self, chunks: List[Document]) -> None:
        
        """
        Crea un retriever BM25 para los chunks dados.
        
        Params:
        chunks: lista de chunks para guardar
        
        Return:
        None
        """
        
        bm25_retriever = BM25Retriever.from_documents(chunks)
        
        
        # guardar objeto bm25 
        with open(f'../data/{self.collection_name}_bm25', 'wb') as bm25_file:
            pickle.dump(bm25_retriever, bm25_file)
    
    
    
    def store_to_db(self, file_paths: list) -> None:
        
        """
        Proceso completo de guardado en DB desde documento.
        
        Params:
        file_paths: lista de strings, rutas de los archivos para guardar en Chroma y BM25
        
        Return:
        None
        """
                
        chunks = self.process_document(file_paths)
                
        vectorstore = self.create_vectorstore(chunks)
        
        bm25 = self.create_bm25_retriever(chunks)

In [4]:
vectordb = VectorDB('design')

In [None]:
vectordb.store_to_db(['../data/thinking_systems_from_donella_meadows.pdf'])

Chunking PDF file:  93%|█████████▎| 181/194 [1:24:18<05:07, 23.62s/it]

## 4 - Recuperar desde VectorDB


In [None]:
def ensemble_retriever(collection_name: str) -> ContextualCompressionRetriever:
    
    """
    Recuperación desde ChromaDB y BM25.
    
    Params:
    collection_name: str, coleccion a ser usada 

    Return:
    ContextualCompressionRetriever, ChromaDB+BM25+Filter+ReRanker 
    """
    
    embeddings = OpenAIEmbeddings()
    
    # carga chromaDB
    retriver_chroma = Chroma(persist_directory='../data/chroma_db',
                             collection_name=collection_name, 
                             embedding_function=embeddings)
    
    retriver_chroma = retriver_chroma.as_retriever(search_type='mmr', search_kwargs={'k':20, 
                                                                                     'lambda_mult': 0.5})
    
    
    # carga BM25
    with open(f'../data/{collection_name}_bm25', 'rb') as bm25_file:
        bm25_retriever = pickle.load(bm25_file)
            
    
    bm25_retriever.k = 10
        
    ensemble_retriever = EnsembleRetriever(retrievers=[retriver_chroma, bm25_retriever],
                                           weights=[0.5, 0.5])

    redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)

    reranker = FlashrankRerank()

    pipeline_compressor = DocumentCompressorPipeline(transformers=[redundant_filter, reranker])

    compression_pipeline = ContextualCompressionRetriever(base_compressor=pipeline_compressor,
                                                          base_retriever=ensemble_retriever)

    return compression_pipeline

In [7]:
%%time

retriever = ensemble_retriever('design')

CPU times: user 225 ms, sys: 85.7 ms, total: 311 ms
Wall time: 319 ms


In [8]:
%%time

response = retriever.invoke('¿que es un sistema complejo?')

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


CPU times: user 29.5 s, sys: 2.35 s, total: 31.9 s
Wall time: 9.09 s


In [9]:
len(response)

3

In [10]:
[e.page_content for e in response]

['<documento> FUENTE: Thinking Systems From Donella Meadows. <chunk>\nEl fragmento profundiza en el concepto de sistemas, enfatizando que un sistema es un conjunto interconectado de elementos organizados para lograr un propósito específico, con ejemplos como el sistema digestivo que ilustran esta definición. Destaca la complejidad y las interrelaciones dentro de los sistemas, sugiriendo que entender un sistema requiere reconocer sus elementos, interconexiones y función. Esta comprensión es crucial para el diseño y la estrategia, ya que informa cómo los sistemas pueden ser organizados y gestionados eficazmente para lograr los resultados deseados.\n\n/emdash.cap UNO /emdash.cap \nLos Fundamentos\nAún no he visto ningún problema, por complicado que sea, que, cuando se \nmira de la manera correcta, no se vuelva aún más complicado.\n—Poul Anderson1 \nMás que la Suma de sus Partes\nUn sistema no es solo cualquier colección de cosas. Un sistema* es un conjunto \ninterconectado de elementos qu

In [None]:
from IPython.display import display, Markdown

for i, e in enumerate(response, 1):
    display(Markdown(f"**Fragmento {i}:**\n\n{e.page_content.replace('<documento>', '').replace('</chunk>', '').replace('<chunk>', '---')}\n\n---"))