In [1]:
import utils_rag as ur
from rich.markdown import Markdown as rich_Markdown
from langchain_huggingface import HuggingFaceEmbeddings
import uuid
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_chroma import Chroma

In [None]:
import getpass
import os
from langchain_google_genai import ChatGoogleGenerativeAI


# Ask for the Google API key interactively when it is not already present in
# the environment, so that no credential (or placeholder) is ever written
# into the notebook itself.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")


# Chat model used later for query expansion; temperature=0 keeps sampling
# as deterministic as the service allows.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
)

### Carga de documentos

In [None]:
# Source PDF; 'doc_1' is the tag handed to both loader helpers below.
file_name_aux_1 = 'Implementacion-y-aplicaciones-de-un-sistema-de-codificacion-automatica-de-la-lista-de-espera-chilena.pdf'

# Chunks returned by the helper, one list per modality (text, HTML tables,
# base64 images — as suggested by the variable names; see utils_rag).
(
    text_chunks_1,
    tables_html_chunks_1,
    images_b64_chunks_1,
) = ur.cargar_chunks_proc(file_name_aux_1, 'doc_1')

# Matching summaries, returned in the same three-way split.
(
    text_chunks_summary_1,
    tables_html_chunks_summary_1,
    images_b64_chunks_summary_1,
) = ur.cargar_summary_proc(file_name_aux_1, 'doc_1')


## Vector Store

In [None]:
# Embedding model (BAAI/bge-m3 through HuggingFace) that the Chroma vector
# store below receives as its embedding function.
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

In [None]:
# Base directory under which the Chroma DB and the docstore file are kept.
path_retriver = "retriever"
# Presumably creates the directory when it is missing (helper lives in
# utils_rag — confirm there).
ur.create_path_if_not_exists(path_retriver)

In [None]:
# Persistence locations, both under the retriever directory.
persist_dir = f"{path_retriver}/chroma_db_v1"        # Chroma directory
docstore_file = f"{path_retriver}/docstore_v1.pkl"   # docstore file

# Vector store that indexes the child chunks (the summaries).
vectorstore = Chroma(
    collection_name="multi_modal_rag",
    embedding_function=embed_model,
    persist_directory=persist_dir,
)

# Storage layer for the parent documents; metadata key that links a summary
# in the vector store to its parent in the docstore.
store = InMemoryStore()
id_key = "doc_id"

# Retriever tying both stores together; the docstore starts out empty.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [None]:
# Add texts: each summary is indexed in the vector store and the matching raw
# chunk is stored in the docstore under the same generated id.

# Ids are paired with summaries positionally, so both lists must line up;
# fail loudly instead of raising IndexError or silently mis-linking.
if len(text_chunks_summary_1) != len(text_chunks_1):
    raise ValueError(
        f"Text chunks ({len(text_chunks_1)}) and text summaries "
        f"({len(text_chunks_summary_1)}) must have the same length"
    )

doc_ids = [str(uuid.uuid4()) for _ in text_chunks_1]
print(f'ID de textos:{doc_ids}')
print(f'Numero de textos:{len(doc_ids)}')

summary_texts = [
    Document(
        page_content=doc_s['summary'],
        metadata={
            id_key: doc_id,
            'page_number': str(doc_s['page_number']),
            'file_name': doc_s['filename'],
            'file_type': 'Text',
        },
    )
    for doc_id, doc_s in zip(doc_ids, text_chunks_summary_1)
]

# Skip the write when there is nothing to add: Chroma is expected to reject
# an empty batch, which would abort the notebook for documents without text.
if summary_texts:
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, text_chunks_1)))

In [None]:
# Add tables: each table summary is indexed in the vector store and the
# matching HTML table is stored in the docstore under the same generated id.

# Ids are paired with summaries positionally, so both lists must line up;
# fail loudly instead of raising IndexError or silently mis-linking.
if len(tables_html_chunks_summary_1) != len(tables_html_chunks_1):
    raise ValueError(
        f"Table chunks ({len(tables_html_chunks_1)}) and table summaries "
        f"({len(tables_html_chunks_summary_1)}) must have the same length"
    )

doc_ids = [str(uuid.uuid4()) for _ in tables_html_chunks_1]
print(f'ID de tablas:{doc_ids}')
print(f'Numero de tablas:{len(doc_ids)}')

summary_tables = [
    Document(
        page_content=doc_s['summary'],
        metadata={
            id_key: doc_id,
            'page_number': str(doc_s['page_number']),
            'file_name': doc_s['filename'],
            'file_type': 'Table',
        },
    )
    for doc_id, doc_s in zip(doc_ids, tables_html_chunks_summary_1)
]

# Skip the write when the document has no tables: Chroma is expected to
# reject an empty batch, which would abort the notebook.
if summary_tables:
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(doc_ids, tables_html_chunks_1)))

In [None]:
# Add images: each image summary is indexed in the vector store and the
# matching base64 image is stored in the docstore under the same generated id.

# Ids are paired with summaries positionally, so both lists must line up;
# fail loudly instead of raising IndexError or silently mis-linking.
if len(images_b64_chunks_summary_1) != len(images_b64_chunks_1):
    raise ValueError(
        f"Image chunks ({len(images_b64_chunks_1)}) and image summaries "
        f"({len(images_b64_chunks_summary_1)}) must have the same length"
    )

doc_ids = [str(uuid.uuid4()) for _ in images_b64_chunks_1]
print(f'ID de imagenes:{doc_ids}')
print(f'Numero de imagenes:{len(doc_ids)}')

summary_images = [
    Document(
        page_content=doc_s['summary'],
        metadata={
            id_key: doc_id,
            'page_number': str(doc_s['page_number']),
            'file_name': doc_s['filename'],
            'file_type': 'Imagen',
        },
    )
    for doc_id, doc_s in zip(doc_ids, images_b64_chunks_summary_1)
]

# Skip the write when the document has no images: Chroma is expected to
# reject an empty batch, which would abort the notebook.
if summary_images:
    retriever.vectorstore.add_documents(summary_images)
    retriever.docstore.mset(list(zip(doc_ids, images_b64_chunks_1)))

In [None]:
# Hand the in-memory docstore to the helper together with the docstore file
# path, so it can be reloaded later with ur.load_docstore.
ur.save_docstore(store, docstore_file)

In [None]:
query="que es deekseek?"
# 1. Retrieve documents.
# MultiVectorRetriever does not accept extra keyword arguments on invoke()
# (a `top_k=` there is silently dropped); the number of vector-store hits is
# controlled through `search_kwargs["k"]` instead.
retriever.search_kwargs["k"] = 5
docs = retriever.invoke(query)
print(docs)

In [None]:
# Show the second retrieved document (fails if fewer than two came back).
docs[1]

## Cargar Datos

In [3]:
# Same embedding model that was used when the collection was populated.
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

path_retriver = "retriever"

# Persistence locations, both under the retriever directory.
persist_dir = f"{path_retriver}/chroma_db_v1"        # Chroma directory
docstore_file = f"{path_retriver}/docstore_v1.pkl"   # docstore file

# Vector store that indexes the child chunks, opened on the persisted directory.
vectorstore = Chroma(
    collection_name="multi_modal_rag",
    embedding_function=embed_model,
    persist_directory=persist_dir,
)

# Parent-document storage restored from the docstore file, plus the metadata
# key that links a summary in the vector store to its parent document.
store = ur.load_docstore(docstore_file)
id_key = "doc_id"

# Retriever built on top of the persisted vector store and loaded docstore.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

## Cómo se ajustan los documentos en un espacio vectorial

In [None]:
import umap
import numpy as np
from tqdm import tqdm

# Read the stored embedding vectors back from the vector store; no filter is
# passed, and only the 'embeddings' field is requested via `include`.
embeddings = retriever.vectorstore.get(include=['embeddings'])['embeddings']

In [None]:
# Fit UMAP on the stored embeddings; both seeds are fixed at 0 so repeated
# runs use the same random state.
umap_transform = umap.UMAP(random_state=0, transform_seed=0).fit(embeddings)
# Project the dataset embeddings with the fitted transform (helper from
# utils_rag; presumably returns the low-dimensional coordinates — confirm).
projected_dataset_embeddings = ur.project_embeddings(embeddings, umap_transform)

In [None]:
# Plot the raw query against the projected dataset embeddings.
query="como es la codificacion automatica de la lista de espera?"
ur.plot_umap_embeddings(umap_transform,projected_dataset_embeddings, retriever, query)

In [None]:
def query_expansion_response(query,llm):
    """Ask the LLM for a short hypothetical answer to ``query``.

    The answer is meant to be appended to the original query before
    retrieval (query expansion).

    Args:
        query: The user question, sent as the "user" message.
        llm: Chat model exposing ``invoke(messages)`` whose result has a
            ``content`` attribute.

    Returns:
        The ``content`` of the model response.
    """
    # The prompt is assembled from adjacent string literals, so every piece
    # must carry its own separating space (the last one used to be missing,
    # which produced "quesea" in the prompt).
    messages = [
        ("system", "Eres un útil asistente experto en diversos tipos de archivos de animales y inteligencia artificial."
        " Proporciona un ejemplo de respuesta a la pregunta, que podría encontrarse en un documento especifico, la respuesta que"
        " sea corta y concisa."),
        ("user", query),
    ]
    response = llm.invoke(messages)
    return response.content


In [None]:

# Expand the query with a hypothetical answer generated by the LLM.
query="como es la codificacion automatica de la lista de espera?"
query_exp = query_expansion_response(query,llm)

# Plot the query concatenated with its expansion against the dataset.
ur.plot_umap_embeddings(umap_transform,projected_dataset_embeddings, retriever, query+' '+query_exp)

## Re-ranking

In [None]:
query = "como es el modelo de reconocimiento de enfermedades?"

# Retrieval with the reranker switched on; results are printed by the helper.
a, b, c, d, e = ur.retrieved_documents(
    query,
    retriever,
    n_results=5,
    reranker=True,
    model_name_reranker='BAAI/bge-reranker-v2-m3',
    path_data="data/docs/",
    print_results=True,
)

In [None]:
query = "como es el modelo de reconocimiento de enfermedades?"

# Same retrieval with the reranker switched off; the return value is not
# bound, so it is shown as the cell output.
ur.retrieved_documents(
    query,
    retriever,
    n_results=5,
    reranker=False,
    model_name_reranker='BAAI/bge-reranker-v2-m3',
    path_data="data/docs/",
    print_results=True,
)

In [4]:
query = "como es el modelo de reconocimiento de enfermedades?"

# Retrieval without reranker and without printing; a, d and e feed
# generar_citas further down.
a, b, c, d, e = ur.retrieved_documents(
    query,
    retriever,
    n_results=5,
    reranker=False,
    model_name_reranker='BAAI/bge-reranker-v2-m3',
    path_data="data/docs/",
    print_results=False,
)

In [6]:
def generar_citas(a_list, d_list, e_list,use_reranker=False):
    """Build citation records and a printable context window.

    The three lists are walked in parallel (iteration stops at the shortest).

    Args:
        a_list: Per-result payloads. For 'Imagen' and 'Table' results the
            content comes from here: the element itself, or its item at
            index 1 when ``use_reranker`` is true.
        d_list: Per-result type labels; anything other than 'Imagen' or
            'Table' takes its content from the 'text' entry of ``e_list``.
        e_list: Per-result mappings read with ``.get`` for 'text',
            'page_number' and 'filename' (missing keys yield '').
        use_reranker: Whether ``a_list`` elements carry the content at
            index 1 instead of being the content themselves.

    Returns:
        A ``(resultado, contexto)`` tuple: the list of citation dicts (keys
        'contenido', 'page_number', 'filename', 'file_type') and the
        formatted context string built from them. Results whose content is
        ``None`` are left out of both.
    """
    resultado = []
    for payload, tipo, meta in zip(a_list, d_list, e_list):
        # Images and tables carry their content in the payload; everything
        # else is read from the metadata mapping.
        if tipo in ('Imagen', 'Table'):
            contenido = payload[1] if use_reranker else payload
        else:
            contenido = meta.get('text', '')

        page_number = meta.get('page_number', '')
        filename = meta.get('filename', '')

        if contenido is None:
            continue

        resultado.append({
            'contenido': contenido,
            'page_number': page_number,
            'filename': filename,
            'file_type': tipo,
        })

    # Assemble the context window piece by piece and join once at the end.
    partes = ["============================== Ventana de Contexto=======================================\n\n"]
    for numero, cita in enumerate(resultado, start=1):
        partes.append(f"----------------------------Inicio Contexto {numero}--------------------------------\n\n")
        partes.append(f"Página: {cita['page_number']}\n\n")
        partes.append(f"Archivo: {cita['filename']}\n\n")
        partes.append(f"Tipo: {cita['file_type']}\n\n")
        partes.append(f"Contenido: {cita['contenido']}\n\n")
        partes.append(f"----------------------------Fin Contexto {numero}--------------------------------\n\n")
    partes.append("==============================Fin Ventana de Contexto =======================================\n\n")
    contexto = "".join(partes)

    return resultado, contexto



# Build citations from the non-reranked retrieval above (use_reranker keeps
# its default of False, matching reranker=False in that call).
resultado, contexto = generar_citas(a, d, e)



In [7]:
# Raw context string, shown as the cell output.
contexto



In [8]:
# Same import as in the first cell; repeated here so this cell can run on its own.
from rich.markdown import Markdown as rich_Markdown
# Wrap the context string in a rich Markdown object for display.
rich_Markdown(contexto)

In [None]:
# File name recorded for the first citation (fails if there are no citations).
resultado[0]['filename']

In [None]:
# Directory holding the source documents; same value passed as path_data to
# ur.retrieved_documents above.
path_datos = "data/docs/"
# Pass the first citation's file path and page number to the plotting helper
# (presumably renders that page — see utils_rag).
ur.plot_multiple_pages(path_datos + resultado[0]['filename'], resultado[0]['page_number'])