In [2]:
import os

from langchain_community.document_loaders import PubMedLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_core.documents import Document


In [3]:
# Cargar documentos relevantes desde PubMed

loader = PubMedLoader(query="overweight OR diet OR cholesterol OR nutrition OR exercice", load_max_docs=3000)

docs = loader.load()

# Text splitting
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False
)
docs_split = text_splitter.split_documents(docs)

# print(docs_split[0])



In [4]:
def convert_and_filter_metadata(doc):
    """Return a copy of ``doc`` whose metadata holds only flat, simple values.

    Metadata entries whose value is a str, int, float or bool are kept as-is.
    Entries whose value is a dict are collapsed into a single
    ``"key: value; key: value"`` string. Entries of any other type
    (for example ``None`` or lists) are dropped.

    Args:
        doc: Object exposing ``page_content`` and a ``metadata`` mapping.

    Returns:
        A new ``Document`` with the same page content and the cleaned metadata.
    """
    cleaned = {}
    for key, value in doc.metadata.items():
        if isinstance(value, dict):
            # Nested mapping: serialise it into one string.
            cleaned[key] = "; ".join(f"{k}: {v}" for k, v in value.items())
        elif isinstance(value, (str, int, float, bool)):
            cleaned[key] = value
        # Any other value type is intentionally skipped.

    return Document(page_content=doc.page_content, metadata=cleaned)

# Clean the metadata of every chunk, preserving chunk order.
docs_processed = list(map(convert_and_filter_metadata, docs_split))

In [7]:

# Embedding model used both to index the chunks and to embed queries.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Directory where the Chroma index is persisted.
CHROMA_DIR = "./chroma_db1"

# Chroma.from_documents adds documents to whatever collection already lives in
# persist_directory, so re-running this cell would index every chunk again and
# the retriever would return duplicates. Reopen the existing index when there
# is one; otherwise build it from scratch.
# NOTE: delete CHROMA_DIR to force a rebuild after changing the query, the
# splitter settings or the embedding model.
if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectorstore = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings
    )
else:
    vectorstore = Chroma.from_documents(
        documents=docs_processed,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )


# Retriever instance using the "mmr" search type, configured with k=5 and
# fetch_k=20 (see the vector store documentation for their exact semantics).
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 20})