In [12]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFaceEmbeddings
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_chroma import Chroma
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
hf_token = os.getenv("HF_TOKEN")

llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
    huggingfacehub_api_token=hf_token
)
llama = ChatHuggingFace(llm=llm)

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\piamp\.cache\huggingface\token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load

In [61]:
markdown_loader = UnstructuredMarkdownLoader("content/api_plazos_compartidos.md", mode="elements")
data = markdown_loader.load()
print(len(data[0].page_content))
print(data[0].page_content[:500])
print(data[0].metadata)

26
Fixed-Term Shared Deposits
{'source': 'content/api_plazos_compartidos.md', 'category_depth': 0, 'last_modified': '2024-07-31T20:45:43', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': 'content', 'filename': 'api_plazos_compartidos.md', 'category': 'Title'}


In [62]:
for doc in data:
    print(doc)

page_content='Fixed-Term Shared Deposits' metadata={'source': 'content/api_plazos_compartidos.md', 'category_depth': 0, 'last_modified': '2024-07-31T20:45:43', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': 'content', 'filename': 'api_plazos_compartidos.md', 'category': 'Title'}
page_content='Description' metadata={'source': 'content/api_plazos_compartidos.md', 'category_depth': 1, 'last_modified': '2024-07-31T20:45:43', 'languages': ['eng'], 'parent_id': 'a28cca33de3ece6f894f77c8c3d7a582', 'filetype': 'text/markdown', 'file_directory': 'content', 'filename': 'api_plazos_compartidos.md', 'category': 'Title'}
page_content='Fixed-Term Shared Deposits is a comprehensive backend system designed to manage fixed-term deposit accounts. It provides a robust API for creating, updating, and retrieving deposit information, calculating daily interest rates, and handling user authentication and authorization.' metadata={'source': 'content/api_plazos_compartidos.md', 'last_modi

In [63]:
filtered_data = filter_complex_metadata(data, allowed_types=(str, int, float, bool))
print(filtered_data[0].metadata)

{'source': 'content/api_plazos_compartidos.md', 'category_depth': 0, 'last_modified': '2024-07-31T20:45:43', 'filetype': 'text/markdown', 'file_directory': 'content', 'filename': 'api_plazos_compartidos.md', 'category': 'Title'}


### Split

In [64]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=0, add_start_index=True
)
all_splits = text_splitter.split_documents(filtered_data)

print(len(all_splits))
print(len(all_splits[0].page_content))
print(all_splits[0].metadata)

75
26
{'source': 'content/api_plazos_compartidos.md', 'category_depth': 0, 'last_modified': '2024-07-31T20:45:43', 'filetype': 'text/markdown', 'file_directory': 'content', 'filename': 'api_plazos_compartidos.md', 'category': 'Title', 'start_index': 0}


In [65]:
for doc in all_splits:
    print(doc)

page_content='Fixed-Term Shared Deposits' metadata={'source': 'content/api_plazos_compartidos.md', 'category_depth': 0, 'last_modified': '2024-07-31T20:45:43', 'filetype': 'text/markdown', 'file_directory': 'content', 'filename': 'api_plazos_compartidos.md', 'category': 'Title', 'start_index': 0}
page_content='Description' metadata={'source': 'content/api_plazos_compartidos.md', 'category_depth': 1, 'last_modified': '2024-07-31T20:45:43', 'parent_id': 'a28cca33de3ece6f894f77c8c3d7a582', 'filetype': 'text/markdown', 'file_directory': 'content', 'filename': 'api_plazos_compartidos.md', 'category': 'Title', 'start_index': 0}
page_content='Fixed-Term Shared Deposits is a comprehensive backend system designed to manage fixed-term deposit accounts. It provides a robust API for creating, updating, and retrieving deposit information, calculating daily interest rates, and handling user authentication and authorization.' metadata={'source': 'content/api_plazos_compartidos.md', 'last_modified': '

### Store

In [66]:
vectorstore = Chroma.from_documents(
    all_splits,
    embedding=HuggingFaceEmbeddings(),
)

## RAG

### Retrieve

In [67]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

retrieved_docs = retriever.invoke("How do I Create An Entity")

print(len(retrieved_docs))
for doc in retrieved_docs:
    print(doc.page_content)

6
3. Create Entities within a Term
3. Create Entities within a Term
3. Create Entities within a Term
Create Entidad
Create Entidad
Create Entidad


In [6]:
message_template = """
Answer questions using the provided context only.

{question}

Context:
{context}
"""
prompt = ChatPromptTemplate.from_messages([("human", message_template)])

In [39]:
historial = []

def handle_message(message):
    chain = {"context": retriever, "question": RunnablePassthrough()} | prompt
    chained = chain.invoke(message)
    historial.append(chained.to_messages()[0])
    response = llama.invoke(historial)
    historial.append(response)
    return response.content

In [41]:
while True:
    message = input("You: ")
    if message == "exit":
        break

    response = handle_message(message)
    print("Bot:", response)

[HumanMessage(content="\nAnswer questions using the provided context only.\n\nhola\n\nContext:\n[Document(metadata={'row': 286, 'source': 'content/cofee.csv'}, page_content='date: 2024-04-14\\ndatetime: 2024-04-14 12:29:06.877\\ncash_type: cash\\ncard: \\nmoney: 30.0\\ncoffee_name: Cortado'), Document(metadata={'row': 285, 'source': 'content/cofee.csv'}, page_content='date: 2024-04-14\\ndatetime: 2024-04-14 12:27:08.951\\ncash_type: cash\\ncard: \\nmoney: 30.0\\ncoffee_name: Cortado'), Document(metadata={'row': 83, 'source': 'content/cofee.csv'}, page_content='date: 2024-03-11\\ndatetime: 2024-03-11 11:24:51.565\\ncash_type: cash\\ncard: \\nmoney: 30.0\\ncoffee_name: Cortado'), Document(metadata={'row': 138, 'source': 'content/cofee.csv'}, page_content='date: 2024-03-20\\ndatetime: 2024-03-20 11:41:16.403\\ncash_type: card\\ncard: ANON-0000-0000-0012\\nmoney: 28.9\\ncoffee_name: Cortado'), Document(metadata={'row': 220, 'source': 'content/cofee.csv'}, page_content='date: 2024-04-03\\nd