Ref: https://python.langchain.com/docs/tutorials/rag/

### Set up the model, and which embeddings and vector stores to use

In [1]:
from langchain_ollama import ChatOllama
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings # we'll locally use huggingface embeddings
from langchain_core.vectorstores import InMemoryVectorStore

# https://python.langchain.com/docs/integrations/chat/ollama/
llm = ChatOllama(
    model="llama3.2:3b",
    temperature=0,
    # other params...
)

embeddings = HuggingFaceEndpointEmbeddings()
vector_store = InMemoryVectorStore(embeddings)

# template = ChatPromptTemplate.from_template(
#     "What happens when an unstoppable force meets an immovable object?"
# )
# chain = template | llm

# response = chain.invoke({})
# print(response)

  from .autonotebook import tqdm as notebook_tqdm


### [OPTIONAL] On kaggle, for local calls, tinyllama is recommended, and otherwise, you may use the 8B llama model as an API call

In [None]:
# from dotenv import load_dotenv
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline # to get models
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings # locally use embeddings
# embeddings = HuggingFaceEndpointEmbeddings()

# load_dotenv()

# https://python.langchain.com/docs/integrations/chat/huggingface/#huggingfacepipeline
llm = HuggingFacePipeline.from_model_id(
    model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    task="text-generation",
    pipeline_kwargs=dict(
        max_new_tokens=512,
        do_sample=False,
        repetition_penalty=1.03,
    ),
)

llm = ChatHuggingFace(llm=llm)


### Set up indexing

Indexing is basically how you organise your data for later use. This involves gathering it first, and then converting that to embeddings, and storing that into a vector store DB, so we are able to efficiently index it later on.

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader


# 1. get the data

# Only keep post title, headers, and content from the full HTML.
bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

assert len(docs) == 1 # makes sure docs contains only one document
print(f"Total characters: {len(docs[0].page_content)}")


# 2. split the document into more manageable chunks

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)
print(f"Split blog post into {len(all_splits)} sub-documents.")


# 3. store it into a vector store

document_ids = vector_store.add_documents(documents=all_splits)
print(document_ids[:3])

USER_AGENT environment variable not set, consider setting it to identify your requests.


Total characters: 43130
Split blog post into 66 sub-documents.




['a8140ad2-bb83-4072-84a1-e4c55109de6c', '22f70386-7c12-4ea2-a888-2c04d6e45ac4', 'f3512aef-4192-4fee-a0b9-4cefb08a6bc5']


### The fun part: Information retrieval and generation

Our retrieval step simply runs a similarity search using the input question, and the generation step formats the retrieved context and original question into a prompt for the chat model.

In [4]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
)

question = "..."

retrieved_docs = vector_store.similarity_search(question)                   # retrieve
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)      # generate

chain = prompt | llm
answer = chain.invoke({"question": "What is Modular Reasoning?", "context": docs_content})

print(answer)

content='Modular Reasoning (MRKL) is a neuro-symbolic architecture for autonomous agents that uses a collection of "expert" modules, which can be neural or symbolic, to accomplish tasks. These modules are connected by a general-purpose Large Language Model (LLM) that routes inquiries to the best suitable expert module. The goal is to leverage the strengths of both symbolic and neural approaches to improve efficiency and accuracy in complex tasks.' additional_kwargs={} response_metadata={'model': 'llama3.2:3b', 'created_at': '2025-03-16T11:49:18.076984622Z', 'done': True, 'done_reason': 'stop', 'total_duration': 7140794736, 'load_duration': 2062254912, 'prompt_eval_count': 665, 'prompt_eval_duration': 2909000000, 'eval_count': 86, 'eval_duration': 2168000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)} id='run-47ab1585-a602-4c50-a04a-68ef0ca10055-0' usage_metadata={'input_tokens': 665, 'output_tokens': 86, 'total_tokens': 751}
