Ref: https://python.langchain.com/docs/tutorials/rag/

### **[LOCAL]** Set up the chat model, the embeddings, and the vector store

In [None]:
from langchain_ollama import ChatOllama
from langchain_huggingface.embeddings import HuggingFaceEmbeddings # local
from langchain_core.vectorstores import InMemoryVectorStore
from dotenv import load_dotenv
load_dotenv()

# Local setup: a chat model accessed through ChatOllama, local Hugging Face
# embeddings, and an in-memory vector store built on those embeddings.
# Ollama chat integration docs:
# https://python.langchain.com/docs/integrations/chat/ollama/
OLLAMA_MODEL = "llama3.2:3b"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

llm = ChatOllama(model=OLLAMA_MODEL, temperature=0)

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
vector_store = InMemoryVectorStore(embeddings)

# Optional smoke test for the chat model. ChatPromptTemplate is not imported
# in this cell, so import it before uncommenting.
# template = ChatPromptTemplate.from_template(
#     "What happens when an unstoppable force meets an immovable object?"
# )
# chain = template | llm
#
# response = chain.invoke({})
# print(response)

<hr>

### **[API CALLS]** On Kaggle, TinyLlama is recommended if you run the model locally; otherwise, you can use the 3B Llama model through an API call

In [None]:
!pip install accelerate \
            beautifulsoup4 \
            huggingface_hub \
            langchain \
            langchain-community \
            langchain-huggingface \
            python-dotenv \
            requests \
            sentence_transformers

In [None]:
from kaggle_secrets import UserSecretsClient
import os

# Read the Hugging Face token from the Kaggle secrets client and export it as
# an environment variable under the same name.
# NOTE(review): presumably the Hugging Face endpoint clients used later read
# this variable - confirm against the langchain-huggingface docs.
_HF_TOKEN_KEY = "HUGGINGFACEHUB_API_TOKEN"

user_secrets = UserSecretsClient()
os.environ[_HF_TOKEN_KEY] = user_secrets.get_secret(_HF_TOKEN_KEY)

In [None]:
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings # calls the api
from langchain_core.vectorstores import InMemoryVectorStore

load_dotenv()

# Hosted setup: the chat model and the embeddings are both reached through
# Hugging Face endpoint classes rather than local model classes.
# Docs: https://python.langchain.com/docs/integrations/chat/huggingface/
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

HF_REPO_ID = "meta-llama/Llama-3.2-3B-Instruct"

# Text-generation endpoint; it is wrapped in ChatHuggingFace below so that
# `llm` ends up bound to the chat wrapper.
hf_endpoint = HuggingFaceEndpoint(
    repo_id=HF_REPO_ID,
    task="text-generation",
    temperature=0,
    # Further generation settings, currently left disabled:
    # max_new_tokens=512,
    # do_sample=False,
    # repetition_penalty=1.03,
)
llm = ChatHuggingFace(llm=hf_endpoint)

# No model is named here, so the class's own default embedding model applies.
embeddings = HuggingFaceEndpointEmbeddings()
vector_store = InMemoryVectorStore(embeddings)


<hr>

### Set up indexing

Indexing is how you organise your data for later use. You first gather the data, then split it into manageable chunks, convert those chunks to embeddings, and store them in a vector store so that they can be searched efficiently later on.

In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader


from langchain_text_splitters import RecursiveCharacterTextSplitter

SOURCE_URL = "https://lilianweng.github.io/posts/2023-06-23-agent/"
KEPT_CSS_CLASSES = ("post-title", "post-header", "post-content")
CHUNK_SIZE = 1000     # characters per chunk
CHUNK_OVERLAP = 200   # characters shared between neighbouring chunks


# --- Step 1: load the blog post -------------------------------------------

# Restrict HTML parsing to the post title, headers, and content.
bs4_strainer = bs4.SoupStrainer(class_=KEPT_CSS_CLASSES)
loader = WebBaseLoader(
    web_paths=(SOURCE_URL,),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

# Optional on-disk cache of the loaded documents, to avoid re-downloading.
# Only unpickle files you created yourself.
# import pickle
# # ## store
# # with open("assets/loaded-docs.bin", "wb") as f:
# #     pickle.dump(docs, f)

# ## load
# with open("assets/loaded-docs.bin", "rb") as f:
#     docs = pickle.load(f)

# A single URL was requested, so exactly one document is expected back.
assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")


# --- Step 2: split the document into more manageable chunks ---------------

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # track each chunk's index in the original document
)
all_splits = text_splitter.split_documents(docs)
print(f"Split blog post into {len(all_splits)} sub-documents.")


# --- Step 3: store the chunks in the vector store -------------------------

document_ids = vector_store.add_documents(documents=all_splits)
print(document_ids[:3])

### The fun part: Information retrieval and generation

Our retrieval step simply runs a similarity search using the input question, and the generation step formats the retrieved context and original question into a prompt for the chat model.

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two slots: the user's question and the retrieved context.
prompt = ChatPromptTemplate.from_template(
    """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
)

# question = "what is your opinion on pineapple pizza"
question = "What is Modular Reasoning?"

# Retrieve: similarity search over the vector store using the question.
retrieved_docs = vector_store.similarity_search(question)
# Format: join the retrieved chunks into one context string for the prompt.
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

# Generate: fill the prompt and pass it to the model.
chain = prompt | llm
answer = chain.invoke({"question": question, "context": docs_content})

# Both `llm` setups in this notebook are chat models, so `answer` is a message
# object; print only its text. Fall back to the raw value in case a model that
# returns a plain string is swapped in.
print(getattr(answer, "content", answer))