# RAG: Retrieval-Augmented Generation

In [1]:
! pip install langchain_community tiktoken langchainhub chromadb langchain sentence-transformers llama-cpp-python

Collecting langchain_community
  Downloading langchain_community-0.0.28-py3-none-any.whl.metadata (8.3 kB)
Collecting tiktoken
  Downloading tiktoken-0.6.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.15-py3-none-any.whl.metadata (621 bytes)
Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain
  Downloading langchain-0.1.12-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl.metadata (11 kB)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.57.tar.gz (36.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.9/36.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (

`(2) LangSmith`

https://docs.smith.langchain.com/

In [None]:
import getpass
import os

# LangSmith tracing configuration, passed to LangChain through environment variables.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# Never hardcode the API key in the notebook: reuse it when it is already set in the
# environment, otherwise prompt for it without echoing the value.
if not os.environ.get('LANGCHAIN_API_KEY'):
    os.environ['LANGCHAIN_API_KEY'] = getpass.getpass('LangSmith API key: ')

## Part 1: Overview

### References:
- [RAG quickstart](https://python.langchain.com/docs/use_cases/question_answering/quickstart)
- [WebBaseLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html)
- [SoupStrainer](https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=SoupStrainer#soupstrainer)
- [Recursive Character Text Splitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)
- [Chroma vector database](https://python.langchain.com/docs/integrations/vectorstores/chroma)
- [LlamaCpp Git repo](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file)
- [LlamaCpp with Langchain](https://python.langchain.com/docs/integrations/llms/llamacpp)
- [LlamaCpp docs](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama)
- [Callbacks](https://python.langchain.com/docs/modules/callbacks/)

In [None]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_core.language_models import LLM
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

#### INDEXING ####

# Load Documents
# 'parse_only' restricts HTML parsing to the parts of the page that matter for us.
# In the source of https://lilianweng.github.io/posts/2023-06-23-agent/ the relevant content
# sits in three CSS classes: post-header, post-title and post-content.
# Only elements carrying one of these classes are kept.
post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_strainer},
)
docs = loader.load()
# Number of documents returned by the loader
len(docs)

In [None]:
# Split
# Exercise: initialize a recursive character text splitter with a chunk size of 1000 and a chunk overlap of 200.
text_splitter = ...
splits = text_splitter.split_documents(docs)

# Embed
# Exercise: load the 'all-mpnet-base-v2' sentence-transformer embedder with HuggingFaceEmbeddings,
# passing 'model_kwargs_value' as the model_kwargs parameter.
# 'all-mpnet-base-v2' takes a sentence as input and outputs an embedding of dimension 768.
model_kwargs_value = {'device': 'cpu'}
embedding = ...
# Exercise: create a Chroma vector database from the split documents, 'splits', and the embedding model, 'embedding'.
vectorstore = ...

# Initialize the retriever (default search settings)
retriever = vectorstore.as_retriever()

Downloading the Mistral 7B Instruct v0.2 LLM (GGUF format, Q4_K_S quantization) from the Hugging Face Hub into a local `model/` directory.

In [None]:
mkdir model

In [None]:
!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_S.gguf?download=true -O model/mistral-7b-instruct-v0.2.Q4_K_S.gguf

In [None]:
#### RETRIEVAL and GENERATION ####

# Prompt
# Reuse a ready-made RAG prompt published on the LangChain Hub instead of writing one from scratch.
# The prompt can be found here: https://smith.langchain.com/hub/rlm/rag-prompt-mistral
prompt = hub.pull("rlm/rag-prompt-mistral")
# Print the pulled prompt object so its template text can be inspected
print('The prompt used here is:\n', prompt)

In [None]:
# LLM
# Alternative (OpenAI-hosted model, not used in this notebook): llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# Stream generated tokens to stdout as they are produced.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Path of the GGUF model file downloaded into the 'model' directory
model_path = 'model/mistral-7b-instruct-v0.2.Q4_K_S.gguf'
# temp = the sampling temperature; the lower the value, the more deterministic the model is,
# i.e., for temp=0 the same question asked several times gets the same answer.
temp = .1
# maxtokens = the maximum number of tokens to be generated in the model's response
maxtokens = 1000
# top_p = nucleus sampling threshold controlling the diversity of the predictions: sampling is restricted to the
# smallest set of most probable tokens whose cumulative probability reaches top_p (1 keeps the full distribution).
top_p = 1
# n_ctx = the text context size, i.e. the max number of tokens the model can account for when processing a response.
# This includes the prompt and the response itself, so it must be large enough for both the question and the answer.
n_ctx = 2048

# Exercise: initialize a LlamaCpp LLM model by setting the params:
# model_path to the previously defined model_path
# the temperature to 0.1 (temp)
# the max_tokens to 1000 (maxtokens)
# top_p = 1
# the context size to 2048 (n_ctx)
# callback_manager to the previously defined callback_manager
# verbose to True; verbose is required to pass to the callback manager
llm = ...

In [None]:
# Post-processing
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a blank line.
    """
    page_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Chain
# The first step builds the prompt inputs from the incoming question string:
#   "context":  the question is sent to the retriever and the retrieved documents are joined by format_docs
#   "question": the question is forwarded unchanged by RunnablePassthrough
# The filled prompt is then passed to the LLM, and StrOutputParser turns the model output into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Question
rag_chain.invoke("What is Task Decomposition?")

In [None]:
rag_chain.invoke("Who is Beyonce?")

## Part 2: Indexing

In [None]:
# Documents
question = "What kinds of classes do I like?"
document = "My favorite class is NLP."

- [Count tokens for OpenAI-based LLM, i.e., GPT3.5, GPT4, etc.](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) considering [~4 char / token](https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them)
- [Count tokens for Open source LLMs, i.e., like Mistral 7B, llama, etc.](https://huggingface.co/docs/transformers/v4.38.2/en/model_doc/auto#transformers.AutoTokenizer)

In [None]:
import tiktoken

def num_tokens_from_string_openAI(string: str, encoding_name: str) -> int:
    """
    Count the tokens a tiktoken encoding produces for a text string.

    Args:
    -----
      string: str
        Text to tokenize
      encoding_name: str
        Name of the tiktoken encoding to look up
    Returns:
    --------
      int
        Number of token ids in the encoded text.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

num_tokens_from_string_openAI(question, "cl100k_base")

In [None]:
from transformers import LlamaTokenizerFast

import functools


@functools.lru_cache(maxsize=None)
def _load_llama_tokenizer(tokenizer_name: str):
    """Load a LlamaTokenizerFast once per name and reuse it on later calls."""
    return LlamaTokenizerFast.from_pretrained(tokenizer_name)


def num_tokens_from_string(string: str, tokenizer_name: str = "hf-internal-testing/llama-tokenizer") -> int:
    """
    Returns the number of tokens in a text string, using a Llama tokenizer.

    Args:
    -----
      string: str
        Input string text
      tokenizer_name: str
        Name or path of the tokenizer passed to LlamaTokenizerFast.from_pretrained.
        Defaults to the tokenizer originally hard-coded in this function.
    Returns:
    --------
      num_tokens: int
        the number of token ids produced by tokenizer.encode(string).

    NB: To get the tokenized text, use the command: tokenized_text = tokenizer.tokenize(string)
    NB: encode is called with its default arguments, so any tokens the tokenizer
        adds by default are part of the count.
    """
    # The tokenizer is cached so repeated calls do not reload it every time.
    tokenizer = _load_llama_tokenizer(tokenizer_name)
    tokenized_text_as_ids = tokenizer.encode(string)
    return len(tokenized_text_as_ids)
num_tokens_from_string(question)

In [None]:
# Exercise: load the 'all-mpnet-base-v2' sentence-transformer embedder with HuggingFaceEmbeddings,
# passing 'model_kwargs_value' as the model_kwargs parameter.
# The `...` placeholder keeps the cell syntactically valid until the exercise is filled in.
embd = ...
# Exercise: embed the question using the method 'embed_query'
query_result = ...
# Exercise: embed the document using the method 'embed_query'
document_result = ...
# Length of the question embedding (the overview cell notes 768 dimensions for 'all-mpnet-base-v2')
len(query_result)

[Cosine similarity](https://platform.openai.com/docs/guides/embeddings/frequently-asked-questions) is recommended for comparing OpenAI embeddings (a value of 1 indicates identical).

In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: first vector (sequence of numbers or 1-D numpy array).
        vec2: second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        When either vector has zero norm the similarity is undefined;
        0.0 is returned instead of dividing by zero (which would yield NaN).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the degenerate case: a zero vector has no direction.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Compute the cosine similarity between the embeddings of the question and document
similarity = ...
print("Cosine Similarity:", similarity)

[Document Loaders](https://python.langchain.com/docs/integrations/document_loaders/)

In [None]:
#### INDEXING ####

# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader
# Keep only the blog post's header, title and body while parsing the page.
blog_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": blog_strainer},
)
blog_docs = loader.load()

[Splitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)

This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""].

It takes in the large text then tries to split it by the first character \n\n. If the first split by \n\n is still large then it moves to the next character which is \n and tries to split by it. If it is still larger than our specified chunk size it moves to the next character in the set until we get a split that is less than our specified chunk size.

This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

In [None]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Exercise: initialize a recursive character text splitter with a chunk size of 300 and a chunk overlap of 50
# (smaller chunks than the 1000/200 used in the overview).
text_splitter = ...

# Exercise: split the blog_docs into chunks with the text splitter
splits = ...

[Vectorstores](https://python.langchain.com/docs/integrations/vectorstores/)

In [None]:
# Index
from langchain_community.vectorstores import Chroma

# Embed every chunk in 'splits' with the 'embd' model and store the vectors in a Chroma vector database.
# NOTE(review): no collection name or persist directory is passed here, and this notebook builds
# Chroma stores in several cells — confirm whether repeated from_documents calls end up in the
# same collection (which would mix and duplicate chunks).
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=embd)

# Expose the vector store as a retriever with the default search settings
retriever = vectorstore.as_retriever()

## Part 3: Retrieval

In [None]:
# Index
from langchain_community.vectorstores import Chroma
# Exercise: create a Chroma vector database from the split documents, 'splits', and the embedding model, 'embd'
vectorstore = ...
# Exercise: initialize the retriever by setting the parameter search_kwargs to {'k':3}
# k = the number of top relevant documents to be retrieved from the vector database
retriever = ...

In [None]:
# Exercise: use the method get_relevant_documents of the retriever to find the documents relevant to the query
query = "What is Task Decomposition?"
docs = ...

In [None]:
len(docs)

In [None]:
docs

## Part 4: Generation

![Screenshot 2024-02-12 at 1.37.38 PM.png](attachment:f9b0e284-58e4-4d33-9594-2dad351c569a.png)

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt
# Let us modify the https://smith.langchain.com/hub/rlm/rag-prompt-mistral prompt to something more customized.
# {question} and {context} are the template variables that are filled in when the prompt is invoked.
# NOTE(review): the instruction text below is sent to the model verbatim, so its wording
# (e.g. "precise that the answer is based on your own knowledge") shapes the answers — reword deliberately if needed.
template = """
<s> [INST] You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, precise that the answer is based on your own knowledge and not the context.
Be as much detailed as possible.
[/INST] </s>

[INST] Question: {question}
Context: {context}
Answer: [/INST]
"""

# Build a chat prompt from the template string
prompt = ChatPromptTemplate.from_template(template)
# Display the resulting prompt object
prompt

In [None]:
# LLM

# Initialize a LlamaCpp LLM model similar to the one previously initialized
llm = ...

In [None]:
# Chain
# create a chain of only 2 steps : prompt and then llm
chain = ...

In [None]:
# Run
# 'docs' holds the documents retrieved for this same question in Part 3; they are passed as-is (no format_docs).
chain.invoke({"context":docs,"question":"What is Task Decomposition?"})

[RAG chains](https://python.langchain.com/docs/expression_language/get_started#rag-search-example)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Exercise: initialize a RAG chain made of:
# 1 - the context (from the retriever) and the question (without any format_docs)
# 2 - the prompt
# 3 - the llm
# 4 - the output parser
# PS: use the rag_chain defined in the overview section (Part 1) as a model
rag_chain = ...

In [None]:
query = "What is Task Decomposition?"
# Test your rag chain with the query
...

In [None]:
query2 = "Who is Beyonce?"
# Test your rag chain with the query2
...