# Building RAG Chatbots with LangChain

## SETUP

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment.
# The return value is bound to `_` so the cell shows no output.
_ = load_dotenv(find_dotenv())

# Credentials for the two external services used further down.
# Each value is None when the corresponding variable is not set.
OPENAI_API_KEY, PINECONE_API_KEY = (
    os.environ.get(var_name) for var_name in ("OPENAI_API_KEY", "PINECONE_API_KEY")
)

## CHATBOT (NO RAG)

In [2]:
import os
from langchain_openai.chat_models import ChatOpenAI

In [3]:
# Chat model client reused by every later cell in this section;
# it authenticates with the key read in the SETUP cell.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    api_key=OPENAI_API_KEY,
)

In [4]:
from langchain.schema import SystemMessage, HumanMessage, AIMessage

# Seed conversation: one system instruction, then alternating human / AI turns.
# Later cells append to this same list, so it accumulates the full chat history.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you doint today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand string theory"),
]

In [5]:
# Send the whole conversation to the chat model and keep the reply message.
res = llm.invoke(messages)
# Display only the text of the reply.
res.content

'String theory is a theoretical framework that seeks to unify the four fundamental forces of nature - gravity, electromagnetism, the weak nuclear force, and the strong nuclear force. It proposes that the most basic building blocks of the universe are not point-like particles, but rather tiny, vibrating strings. These strings can vibrate at different frequencies, giving rise to different particles and forces.\n\nOne of the key ideas in string theory is that the universe has more than the usual three spatial dimensions that we experience in everyday life. In fact, string theory requires at least 10 dimensions to be consistent. The extra dimensions are thought to be compactified, meaning they are curled up and not directly observable at our macroscopic scale.\n\nString theory has the potential to provide a unified description of all fundamental forces and particles in the universe, including gravity. However, it is still a work in progress and has not yet been experimentally confirmed. Re

In [6]:
# Record the model's reply in the history so the next request includes it.
# NOTE: re-running this cell adds the same reply again.
messages += [res]

In [7]:
# Inspect the conversation history, which now ends with the model's reply appended in the previous cell.
messages

[SystemMessage(content='You are a helpful assistant.'),
 HumanMessage(content='Hi AI, how are you doint today?'),
 AIMessage(content="I'm great thank you. How can I help you?"),
 HumanMessage(content="I'd like to understand string theory"),
 AIMessage(content='String theory is a theoretical framework that seeks to unify the four fundamental forces of nature - gravity, electromagnetism, the weak nuclear force, and the strong nuclear force. It proposes that the most basic building blocks of the universe are not point-like particles, but rather tiny, vibrating strings. These strings can vibrate at different frequencies, giving rise to different particles and forces.\n\nOne of the key ideas in string theory is that the universe has more than the usual three spatial dimensions that we experience in everyday life. In fact, string theory requires at least 10 dimensions to be consistent. The extra dimensions are thought to be compactified, meaning they are curled up and not directly observable

In [8]:
# create a new user prompt
prompt = HumanMessage(
    content="Why do physicists believe it can produce a 'unified theory'? "
)

# Add the question to the history, then send the whole conversation.
messages.append(prompt)
# Use `invoke` rather than calling the model object directly: `llm(messages)`
# goes through the deprecated `__call__` path and emits a deprecation warning.
res = llm.invoke(messages)
# Keep the reply in the history for subsequent turns.
messages.append(res)

  warn_deprecated(


In [9]:
# Inspect the conversation history after the follow-up question and its reply were appended.
messages

[SystemMessage(content='You are a helpful assistant.'),
 HumanMessage(content='Hi AI, how are you doint today?'),
 AIMessage(content="I'm great thank you. How can I help you?"),
 HumanMessage(content="I'd like to understand string theory"),
 AIMessage(content='String theory is a theoretical framework that seeks to unify the four fundamental forces of nature - gravity, electromagnetism, the weak nuclear force, and the strong nuclear force. It proposes that the most basic building blocks of the universe are not point-like particles, but rather tiny, vibrating strings. These strings can vibrate at different frequencies, giving rise to different particles and forces.\n\nOne of the key ideas in string theory is that the universe has more than the usual three spatial dimensions that we experience in everyday life. In fact, string theory requires at least 10 dimensions to be consistent. The extra dimensions are thought to be compactified, meaning they are curled up and not directly observable

## DEALING WITH HALLUCINATIONS

In [10]:
# create a new user prompt
prompt = HumanMessage(content="What is so special about Llama 3?")
messages.append(prompt)

res = llm(messages)
res.content

'I\'m not aware of any specific significance or special characteristics associated with "Llama 3." It\'s possible that it could be a reference to something specific in a particular context, such as a scientific study, a product name, a code name, or even a fictional character. If you provide more context or information, I may be able to help you further.'

In [11]:
# add latest AI response to messages
messages.append(res)

# now create a new user prompt
prompt = HumanMessage(content="Can you tell me about the LLMChain in LangChain?")
# add to messages
messages.append(prompt)

# send to OpenAI
# Use `invoke` rather than calling the model object directly: `llm(messages)`
# goes through the deprecated `__call__` path and emits a deprecation warning.
res = llm.invoke(messages)
# Display only the text of the reply.
res.content

'I\'m sorry, but it seems there might be a misunderstanding or confusion with the terms you mentioned. As of my current knowledge, there is no widely recognized concept or technology known as "LLMChain" in the context of LangChain or any other related field. If you can provide more details or clarify the context, I would be happy to try to assist you further.'

In [12]:
# Hand-written context passages about LangChain, one string per passage.
# Each passage is built from adjacent string literals purely to keep the
# source lines short; the resulting text is unchanged.
llmchain_information = [
    (
        "A LLMChain is the most common type of chain. "
        "It consists of a PromptTemplate, a model (either an LLM or a ChatModel), "
        "and an optional output parser. "
        "This chain takes multiple input variables, "
        "uses the PromptTemplate to format them into a prompt. "
        "It then passes that to the model. "
        "Finally, it uses the OutputParser (if provided) "
        "to parse the output of the LLM into a final format."
    ),
    (
        "Chains is an incredibly generic concept which returns to a sequence of "
        "modular components (or other chains) combined in a particular way "
        "to accomplish a common use case."
    ),
    (
        "LangChain is a framework for developing applications powered by language models. "
        "We believe that the most powerful and differentiated applications will not only "
        "call out to a language model via an api, but will also: "
        "(1) Be data-aware: connect a language model to other sources of data, "
        "(2) Be agentic: Allow a language model to interact with its environment. "
        "As such, the LangChain framework is designed with the objective in mind "
        "to enable those types of applications."
    ),
]

# Single newline-separated blob that gets pasted into the augmented prompt.
passage_separator = "\n"
source_knowledge = passage_separator.join(llmchain_information)

In [13]:
# The question that the un-augmented model could not answer earlier.
query = "Can you tell me about the LLMChain in LangChain?"

# Prompt layout: instruction, blank line, the context passages, blank line,
# then the query. Newlines are written explicitly so the layout is visible.
augmented_prompt = (
    "Using the contexts below, answer the query.\n"
    "\n"
    "Contexts:\n"
    f"{source_knowledge}\n"
    "\n"
    f"Query: {query}"
)

In [16]:
# Wrap the context-augmented prompt as a user message and add it to the history.
prompt = HumanMessage(content=augmented_prompt)
messages.append(prompt)

# Use `invoke` rather than calling the model object directly: `llm(messages)`
# goes through the deprecated `__call__` path and emits a deprecation warning.
res = llm.invoke(messages)
# print (rather than a bare expression) so newlines in the reply render as line breaks.
print(res.content)

The LLMChain, within the LangChain framework, is a common type of chain that plays a crucial role in connecting various components to utilize language models effectively. Here are the key points about the LLMChain within the LangChain context:

1. **Components**: The LLMChain consists of the following components:
   - **PromptTemplate**: A template used to format multiple input variables into a prompt for the language model.
   - **Model**: The model used in the chain can be either a Large Language Model (LLM) or a ChatModel.
   - **Optional Output Parser**: An optional component that parses the output of the language model into a final format.

2. **Functionality**: The LLMChain takes multiple input variables, formats them using the PromptTemplate, and passes them to the model for processing. The model generates an output based on the input prompt, and if provided, the Output Parser can further process the model's output into a desired format.

3. **Purpose**: The LLMChain is designed

## IMPORTING THE DATA

In [2]:
from datasets import load_dataset

# Load the "train" split of the pre-chunked arXiv papers dataset.
dataset = load_dataset("jamescalam/llama-2-arxiv-papers-chunked", split="train")

# Show the dataset summary (feature names and row count).
dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [3]:
# Look at the first record to see what a single chunk and its metadata fields contain.
dataset[0]

{'doi': '1102.0183',
 'chunk-id': '0',
 'chunk': 'High-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nTechnical Report No. IDSIA-01-11\nJanuary 2011\nIDSIA / USI-SUPSI\nDalle Molle Institute for Arti\x0ccial Intelligence\nGalleria 2, 6928 Manno, Switzerland\nIDSIA is a joint institute of both University of Lugano (USI) and University of Applied Sciences of Southern Switzerland (SUPSI),\nand was founded in 1988 by the Dalle Molle Foundation which promoted quality of life.\nThis work was partially supported by the Swiss Commission for Technology and Innovation (CTI), Project n. 9688.1 IFF:\nIntelligent Fill in Form.arXiv:1102.0183v1  [cs.AI]  1 Feb 2011\nTechnical Report No. IDSIA-01-11 1\nHigh-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nJanuary 2011\nAbs

## BUILD THE KNOWLEDGE BASE

In [4]:
from pinecone import Pinecone

# Pinecone client; PINECONE_API_KEY is defined in the SETUP cell, which must
# have been run in the current kernel before this one.
pc = Pinecone(
    api_key=PINECONE_API_KEY,
)

In [5]:
from pinecone import ServerlessSpec

# Serverless deployment target passed to index creation below: AWS, us-east-1.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Echo the spec to confirm its settings.
spec

ServerlessSpec(cloud='aws', region='us-east-1')

In [8]:
import time

index_name = "llama-3-rag"
# Names of the indexes that already exist for this client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is missing (expected on the first run),
# so re-running this cell does not try to create it twice.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of ada 002
        metric="dotproduct",
        spec=spec,
    )
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Connect to the index, pause briefly, then display its current stats.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}