In [1]:
# Goal: learn how to navigate LangChain and build useful applications with it (specifically LCEL), and how to integrate different APIs into a coherent RAG application.

In [4]:
#Task 1: Installing Required Libraries

In [6]:
# pip install -qU langchain langchain-core langchain-community langchain-openai

In [8]:
# pip install -qU qdrant-client

In [5]:
#Task 2: Set Environment Variables

In [2]:
# NOTE(review): `os`, `openai` and `OpenAI` are not referenced by any cell visible in this notebook section — confirm they are still needed.
import os
import openai
from openai import OpenAI
# Load the dotenv IPython extension, then run its %dotenv magic.
# NOTE(review): presumably this reads a local .env file so that API credentials (e.g. OPENAI_API_KEY) end up in the process environment — confirm.
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [3]:
# Reload the dotenv extension, as suggested by the "already loaded" message printed by the previous cell.
%reload_ext dotenv

In [6]:
#Task 3: Initialize a Simple Chain using LCEL

In [7]:
from langchain_openai import ChatOpenAI

# Chat model shared by every chain built in this notebook; pinned to "gpt-4o".
# NOTE(review): presumably authenticates via the environment prepared in the dotenv cell — confirm.
openai_chat_model = ChatOpenAI(model="gpt-4o")

In [8]:
from langchain_core.prompts import ChatPromptTemplate

# Persona for the system message: a riddle-speaking, cheese-obsessed wizard.
system_template = (
    "You are a legendary and mythical Wizard. "
    "You speak in riddles and make obscure and pun-filled references to exotic cheeses."
)
# The human turn is passed through verbatim via the {content} placeholder.
human_template = "{content}"

# Two-message prompt: system persona first, then the user's text.
chat_prompt = ChatPromptTemplate.from_messages(
    [("system", system_template), ("human", human_template)]
)

In [9]:
#Our first Chain

In [10]:
# LCEL pipe: the formatted prompt is fed straight into the chat model.
chain = chat_prompt | openai_chat_model

In [11]:
# The input key must match the {content} placeholder of the human template.
# print() emits the whole returned message (content plus response/usage metadata), not just the text.
print(chain.invoke({"content": "Hello world!"}))

content='Greetings, traveler of the pixelated paths! As the digital dawn rises, may your journey be as smooth as brie and as adventurous as a Roquefort chase! What brings you to the realm of binary and sorcery today?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 49, 'prompt_tokens': 38, 'total_tokens': 87, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_50cad350e4', 'finish_reason': 'stop', 'logprobs': None} id='run-da687a9d-fc71-4413-a0ab-585fb999f7a3-0' usage_metadata={'input_tokens': 38, 'output_tokens': 49, 'total_tokens': 87, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


In [14]:
# Bare last expression: the notebook displays the returned AIMessage.
chain.invoke({"content" : "explain me Langchain like i am 5 year old"})

AIMessage(content="Ah, young seeker of knowledge, gather 'round and hold tight to your magical hat! \n\nImagine, if you will, a grand parade of cheese! Yes, a delightful procession where each cheese is a different flavor and brings something unique to the table, much like the many threads of a magical story. Now, picture yourself as a curious little mouse named Cheddar. You want to taste every cheese but don't know how to line them up just right for the perfect cheesy experience.\n\nLangchain is like your wise mouse friend, Monterey Jack, who helps you connect these cheeses in the best order. Each cheese represents a piece of information or a magical trick. Monterey Jack helps you figure out how to use one piece before another, making sure your journey from Muenster to Mozzarella is as delightful as discovering that the moon is made of Gouda after all! \n\nIn the world of wizards and spells, a Langchain organizes how different magical languages and abilities come together to create som

In [16]:
# Prompt template for the context-grounded question-answering chain.
# Placeholders:
#   {context} - the reference text the model is allowed to draw on
#   {query}   - the user's question
# The closing instruction restricts the model to the supplied context and asks
# for "I don't know" when the context does not contain the answer.
# (Grammar of the instruction fixed: "provide" -> "provided", "response" -> "respond".)
HUMAN_TEMPLATE = """
#CONTEXT:
{context}

QUERY:
{query}

Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know"
"""

# Toy knowledge base used as the {context} value for the grounded-answer chain:
# a short description of LCEL and its benefits, with paragraphs separated by blank lines.
CONTEXT = """
LangChain Expression Language or LCEL is a declarative way to easily compose chains together. There are several benefits to writing chains in this manner (as opposed to writing normal code):

Async, Batch, and Streaming Support Any chain constructed this way will automatically have full sync, async, batch, and streaming support. This makes it easy to prototype a chain in a Jupyter notebook using the sync interface, and then expose it as an async streaming interface.

Fallbacks The non-determinism of LLMs makes it important to be able to handle errors gracefully. With LCEL you can easily attach fallbacks to any chain.

Parallelism Since LLM applications involve (sometimes long) API calls, it often becomes important to run things in parallel. With LCEL syntax, any components that can be run in parallel automatically are.

Seamless LangSmith Tracing Integration As your chains get more and more complex, it becomes increasingly important to understand what exactly is happening at every step. With LCEL, all steps are automatically logged to LangSmith for maximal observability and debuggability.
"""

# Single human-message prompt built from HUMAN_TEMPLATE (no system persona here).
# NOTE: this rebinds `chat_prompt`, replacing the wizard prompt defined earlier;
# re-running the earlier `chain = chat_prompt | openai_chat_model` cell after this
# one would pick up this prompt instead.
chat_prompt = ChatPromptTemplate.from_messages([("human", HUMAN_TEMPLATE)])

# LCEL pipe: grounded prompt -> chat model.
chat_chain = chat_prompt | openai_chat_model

# Both placeholders of HUMAN_TEMPLATE must be supplied at invoke time.
print(
    chat_chain.invoke(
        {
            "query": "What is LangChain Expression Language?",
            "context": CONTEXT,
        }
    )
)

content='LangChain Expression Language (LCEL) is a declarative way to easily compose chains together. It provides several benefits such as full sync, async, batch, and streaming support, enabling prototyping in Jupyter notebooks and exposing chains as async streaming interfaces. LCEL allows for easy attachment of fallbacks to handle errors gracefully due to the non-determinism of LLMs. It supports parallelism, allowing components that can run in parallel to do so automatically. Additionally, LCEL offers seamless integration with LangSmith Tracing, logging all steps for maximal observability and debuggability as chains grow more complex.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 124, 'prompt_tokens': 274, 'total_tokens': 398, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_n

In [17]:
# #
# Task #4: Implement Naive RAG using LCEL

# Now we can make a naive RAG application that will help us bridge the gap between our Pythonic implementation and a fully LangChain powered solution!


In [18]:
#Putting the R in RAG: Retrieval 101

In [19]:
# In order to make our RAG system useful, we need a way to give the LLM the context that is most likely to answer our user's query.

# Let's tackle an immediate problem first: The Context Window.

# All (most) LLMs have a limited context window which is typically measured in tokens. This window is an upper bound of how much stuff we can stuff in the model's input at a time.

# Let's say we want to work off of a relatively large piece of source data - like the Ultimate Hitchhiker's Guide to the Galaxy. All 898 pages of it!

#     NOTE: It is recommended you do not run the following cells, they are purely for demonstrative purposes.


In [20]:
# #context = """
# EVERY HITCHHIKER'S GUIDE BOOK
# """


In [21]:
#We can leverage our tokenizer to count the number of tokens for us!

In [22]:
# import tiktoken

# enc = tiktoken.encoding_for_model("gpt-4o")

In [23]:
#len(enc.encode(context))

In [24]:


# 636144

# The full set comes in at a whopping 636,144 tokens.

# So, we have too much context. What can we do?

# Well, the first thing that might enter your mind is: "Use a model with more context window", and we could definitely do that! However, even gpt-4-128k wouldn't be able to fit that whole text in the context window at once.

# So, we can try splitting our document up into little pieces - that way, we can avoid providing too much context.

# We have another problem now.

# If we split our document up into little pieces, and we can't put all of them in the prompt. How do we decide which to include in the prompt?!

#     NOTE: Content splitting/chunking strategies are an active area of research and iterative development. There is no "one size fits all" approach to chunking/splitting at this moment. Use your best judgement to determine chunking strategies!

# In order to conceptualize the following processes - let's create a toy context set!
# TextSplitting aka Chunking

# We'll use the RecursiveCharacterTextSplitter to create our toy example.

# It will split based on the following rules:

#     Each chunk has a maximum size of 100 tokens
#     It will try and split first on the \n\n character, then on the \n, then on the <SPACE> character, and finally it will split on individual tokens.

# Let's implement it and see the results!


In [25]:
# import tiktoken
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# def tiktoken_len(text):
#     tokens = tiktoken.encoding_for_model("gpt-4o").encode(
#         text,
#     )
#     return len(tokens)

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size = 100,
#     chunk_overlap = 0,
#     length_function = tiktoken_len,
# )

In [26]:
# chunks = text_splitter.split_text(CONTEXT)

In [27]:
#len(chunks) o/p: 3

In [28]:
# for chunk in chunks:
#   print(chunk)
#   print("----")

In [29]:


# LangChain Expression Language or LCEL is a declarative way to easily compose chains together. There are several benefits to writing chains in this manner (as opposed to writing normal code):

# Async, Batch, and Streaming Support Any chain constructed this way will automatically have full sync, async, batch, and streaming support. This makes it easy to prototype a chain in a Jupyter notebook using the sync interface, and then expose it as an async streaming interface.
# ----
# Fallbacks The non-determinism of LLMs makes it important to be able to handle errors gracefully. With LCEL you can easily attach fallbacks to any chain.

# Parallelism Since LLM applications involve (sometimes long) API calls, it often becomes important to run things in parallel. With LCEL syntax, any components that can be run in parallel automatically are.
# ----
# Seamless LangSmith Tracing Integration As your chains get more and more complex, it becomes increasingly important to understand what exactly is happening at every step. With LCEL, all steps are automatically logged to LangSmith for maximal observability and debuggability.
# ----

# As shown in our result, we've split the text into chunks of at most 100 tokens each - cleanly separated on \n\n boundaries!


In [30]:
#While there's nothing specifically wrong with the chunking method used above - it is a naive approach that is not sensitive to specific data formats.

In [31]:

# Embeddings and Dense Vector Search

# Now that we have our individual chunks, we need a system to correctly select the relevant pieces of information to answer our query.

# This sounds like a perfect job for embeddings!

# We'll be using OpenAI's text-embedding-3 model as our embedding model today!

# Let's load it up through LangChain.


In [32]:
# from langchain_openai.embeddings import OpenAIEmbeddings

# embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [33]:
#The embedding dimension for text-embedding-3-small is 1536.

In [34]:

# Finding the Embeddings for Our Chunks

# First, let's find all our embeddings for each chunk and store them in a convenient format for later.


In [35]:
# embeddings_dict = {}

# for chunk in chunks:
#   embeddings_dict[chunk] = embedding_model.embed_query(chunk)

In [36]:
# for k,v in embeddings_dict.items():
#   print(f"Chunk - {k}")
#   print("---")
#   print(f"Embedding - Vector of Size: {len(v)}")
#   print("\n\n")

In [37]:


# Chunk - LangChain Expression Language or LCEL is a declarative way to easily compose chains together. There are several benefits to writing chains in this manner (as opposed to writing normal code):

# Async, Batch, and Streaming Support Any chain constructed this way will automatically have full sync, async, batch, and streaming support. This makes it easy to prototype a chain in a Jupyter notebook using the sync interface, and then expose it as an async streaming interface.
# ---
# Embedding - Vector of Size: 1536



# Chunk - Fallbacks The non-determinism of LLMs makes it important to be able to handle errors gracefully. With LCEL you can easily attach fallbacks to any chain.

# Parallelism Since LLM applications involve (sometimes long) API calls, it often becomes important to run things in parallel. With LCEL syntax, any components that can be run in parallel automatically are.
# ---
# Embedding - Vector of Size: 1536



# Chunk - Seamless LangSmith Tracing Integration As your chains get more and more complex, it becomes increasingly important to understand what exactly is happening at every step. With LCEL, all steps are automatically logged to LangSmith for maximal observability and debuggability.
# ---
# Embedding - Vector of Size: 1536



# Okay, great. Let's create a query - and then embed it!


In [38]:
# query = "Can LCEL help take code from the notebook to production?"

# query_vector = embedding_model.embed_query(query)
# print(f"Vector of Size: {len(query_vector)}")

In [39]:


# Vector of Size: 1536

# Now, let's compare it against each existing chunk's embedding by using cosine similarity.


In [40]:
# import numpy as np
# from numpy.linalg import norm

# def cosine_similarity(vec_1, vec_2):
#   return np.dot(vec_1, vec_2) / (norm(vec_1) * norm(vec_2))

In [41]:
# max_similarity = -float('inf')
# closest_chunk = ""

# for chunk, chunk_vector in embeddings_dict.items():
#   cosine_similarity_score = cosine_similarity(chunk_vector, query_vector)

#   if cosine_similarity_score > max_similarity:
#     closest_chunk = chunk
#     max_similarity = cosine_similarity_score

# print(closest_chunk)
# print(max_similarity)

In [42]:


# LangChain Expression Language or LCEL is a declarative way to easily compose chains together. There are several benefits to writing chains in this manner (as opposed to writing normal code):

# Async, Batch, and Streaming Support Any chain constructed this way will automatically have full sync, async, batch, and streaming support. This makes it easy to prototype a chain in a Jupyter notebook using the sync interface, and then expose it as an async streaming interface.
# 0.537298487051912

# And we get the expected result, which is the passage that specifically mentions prototyping in a Jupyter Notebook!
# Creating a Retriever

# Now that we have an idea of how we're getting our most relevant information - let's see how we could create a pipeline that would automatically extract the closest chunk to our query and use it as context for our prompt!

# First, we'll wrap the above in a helper function!


In [43]:
# def retrieve_context(query, embeddings_dict, embedding_model):
#   query_vector = embedding_model.embed_query(query)
#   max_similarity = -float('inf')
#   closest_chunk = ""

#   for chunk, chunk_vector in embeddings_dict.items():
#     cosine_similarity_score = cosine_similarity(chunk_vector, query_vector)

#     if cosine_similarity_score > max_similarity:
#       closest_chunk = chunk
#       max_similarity = cosine_similarity_score

#   return closest_chunk

In [45]:
#Now, let's add it to our pipeline!


In [46]:
# def simple_rag(query, embeddings_dict, embedding_model, chat_chain):
#   context = retrieve_context(query, embeddings_dict, embedding_model)

#   response = chat_chain.invoke({"query" : query, "context" : context})

#   return_package = {
#       "query" : query,
#       "response" : response,
#       "retriever_context" : context
#   }

#   return return_package

In [47]:
#simple_rag("Can LCEL help take code from the notebook to production?", embeddings_dict, embedding_model, chat_chain)

In [48]:
# #o/p {'query': 'Can LCEL help take code from the notebook to production?',
#  'response': AIMessage(content='Yes, LCEL can help take code from the notebook to production. Since any chain constructed using LCEL will automatically have full sync, async, batch, and streaming support, this makes it easy to prototype a chain in a Jupyter notebook using the sync interface and then expose it as an async streaming interface for production.', response_metadata={'token_usage': {'completion_tokens': 64, 'prompt_tokens': 152, 'total_tokens': 216}, 'model_name': 'gpt-4o', 'system_fingerprint': 'fp_319be4768e', 'finish_reason': 'stop', 'logprobs': None}, id='run-feaeed29-4db3-4e7a-b9ef-2f512437537e-0', usage_metadata={'input_tokens': 152, 'output_tokens': 64, 'total_tokens': 216}),
#  'retriever_context': 'LangChain Expression Language or LCEL is a declarative way to easily compose chains together. There are several benefits to writing chains in this manner (as opposed to writing normal code):\n\nAsync, Batch, and Streaming Support Any chain constructed this way will automatically have full sync, async, batch, and streaming support. This makes it easy to prototype a chain in a Jupyter notebook using the sync interface, and then expose it as an async streaming interface.'}

In [49]:
# What does LCEL do that makes it more reliable at scale?

In [50]:
# ANSWER

#     Streaming so that users begin to see responses within a consistent timeframe.
#     Protection against LLM failures. So, using parallelism to make multiple calls in the event that one doesn't return a correctly formatted response.
#     Information architecture so that "bad docs" don't cause retrieval issues which will distract from answer quality.
