# Langchain practise


## SETUP


In [125]:
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv, find_dotenv
import os

In [126]:
# load the apikeys
_ = load_dotenv(find_dotenv())

# openai
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# langchain
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

## Part 1: Overview


In [200]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI

### Indexing the documents


In [128]:
# Load Documents
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

In [129]:
# split the document
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [130]:
# embed the splits of documents
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),  # openai embeddings
)

In [131]:
# define the document retriever
retriever = vectorstore.as_retriever()

In [132]:
retriever.get_relevant_documents

<bound method BaseRetriever.get_relevant_documents of VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x28d25e1d0>)>

### Retrieval and Generation


In [133]:
# get prompt
prompt = hub.pull("rlm/rag-prompt")
prompt

# question -> query by user
# context -> retreived documents

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [201]:
# define the llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0
)

In [202]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [203]:
# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
rag_chain.invoke("What is Task Decomposition?")

TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class 'langchain_core.runnables.base.RunnableSequence'>

## Part 2: Indexing


In [137]:
# Documents
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [139]:
# count number of tokens per model
import tiktoken


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


num_tokens_from_string(question, "cl100k_base")

8

In [143]:
# openai embedding model
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
query_result = embedding.embed_query(question)  # question from user
document_result = embedding.embed_query(document)  # ref document / split(s)

len(query_result)  # show the embedding size of the query per model

1536

In [145]:
# find how similar the query to a document

import numpy as np


def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    # returns the number between 0 to 1 to show how similar
    return dot_product / (norm_vec1 * norm_vec2)


similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)  # closer to 1 is very similar

Cosine Similarity: 0.8805900727152066


In [146]:
# document loaders
# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

In [148]:
# split document into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=50
)

# make splits
splits = text_splitter.split_documents(blog_docs)

In [149]:
# get chunk embeddings and store them into vector store
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(
    documents=splits, embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY)
)

retriever = vectorstore.as_retriever()
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x28d305b50>)

## Part 3: Retrieval


In [166]:
# index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(
    documents=splits, embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY)
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x28d9b97d0>, search_kwargs={'k': 2})

In [167]:
docs = retriever.get_relevant_documents("What is Task Decomposition?")
docs

[Document(page_content='Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}),
 Document(page_content='Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a

In [168]:
len(docs)

2

## Part 4:


In [196]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# define the prompt template
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'))])

In [197]:
# define LLM
llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0, model_name="gpt-3.5-turbo")
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x2970dde10>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x2970e9790>, temperature=0.0, openai_api_key='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', openai_proxy='')

In [198]:
# chain
chain = prompt | llm
chain

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'))])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x2970dde10>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x2970e9790>, temperature=0.0, openai_api_key='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', openai_proxy='')

In [199]:
# Run
chain.invoke({"context": docs, "question": "What is Task Decomposition?"})

AIMessage(content='Task Decomposition is a technique that involves breaking down complex tasks into smaller and simpler steps, allowing the agent to better understand and plan for each individual step. This technique is often used to enhance model performance on complex tasks by transforming big tasks into multiple manageable tasks.')

In [204]:
from langchain import hub

prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [205]:
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [206]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Task Decomposition?")

TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class 'langchain_core.vectorstores.VectorStoreRetriever'>