In [1]:
import os

import bs4
import dotenv
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load configuration through python-dotenv before the os.getenv() lookups
# further down in this cell (presumably from a local .env file holding the
# API keys — confirm where the keys are actually stored).
dotenv.load_dotenv()

# Turn on LangChain tracing (LANGCHAIN_TRACING_V2) for the rest of the notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Writing `os.environ[k] = os.getenv(k)` changes nothing when k is already set
# and raises TypeError (environ values must be str, not None) when it is not.
# Only check for presence and report what is missing, so the notebook still
# runs for the parts that do not need the credential.
for _key in ("LANGCHAIN_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if os.getenv(_key) is None:
        print(f"Warning: {_key} is not set; features that need it may fail.")

# RAG running on my local machine

In [69]:
# Path handed to the pipeline as model_id (presumably a locally exported
# OpenVINO model directory — confirm).
# Commented-out alternatives kept for reference:
#   - repo_id = "Rijgersberg/GEITje-7B" or "GroNLP/bert-base-dutch-cased"
#   - llm = HuggingFaceEndpoint(repo_id=repo_id)
#   - llm = HuggingFacePipeline.from_model_id(model_id=repo_id,
#           task="text-generation", pipeline_kwargs={"max_new_tokens": 30},
#           device_map="auto")
repo_id = "./ov_model_dir"

# OpenVINO options, passed through unchanged inside model_kwargs.
ov_config = dict(
    KV_CACHE_PRECISION="u8",
    DYNAMIC_QUANTIZATION_GROUP_SIZE="32",
    PERFORMANCE_HINT="LATENCY",
    NUM_STREAMS="1",
    CACHE_DIR="",
)

# Text-generation pipeline on the OpenVINO backend, targeting the CPU and
# capped at 100 newly generated tokens per call.
llm = HuggingFacePipeline.from_model_id(
    model_id=repo_id,
    task="text-generation",
    backend="openvino",
    model_kwargs=dict(device="CPU", ov_config=ov_config),
    pipeline_kwargs=dict(max_new_tokens=100),
)

Compiling the model to CPU ...


In [70]:
# Restrict HTML parsing to the post's title, header and content elements.
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content"),
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

# Preview: the first 500 characters of the first loaded document.
first_doc = docs[0]
print(first_doc.page_content[:500])



      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In


In [71]:
# Chunk the loaded documents (chunk_size=1000, chunk_overlap=200) and record
# each chunk's start offset in its metadata via add_start_index.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)

# Report the total, then show the chunk at index 4 as a sample.
print(f"{len(splits)} splits in total")
sample_split = splits[4]
print(f"Metadata: {sample_split.metadata}")
print(f"Contents:\n\n{sample_split.page_content}")

66 splits in total
Metadata: {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'start_index': 2837}
Contents:

Another quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.
Self-Reflection#
Self-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting p

In [72]:
# NOTE(review): HuggingFaceEmbeddings() is created without a model_name, so the
# library's default embedding model is used. The name
# "sentence-transformers/all-MiniLM-L6-v2" is only noted here and is not passed
# to the constructor — confirm which model is actually intended.
embeddings = HuggingFaceEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [73]:
# Similarity-search retriever over the vector store, asking for k=6 chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=6),
)

# Try the retriever on its own before it is used inside the RAG chain.
retrieved_docs = retriever.invoke("What are the approaches to Task Decomposition?")

print(f"{len(retrieved_docs)} retrieved docs")
top_hit = retrieved_docs[0]
print(f"Contents:\n\n{top_hit.page_content[:100]}")

6 retrieved docs
Contents:

Task decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\n1.", "What are


In [74]:
from langchain_core.prompts import ChatPromptTemplate

# Disabled alternative: a Dutch-language version of the RAG prompt, kept
# verbatim for reference.
# prompt = ChatPromptTemplate.from_messages(
    # ("human", "Je bent een assistent voor het beantwoorden van vragen. Gebruik de volgende stukjes opgehaalde context om de vraag te beantwoorden. Als je het antwoord niet weet, zeg dan gewoon dat je het niet weet. Gebruik maximaal drie zinnen en houd het antwoord beknopt.\nVraag: {question} \nContext: {context} \nAntwoord:"),)

# Pull the "rlm/rag-prompt" template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Render the template with placeholder values to inspect the resulting messages.
example_prompt_value = prompt.invoke(
    dict(context="filler context", question="filler question")
)
example_messages = example_prompt_value.to_messages()

# Bare last expression so the notebook displays the rendered messages.
example_messages

[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: filler question \nContext: filler context \nAnswer:")]

In [75]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in iteration order, separated by a
        blank line (two newline characters). An empty iterable gives "".
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [76]:
# Inputs for the prompt: "context" is the retriever output joined by
# format_docs, "question" is the chain input passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build prompt inputs -> fill prompt -> run LLM -> plain string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [77]:
# from langchain_core.prompts import PromptTemplate


# chain = prompt | llm
# question = "Waar gaat deze tekst over?"



# print(chain.invoke({"question": question, "context": retriever}))

In [79]:
# Stream the answer to a Dutch question, printing each piece as it arrives
# without adding newlines between pieces.
question = "Wat is taak decompositie?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: Wat is taak decompositie? 
Context: ANNOY (Approximate Nearest Neighbors Oh Yeah): The core data structure are random projection trees, a set of binary trees where each non-leaf node represents a hyperplane splitting the input space into half and each leaf stores one data point. Trees are built independently and at random, so to some

To avoid overfitting, CoH adds a regularization term to maximize the log-likelihood of the pre-training dataset. To avoid shortcutting and copying (because there are many common words in feedback sequences), they randomly mask 0% - 5% of past tokens during training.
The training dataset in their experiments is a combination of WebGPT comparisons, summarization from human feedback and human preference data