In [10]:
# Install core libraries
!pip install langchain langchain_community langchain_core langchain-huggingface
!pip install transformers torch accelerate bitsandbytes
!pip install sentence-transformers faiss-cpu pypdf

# Install advanced components
!pip install rank_bm25
!pip install langchain-cohere  # For reranker, requires a free trial API key
!pip install langgraph

# Install evaluation libraries
!pip install evaluate rouge_score nltk
!pip install chromadb

Successfully installed backoff-2.2.1 bcrypt-4.3.0 chromadb-1.0.20 coloredlogs-15.0.1 durationpy-0.10 httptools-0.6.4 humanfriendly-10.0 kubernetes-33.1.0 mmh3-5.2.0 onnxruntime-1.22.1 opentelemetry-exporter-otlp-proto-common-1.36.0 opentelemetry-exporter-otlp-proto-grpc-1.36.0 opentelemetry-proto-1.36.0 overrides-7.7.0 posthog-5.4.0 pybase64-1.4.2 pypika-0.48.9 uvloop-0.21.0 watchfiles-1.1.0


In [21]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough
from langchain.schema.document import Document
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere.rerank import CohereRerank
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from sklearn.metrics import dcg_score, ndcg_score
import os
import nltk
import warnings
warnings.filterwarnings("ignore")
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
from transformers import pipeline

In [4]:
# TEMPLATE Configuration
CONFIG = {
    "llm": {
        "model_id": "HuggingFaceH4/zephyr-7b-beta", # A great, lightweight model for Colab
        # "model_id": "mistralai/Mistral-7B-Instruct-v0.2", # Another excellent option
    },
    "embedding": {
        "model_id": "sentence-transformers/all-MiniLM-L6-v2",
    },
    "retriever": {
        "vector_store": "faiss",
        "k": 5, # Number of documents to retrieve
    },
    "reranker": {
        # Using Cohere's free-for-dev reranker for high quality.
        # Alternatively, a local cross-encoder could be used.
        "model_id": "cohere-rerank-english-v3.0",
        "top_n": 2, # Number of documents to keep after reranking
    },
    "text_splitter": {
        "chunk_size": 1000,
        "chunk_overlap": 100,
    },
    "data": {
        "source": "web", # Can be 'pdf', 'web', 'txt'
        "path": "https://lilianweng.github.io/posts/2023-06-23-agent/", # Example blog post
    },
}

In [7]:
def get_llm(config):
    """Initializes and returns the LLM for inference."""
    # Configuration for 4-bit quantization for memory efficiency
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(config["llm"]["model_id"])
    model = AutoModelForCausalLM.from_pretrained(
        config["llm"]["model_id"],
        quantization_config=quantization_config,
        device_map="auto", # Automatically map to GPU if available
        torch_dtype=torch.float16,
    )

    # Create a Hugging Face pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    return llm

# Initialize the LLM
llm = get_llm(CONFIG)
print("LLM Initialized Successfully.")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Device set to use cuda:0


LLM Initialized Successfully.


In [8]:
def load_and_split_documents(config):
    """Loads documents from the specified source and splits them into chunks."""
    path = config["data"]["path"]
    if config["data"]["source"] == "web":
        loader = WebBaseLoader(path)
    elif config["data"]["source"] == "pdf":
        loader = PyPDFLoader(path)
    else:
        raise ValueError("Unsupported data source")

    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config["text_splitter"]["chunk_size"],
        chunk_overlap=config["text_splitter"]["chunk_overlap"]
    )
    chunks = text_splitter.split_documents(documents)
    return chunks

# Load and split documents
documents_chunks = load_and_split_documents(CONFIG)
print(f"Loaded and split {len(documents_chunks)} chunks.")

Loaded and split 64 chunks.


In [11]:
def create_vector_store(chunks, config):
    """Creates a Chroma vector store from document chunks."""
    embeddings = HuggingFaceEmbeddings(model_name=config["embedding"]["model_id"])
    # Updated to use Chroma
    vector_store = Chroma.from_documents(chunks, embeddings)
    return vector_store

# Create the vector store
vector_store = create_vector_store(documents_chunks, CONFIG)
print("Vector Store created successfully with Chroma DB.")

Vector Store created successfully with Chroma DB.


In [15]:
# You need a Cohere API key for this. Get a free trial key from dashboard.cohere.com
os.environ["COHERE_API_KEY"] = "LLPo5KmMNg5BwzC7Xpouw3NekKUW9kFl5uULqxjn" # Replace with your key

def create_retriever(vector_store, config):
    """Creates a standard retriever and optionally a reranking retriever."""
    # 1. Base Retriever
    base_retriever = vector_store.as_retriever(search_kwargs={"k": config["retriever"]["k"]})

    # 2. Reranker (Optional - uncomment if you have access to a working reranker model)
    # try:
    #     reranker = CohereRerank(model=config["reranker"]["model_id"], top_n=config["reranker"]["top_n"])
    #     # 3. Contextual Compression Retriever
    #     compression_retriever = ContextualCompressionRetriever(
    #         base_compressor=reranker, base_retriever=base_retriever
    #     )
    #     print("Retriever with Reranker created.")
    #     return compression_retriever
    # except Exception as e:
    #     print(f"Could not create reranker: {e}")
    #     print("Using base retriever without reranking.")
    #     return base_retriever

    # Using base retriever without reranking due to reranker error
    print("Using base retriever without reranking.")
    return base_retriever


# Create the advanced retriever
retriever = create_retriever(vector_store, CONFIG)

Using base retriever without reranking.


In [13]:
# Define the desired structured output
class Answer(BaseModel):
    answer: str = Field(description="The final, detailed answer to the user's question.")
    sources: List[str] = Field(description="List of source document snippets used to generate the answer.")
    confidence_score: float = Field(description="A score from 0.0 to 1.0 indicating the model's confidence in its answer.")

# Create the JSON output parser
output_parser = JsonOutputParser(pydantic_object=Answer)

# Define the RAG prompt template
RAG_PROMPT_TEMPLATE = """
<|system|>
You are an expert Question-Answering assistant. Your goal is to provide accurate, fact-grounded answers based exclusively on the provided context.
Do not add any information that is not present in the context.
If the context does not contain the answer, state that you cannot answer the question with the given information.
Format your response as a JSON object with the following keys: "answer", "sources", "confidence_score".
"sources" should be a list of the exact text snippets from the context that justify your answer.
</|system|>
<|user|>
CONTEXT:
{context}

QUESTION:
{question}

Return a JSON object with the answer.
</|user|>
<|assistant|>
"""

rag_prompt = PromptTemplate(
    template=RAG_PROMPT_TEMPLATE,
    input_variables=["context", "question"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)

In [23]:
def format_docs(docs: List[Document]) -> str:
    """Formats the retrieved documents into a single string."""
    return "\n\n".join(doc.page_content for doc in docs)

# Create the RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | output_parser # Re-enable output_parser
)

# Test the chain
question = "What are the core components of an LLM-powered autonomous agent system?"
response = rag_chain.invoke(question)

print("--- RAG Response ---")
print(f"Answer: {response['answer']}")
print(f"Confidence: {response['confidence_score']}")
print("Sources:")
for source in response['sources']:
    print(f"- {source[:100]}...")

--- RAG Response (Raw LLM Output) ---

<|system|>
You are an expert Question-Answering assistant. Your goal is to provide accurate, fact-grounded answers based exclusively on the provided context.
Do not add any information that is not present in the context.
If the context does not contain the answer, state that you cannot answer the question with the given information.
Format your response as a JSON object with the following keys: "answer", "sources", "confidence_score".
"sources" should be a list of the exact text snippets from the context that justify your answer.
</|system|>
<|user|>
CONTEXT:
LLM Powered Autonomous Agents | Lil'Log







































Lil'Log

















|






Posts




Archive




Search




Tags




FAQ









      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


 


Table of Contents



Agent System Overview

Component One: Planning

Task Decomposition

Self-Reflecti

In [24]:
# Define the state for our graph
class GraphState(TypedDict):
    question: str
    generation: str
    documents: List[Document]

def retrieve(state):
    """Retrieve documents."""
    print("---NODE: RETRIEVE---")
    question = state["question"]
    documents = retriever.invoke(question)
    return {"documents": documents, "question": question}

def generate(state):
    """Generate answer."""
    print("---NODE: GENERATE---")
    question = state["question"]
    documents = state["documents"]
    generation = rag_chain.invoke(question) # Using our previously defined chain
    return {"documents": documents, "question": question, "generation": generation}

def grade_documents(state):
    """Grade documents for relevance."""
    print("---NODE: GRADE DOCUMENTS---")
    # This would typically involve an LLM call to check if documents are relevant.
    # For simplicity, we'll assume they are relevant for now.
    return "generate" # Decision to proceed to generation

# Define the workflow
workflow = StateGraph(GraphState)
workflow.add_node("retrieve", retrieve)
workflow.add_node("generate", generate)

# Define the edges
workflow.set_entry_point("retrieve")
workflow.add_conditional_edges(
    "retrieve",
    grade_documents,
    {"generate": "generate", "end": END} # Simplified: always go to generate
)
workflow.add_edge("generate", END)

# Compile and run the graph
app = workflow.compile()
inputs = {"question": "What is the 'memory' component in an agent?"}
for output in app.stream(inputs):
    for key, value in output.items():
        print(f"Output from node '{key}':")
        print("---")
        print(value)
    print("\n---\n")

---NODE: RETRIEVE---
---NODE: GRADE DOCUMENTS---
Output from node 'retrieve':
---
{'documents': [Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps,

In [19]:
# Manually crafted evaluation dataset
# Ground truth document content must match text in the original source
EVAL_DATASET = [
    {
        "question": "What is the 'reflection' component in an LLM-powered agent?",
        "ground_truth_answer": "The 'reflection' component in an LLM-powered agent involves self-criticism and self-correction. It allows the agent to internally reflect on its past actions and outcomes, learn from its mistakes, and refine its plan for future steps. This mechanism is crucial for long-term reasoning and continuous improvement.",
        "ground_truth_docs_text": [
            "The LLM acts as the agent’s reasoning engine, which can be further augmented by: a) a planning module for task decomposition and planning, b) a memory stream for storing and retrieving past experiences, and c) a reflection mechanism for self-criticism and self-improvement.",
            "Reflection is a meta-level process that allows an agent to learn from its past experience and refine its plan for future actions. Reflection is a form of self-criticism and self-correction. It helps agents to not only avoid repeating past mistakes but also to improve their reasoning and planning skills over time."
        ]
    },
    {
        "question": "Explain the key components of a generic agent system.",
        "ground_truth_answer": "A generic agent system consists of three main components: Planning (for task decomposition), Memory (for storing past experiences), and Tool Use (for interacting with the external world to perform tasks).",
        "ground_truth_docs_text": [
            "The agent uses the LLM as its core reasoning engine. The three key components of the generic agent system are: Planning (for task decomposition and strategic thinking), Memory (for storing and retrieving information), and Tool Use (for interacting with external environments)."
        ]
    }
]

In [25]:
def calculate_retrieval_metrics(retriever, documents_chunks, eval_dataset):
    print("--- Calculating Retrieval Metrics ---")
    retrieval_results = []
    k = CONFIG["retriever"]["k"]

    # Map ground truth text to document indices for NDCG calculation
    text_to_doc_idx = {doc.page_content: i for i, doc in enumerate(documents_chunks)}

    for item in eval_dataset:
        question = item["question"]
        ground_truth_docs_text = item["ground_truth_docs_text"]

        # 1. Get retrieved documents
        retrieved_docs = retriever.invoke(question)
        retrieved_texts = [doc.page_content for doc in retrieved_docs]

        # 2. Determine relevance
        # A retrieved doc is "relevant" if its text is in our ground truth list
        relevance_list = [1 if text in ground_truth_docs_text else 0 for text in retrieved_texts]

        # 3. Calculate Precision@k
        precision_at_k = sum(relevance_list) / k if k > 0 else 0

        # 4. Calculate Recall@k
        num_relevant_docs = len(ground_truth_docs_text)
        recall_at_k = sum(relevance_list) / num_relevant_docs if num_relevant_docs > 0 else 0

        # 5. Calculate NDCG
        # DCG and NDCG are designed for graded relevance, but we can use binary relevance (1 or 0)
        # Reshape for sklearn
        true_relevance = np.asarray([relevance_list])
        ndcg_at_k = ndcg_score(true_relevance, true_relevance)

        retrieval_results.append({
            "question": question,
            f"Precision@{k}": precision_at_k,
            f"Recall@{k}": recall_at_k,
            "NDCG": ndcg_at_k
        })

    avg_precision = np.mean([res[f"Precision@{k}"] for res in retrieval_results])
    avg_recall = np.mean([res[f"Recall@{k}"] for res in retrieval_results])
    avg_ndcg = np.mean([res["NDCG"] for res in retrieval_results])

    print(f"Average Precision@{k}: {avg_precision:.4f}")
    print(f"Average Recall@{k}: {avg_recall:.4f}")
    print(f"Average NDCG: {avg_ndcg:.4f}")
    return retrieval_results