In [1]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())

100%|██████████| 45.9M/45.9M [00:02<00:00, 19.1MiB/s]


bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [4]:
question = "What are the approaches to Task Decomposition?"
docs = vectorstore.similarity_search(question)
len(docs)

4

In [6]:
docs[0]

Document(page_content='Task decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.', metadata={'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en', 'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log"})

In [7]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [13]:
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# model_path = "./models/thespis-13b-v0.3.Q5_K_M.gguf"
model_path = "./models/llama-2-13b-chat.Q5_K_M.gguf"

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ./models/llama-2-13b-chat.Q5_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q5_K     [  5120,  5120,     

In [14]:
llm("Simulate a rap battle between Stephen Colbert and John Oliver")



Stephen Colbert: Yo, what's up John? Long time no see.
John Oliver: Yeah man, it's been too long. But I heard you've been talkin smack about me on your show.
Stephen Colbert: Oh, you heard that, huh? Well, let me tell you somethin, my man. I may not be the king of late night, but I'm definitely the prince of parody.
John Oliver: Ha! You think a few impressions and some satire make you the prince of anything? Please, I've been doin this for years, and I've got the Emmys to prove it.
Stephen Colbert: Oh snap, you really gonna bring up that old stuff? I've won more awards than you can shake a stick at, my friend. And don't even get me started on my ratings.
John Oliver: Okay, okay, I see how it is. You're gonna rely on your fancy writing staff and your network's reputation to do all the heavy lifting. But let me tell you somethin, my man - I may not have as many writers or


llama_print_timings:        load time =  3500.44 ms
llama_print_timings:      sample time =   124.97 ms /   256 runs   (    0.49 ms per token,  2048.51 tokens per second)
llama_print_timings: prompt eval time =  3500.12 ms /    13 tokens (  269.24 ms per token,     3.71 tokens per second)
llama_print_timings:        eval time = 95833.37 ms /   255 runs   (  375.82 ms per token,     2.66 tokens per second)
llama_print_timings:       total time = 100185.97 ms


"\n\nStephen Colbert: Yo, what's up John? Long time no see.\nJohn Oliver: Yeah man, it's been too long. But I heard you've been talkin smack about me on your show.\nStephen Colbert: Oh, you heard that, huh? Well, let me tell you somethin, my man. I may not be the king of late night, but I'm definitely the prince of parody.\nJohn Oliver: Ha! You think a few impressions and some satire make you the prince of anything? Please, I've been doin this for years, and I've got the Emmys to prove it.\nStephen Colbert: Oh snap, you really gonna bring up that old stuff? I've won more awards than you can shake a stick at, my friend. And don't even get me started on my ratings.\nJohn Oliver: Okay, okay, I see how it is. You're gonna rely on your fancy writing staff and your network's reputation to do all the heavy lifting. But let me tell you somethin, my man - I may not have as many writers or"

In [15]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt
prompt = PromptTemplate.from_template(
    "Summarize the main themes in these retrieved docs: {docs}"
)

# Chain
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Run
question = "What are the approaches to Task Decomposition?"
docs = vectorstore.similarity_search(question)
result = llm_chain(docs)

# Output
result["text"]

Llama.generate: prefix-match hit




The main themes in the retrieved documents are:

1. Task decomposition: The documents discuss various approaches to decomposing tasks, including using LLM with simple prompting, task-specific instructions, and human inputs.
2. Challenges in long-term planning and task decomposition: The documents highlight the challenges in planning over a lengthy history and effectively exploring the solution space, which can make LLMs less robust compared to humans who learn from trial and error.
3. Expert models executing specific tasks: The documents mention the importance of expert models executing on specific tasks and logging results.
4. Instruction and task execution: The documents discuss the need for clear instructions and the importance of task execution in achieving specific goals.
5. Judging the correctness of task results: The documents highlight the importance of judging the correctness of task results to ensure the effectiveness of the task decomposition approach.

Overall, the retrie


llama_print_timings:        load time =  3500.44 ms
llama_print_timings:      sample time =   208.05 ms /   233 runs   (    0.89 ms per token,  1119.91 tokens per second)
llama_print_timings: prompt eval time =  8631.59 ms /   942 tokens (    9.16 ms per token,   109.13 tokens per second)
llama_print_timings:        eval time = 83623.34 ms /   232 runs   (  360.45 ms per token,     2.77 tokens per second)
llama_print_timings:       total time = 93430.48 ms


'\n\nThe main themes in the retrieved documents are:\n\n1. Task decomposition: The documents discuss various approaches to decomposing tasks, including using LLM with simple prompting, task-specific instructions, and human inputs.\n2. Challenges in long-term planning and task decomposition: The documents highlight the challenges in planning over a lengthy history and effectively exploring the solution space, which can make LLMs less robust compared to humans who learn from trial and error.\n3. Expert models executing specific tasks: The documents mention the importance of expert models executing on specific tasks and logging results.\n4. Instruction and task execution: The documents discuss the need for clear instructions and the importance of task execution in achieving specific goals.\n5. Judging the correctness of task results: The documents highlight the importance of judging the correctness of task results to ensure the effectiveness of the task decomposition approach.\n\nOverall,

In [22]:
from langchain import hub
rag_prompt = hub.pull("rlm/rag-prompt-llama")

from langchain.chains.question_answering import load_qa_chain

qa_chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=rag_prompt)

qa_chain({"input_documents": docs, "question": question}, return_only_outputs=True)

Llama.generate: prefix-match hit


  Sure, I'd be happy to help! Here are three approaches to task decomposition:

1. LLM with simple prompting: This approach involves using a large language model (LLM) with basic prompts like "Steps for XYZ" or "What are the subgoals for achieving XYZ?" to decompose a task into smaller subtasks.
2. Task-specific instructions: This approach involves providing specific instructions for each task, such as "Write a story outline" for writing a novel, to guide the decomposition of the task.
3. Human inputs: This approach involves using human input to guide the decomposition of a task, either by providing explicit guidance or by allowing the human to correct the LLM's output.

I hope that helps! Do you have any further questions?


llama_print_timings:        load time =  3500.44 ms


{'output_text': '  Sure, I\'d be happy to help! Here are three approaches to task decomposition:\n\n1. LLM with simple prompting: This approach involves using a large language model (LLM) with basic prompts like "Steps for XYZ" or "What are the subgoals for achieving XYZ?" to decompose a task into smaller subtasks.\n2. Task-specific instructions: This approach involves providing specific instructions for each task, such as "Write a story outline" for writing a novel, to guide the decomposition of the task.\n3. Human inputs: This approach involves using human input to guide the decomposition of a task, either by providing explicit guidance or by allowing the human to correct the LLM\'s output.\n\nI hope that helps! Do you have any further questions?'}

llama_print_timings:      sample time =   143.70 ms /   173 runs   (    0.83 ms per token,  1203.93 tokens per second)
llama_print_timings: prompt eval time =  2641.18 ms /   258 tokens (   10.24 ms per token,    97.68 tokens per second)
llama_print_timings:        eval time = 57734.43 ms /   172 runs   (  335.67 ms per token,     2.98 tokens per second)
llama_print_timings:       total time = 61183.35 ms
