In [1]:
import os
import wandb
from pprint import pprint
from getpass import getpass
from rich.markdown import Markdown
import wandb

import torch
import transformers

# Set llama2 API key 

To get key, click on [link](https://huggingface.co/settings/tokens).

In [2]:
# Set LLAMA2 API key environment variable
if os.getenv("LLAMA2_API_KEY") is None:
  if any(['VSCODE' in x for x in os.environ.keys()]):
    print('Please enter password in the VS Code prompt at the top of your VS Code window!')
  os.environ["LLAMA2_API_KEY"] = getpass("Paste your LLAM2 key from your huggingface settings \n")

assert os.getenv("LLAMA2_API_KEY", "").startswith("hf_"), "This doesn't look like a valid HuggingFace llama2 key"
print("Llama2 API key configured")

# Get the HF auth token
hf_auth = os.getenv("LLAMA2_API_KEY", "")

Please enter password in the VS Code prompt at the top of your VS Code window!
Llama2 API key configured


# Tokenisation

The Llama 2 7B models were trained using the Llama 2 7B tokenizer - used to tokenise text and decode tokenised text

In [3]:
# Define llama2 model to load
model_id = 'meta-llama/Llama-2-7b-chat-hf'

In [4]:
# Example of tokanisation (encode and decode)
# - here we download the tokeniser for the llama2 model
# - different models may need a different tokeniser
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
    )
enc = tokenizer.encode("Weights & Biases is awesome!")
print(enc)
print(tokenizer.decode(enc))

[1, 1334, 5861, 669, 3457, 2129, 338, 29663, 29991]
<s> Weights & Biases is awesome!




In [5]:
# Decode tokens one by one
# - most natural way of splitting words into tokens would be to use the spaces
# - however, to contain the size of the vocabulary, some words split into sub-words (units)
for token_id in enc:
    print(f"{token_id}\t{tokenizer.decode([token_id])}")

1	<s>
1334	We
5861	ights
669	&
3457	Bi
2129	ases
338	is
29663	awesome
29991	!


# Load LLM model

Read more about 
* [transformers.pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines)
* [config inputs](https://huggingface.co/docs/transformers/main_classes/text_generation)
* [llama2 usage - ref1](https://huggingface.co/blog/llama2)
* [llama2 usage - ref2](https://www.pinecone.io/learn/llama-2/#Building-a-Llama-2-Conversational-Agent)

In [6]:
# Set quantization configuration to load large model with less GPU memory
# - this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# This configuration object uses the model configuration from Hugging Face 
# to set different model parameters
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# Download and initialize the model 
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


# Sampling

### Sampling with Temperature

Let's sample some text from the model by passing the **temperature** parameter. 

***Option 1 - a) instanciate transformers.pipeline and b) run inference by passing config inputs and prompt to object***

In [7]:
pipeline_generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
)

In [8]:
# Create a function that passes temperature parameter to the llama2 api
def generate_with_temperature(pipeline_generate_text, temp):
    "Generate text with a given temperature -> higher temperature means more randomness (0 - 1)"
    response = pipeline_generate_text(
        'Describe the company Weights & Biases.\n',
        do_sample=True,           # Whether or not to use sampling 
        top_k=50,
        num_return_sequences=1,   # number of independently computed returned sequences for each element in the batch.
        temperature=temp,         # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        max_new_tokens=50,
        # repetition_penalty=1.1, # without this output begins repeating
        )
    return response[0]["generated_text"]

In [39]:
# Generate text with different temperature values
# - as temperature increased > 1, the generated text becomes too gibberish (cause more low probability tokens get sampled)
for temp in [0.01, 0.5, 1.0, 1.5, 2.0]:
    pprint(f'TEMP: {temp}, GENERATION: {generate_with_temperature(pipeline_generate_text, temp)}')
    print('------')



('TEMP: 0.01, GENERATION: Describe the company Weights & Biases.\n'
 '\n'
 'Weights & Biases is a startup that aims to help developers and data '
 'scientists make better decisions by providing them with a platform to '
 'collect, organize, and share their data and models. The company was founded '
 'in 20')
------
('TEMP: 0.5, GENERATION: Describe the company Weights & Biases.\n'
 '\n'
 'Weights & Biases is a software company that develops tools for machine '
 'learning engineers to measure and manage the risks associated with their '
 "models. The company's platform provides a suite of tools for model risk "
 'management, including model performance metrics')
------
('TEMP: 1.0, GENERATION: Describe the company Weights & Biases.\n'
 '\n'
 'Weighted means Biases, formerly known as Weights & Biases, is a software '
 'organization that creates equipment and administrations to help engineers '
 'and researcher tune and test AI models. The organization was established in '
 '201')
------


***Option 2 - a) instanciate transformers.pipeline and pass config inputs and b) run inference by passing prompt to object***

In [24]:
"""
# Create a function that passes temperature parameter to the llama2 api
def generate_with_temperature(model, tokenizer, temp):
    "Generate text with a given temperature -> higher temperature means more randomness (0 - 1)"
    # print(temp)
    generate_text = transformers.pipeline(
        task='text-generation',
        model=model,
        tokenizer=tokenizer,
        do_sample=True,           # Whether or not to use sampling 
        repetition_penalty=1.1, # without this output begins repeating
        # return_full_text=True,  # langchain expects the full text
        # we pass model parameters here too
        temperature=temp,         # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        max_new_tokens=50,        # max number of tokens to generate in the output (eg. 512)
    )
    response = generate_text("Describe the company Weights & Biases.")
    # response = generate_text("Explain to me the difference between nuclear fission and fusion.")
    return response[0]["generated_text"]
"""

***Option 3 - a) tokenise prompt, b) pass that along with config inputs to model.generate, c) decode answer***

In [35]:
"""# Create a function that passes temperature parameter to the llama2 api
def generate_with_temperature(model, tokenizer, temp):
    "Generate text with a given temperature -> higher temperature means more randomness (0 - 1)"

    prompt = "Describe the company Weights & Biases.\n"
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    # Generate
    print('------')
    generate_ids = model.generate(
        inputs.input_ids, 
        do_sample=True,           # Whether or not to use sampling 
        repetition_penalty=1.1,
        temperature=temp,
        max_new_tokens=50,
        )
    return tokenizer.decode(generate_ids[0], skip_special_tokens=True)
    # return tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"""

In [37]:
# # Generate text with different temperature values
# # - as temperature increased > 1, the generated text becomes too gibberish (cause more low probability tokens get sampled)
# for temp in [0.01, 0.5, 1.0, 1.5, 2.0]:
#   pprint(f'TEMP: {temp}, GENERATION: {generate_with_temperature(model, tokenizer, temp)}')

### Top p sampling

Can also use the [`top_p` parameter](https://platform.openai.com/docs/api-reference/completions/create#completions/create-top_p) to control the diversity of the generated text. 
- This parameter controls the cumulative probability of the next token. 
- For example, if `top_p=0.9`, the model will pick the next token from the top 90% most likely tokens. 
- The higher the `top_p` the more likely the model will pick a token that it hasn't seen before. 
- Decreasing `top_p` results in higher probability text being generated.
- You should only use one of `temperature` or `top_p` at a given time.

In [9]:
# Create a function that passes top_p parameter to the llama2 api
def generate_with_topp(pipeline_generate_text, topp):
    "Generate text with a given temperature -> higher temperature means more randomness (0 - 1)"
    response = pipeline_generate_text(
        'Describe the company Weights & Biases.\n',
        do_sample=True,           # Whether or not to use sampling 
        top_k=50,
        num_return_sequences=1,   # number of independently computed returned sequences for each element in the batch.
        top_p=topp,         # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        max_new_tokens=50,
        # repetition_penalty=1.1, # without this output begins repeating
        )
    return response[0]["generated_text"]

In [42]:
for topp in [0.01, 0.1, 0.5, 1.0]:
  pprint(f'TOP_P: {topp}, GENERATION: {generate_with_topp(pipeline_generate_text, topp)}')
  print('------')

('TOP_P: 0.01, GENERATION: Describe the company Weights & Biases.\n'
 '\n'
 'Weights & Biases is a startup that aims to help developers and data '
 'scientists make better decisions by providing them with a platform to '
 'collect, organize, and share their data and models. The company was founded '
 'in 20')
------
('TOP_P: 0.1, GENERATION: Describe the company Weights & Biases.\n'
 '\n'
 'Weights & Biases is a startup that aims to help developers and data '
 'scientists make better decisions by providing them with a platform to '
 'collect, organize, and share their data and models. The company was founded '
 'in 20')
------
('TOP_P: 0.5, GENERATION: Describe the company Weights & Biases.\n'
 '\n'
 'Weights & Biases is a software company that specializes in developing '
 'machine learning tools and platforms. The company was founded in 2017 by a '
 'team of experienced machine learning practitioners and researchers who '
 'recognized the need for better')
------
('TOP_P: 1.0, GENERAT

# How to prompt Llama2

For llama2, the prompt template for the first turn looks like the example below.
- Messages come with different roles - system prompt and user message
- We can use any `system_prompt` we want,
  - But it's crucial that the format matches the one used during training (see below). 
  - The instructions between the special <<SYS>> tokens provide context for the model so it knows how we expect it to respond (adhere to a certain behaviour).
  - The `user_message` is actually sent to the language model when the user enters some text

------------------------------
```
<s>[INST] <<SYS>>
{{ system_prompt }}
<</SYS>>

{{ user_message }} [/INST]
```

------------------------------

See [Lama2 ref](https://huggingface.co/blog/llama2)

In [10]:
# system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

# If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
# """
# user_message = """There's a llama in my garden. What should I do?"""

system_prompt = """You are a helpful assistant."""
user_message = """Describe the company Weights & Biases."""

In [11]:
# special tokens used by llama 2 chat
B_INST = "[INST] "
E_INST = "[/INST]"
B_SYS  = "<<SYS>>\n"
E_SYS  = "\n<</SYS>>\n\n"

# create the system message
sys_msg = f"<s>{B_INST}{B_SYS}{system_prompt}{E_SYS}"
# Create user message
user_msg = f"{user_message}{E_INST}"

prompt = sys_msg + user_msg
pprint(prompt)

('<s>[INST] <<SYS>>\n'
 'You are a helpful assistant.\n'
 '<</SYS>>\n'
 '\n'
 'Describe the company Weights & Biases.[/INST]')


In [12]:
# Create a function that passes top_p parameter to the llama2 api
def generate_with_prompt(pipeline_generate_text, prompt):
    "Generate text with a given temperature -> higher temperature means more randomness (0 - 1)"
    response = pipeline_generate_text(
        prompt,
        do_sample=True,           # Whether or not to use sampling 
        top_k=50,
        num_return_sequences=1,   # number of independently computed returned sequences for each element in the batch.
        temperature=0.01,         # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        max_new_tokens=200,
        repetition_penalty=1.1, # without this output begins repeating
        )
    return response[0]["generated_text"]

In [12]:
answer = generate_with_prompt(pipeline_generate_text, prompt)
pprint(answer)

('<s>[INST] <<SYS>>\n'
 'You are a helpful assistant.\n'
 '<</SYS>>\n'
 '\n'
 'Describe the company Weights & Biases.[/INST]  Of course! Weights & Biases '
 'is a cutting-edge technology company that specializes in developing '
 'innovative machine learning tools and platforms. The company was founded in '
 '2017 by a team of experienced machine learning researchers and engineers who '
 'were passionate about creating practical solutions for the industry.\n'
 "Weights & Biases' mission is to empower data scientists and machine learning "
 'practitioners with the most advanced tools and techniques to build better '
 'models, faster. They believe that by providing a platform that simplifies '
 'the development, deployment, and management of machine learning models, they '
 'can help organizations of all sizes unlock their full potential.\n'
 'The company\'s flagship product is an AI-powered platform called "W&B," '
 'which provides a wide range of features such as:\n'
 '* Automated model

Finish wandb run (see [Promps docs](https://docs.wandb.ai/guides/prompts))

# W&B Trace with LangChain Agent

Check [W&B Prompt tracing](https://docs.wandb.ai/guides/prompts/quickstart)

In [13]:
# Option 1 -----------------------------------------------------------------------------------------
# Turn on automated W&B logging with LangChain
# - Now any call to a LangChain LLM, Chain, Tool or Agent will be logged to Weights & Biases.
"""
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"
"""

# Option 2 -----------------------------------------------------------------------------------------
# Alternatively, you might prefer to use a context manager to decide which LLM runs to log to W&B
# - in which case, run the following
from langchain.callbacks import wandb_tracing_enabled

if "LANGCHAIN_WANDB_TRACING" in os.environ:
    del os.environ["LANGCHAIN_WANDB_TRACING"]
# Then, to log an LLM run, we should write the following code (math_agent is in the context of the code below)
"""
with wandb_tracing_enabled():
    math_agent.run("What is 5 raised to .123243 power?")
"""

# Set parameters that are typically passed to wandb.init()
run = wandb.init(project="llmapps", job_type="generation")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33md-oliver-cort[0m ([33mdoc93[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:

generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,    # langchain expects the full text
    # we pass model parameters here too
    do_sample=True,           # Whether or not to use sampling 
    temperature=0.5,          # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=50,        # max number of tokens to generate in the output (eg. 512)
    repetition_penalty=1.1,   # without this output begins repeating
)
# response = generate_text("Describe the company Weights & Biases.")

We can load the initialised Hugging Face pipeline into LangChain

In [19]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

Using `llm` we can now use Llama 2 with `LangChain`'s advanced tooling such as `agents`, `chains`, and `callbacks`. 

## Create a standard math Agent using LangChain:

In [20]:
from langchain.agents import load_tools, initialize_agent, AgentType

tools = load_tools(["llm-math"], llm=llm)
math_agent = initialize_agent(tools, 
                              llm, 
                              agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)

In [21]:
# some sample maths questions
questions = [
  "Find the square root of 5.4.",
  "What is 3 divided by 7.34 raised to the power of pi?",
  "What is the sin of 0.47 radians, divided by the cube root of 27?"
]

# Log LLM results into W&B
with wandb_tracing_enabled():
  
  for question in questions:
    try:
      # call your Agent as normal
      answer = math_agent.run(question)
      print(answer)
      print('----')
    except Exception as e:
      # any errors will be also logged to Weights & Biases
      print(e)
      print('eeeee')
      pass

The square root of 5.4 is 2.169.
----
Could not parse LLM output: ` Hmm, looks like we can't use the calculator for this problem. Let's try using the formula instead.
Action: Write down the formula for raising a number to the power of pi, which is "a^π = a`
eeeee
-0.695

Please provide more details or clarify your question.
----


In [28]:
wandb.finish()



VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Unfinished code

## Create a conversational agent using LangChain

We need a few things, like conversational `memory`, access to `tools`, and an `llm` (already initialized above).

In [9]:
from langchain.memory import ConversationBufferWindowMemory
from langchain.agents import load_tools

memory = ConversationBufferWindowMemory(
    memory_key="chat_history", k=5, return_messages=True, output_key="output"
)
tools = load_tools(["llm-math"], llm=llm)

We initialize our conversational agent

In [11]:
from langchain.agents import initialize_agent

# initialize agent
agent = initialize_agent(
    agent="chat-conversational-react-description",
    tools=tools,
    llm=llm,
    verbose=True,
    early_stopping_method="generate",
    memory=memory
)

Create the `system message`

In [26]:
# special tokens used by llama 2 chat
B_INST = "[INST] "
E_INST = "[/INST]"
B_SYS  = "<<SYS>>\n"
E_SYS  = "\n<</SYS>>\n\n"

# create the system message
sys_msg = "<s>" + B_SYS + """You are a helpful assistant.""" + E_SYS

After setting our new system message we can move on to the prompt template for `user messages`.

In [27]:
user_msg = B_INST + " Describe the company Weights & Biases. " + E_INST

In [29]:
pprint(sys_msg)
pprint(user_msg)

'<s><<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\n'
'[INST]  Describe the company Weights & Biases. [/INST]'


In [30]:
# Load system message to LangChain
new_prompt = agent.agent.create_prompt(
    system_message=sys_msg,
    tools=tools
    )
agent.agent.llm_chain.prompt = new_prompt

# Load yser message to LangChain
agent.agent.llm_chain.prompt.messages[2].prompt.template = user_msg

In [32]:
agent.run()

ValueError: `run` supported with either positional arguments or keyword arguments, but none were provided.

In [35]:
agent.agent.llm_chain.prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], output_parser=None, partial_variables={}, template='<s><<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\n', template_format='f-string', validate_template=True), additional_kwargs={}),
 MessagesPlaceholder(variable_name='chat_history'),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], output_parser=None, partial_variables={}, template='[INST]  Describe the company Weights & Biases. [/INST]', template_format='f-string', validate_template=True), additional_kwargs={}),
 MessagesPlaceholder(variable_name='agent_scratchpad')]