In [1]:
!pip install transformers accelerate bitsandbytes --upgrade

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux20

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

# Checkpoint identifier passed to every from_pretrained() call below.
model_name = "HuggingFaceH4/zephyr-7b-beta"

# Load the causal LM with 8-bit weight loading enabled, float16 requested as the
# torch dtype, and device placement left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Start from the generation settings published with the checkpoint, then override:
# sampling stays enabled, but the near-zero temperature concentrates almost all
# probability on the most likely token, so outputs are close to greedy decoding.
generation_config = GenerationConfig.from_pretrained(model_name)
generation_config.do_sample = True
generation_config.temperature = 1e-5
generation_config.max_new_tokens = 1024

Downloading (…)lve/main/config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [8]:
# Text-generation pipeline around the already-loaded model and tokenizer.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    # One sequence per call; the returned text includes the prompt itself.
    num_return_sequences=1,
    return_full_text=True,
    # The EOS token id is used both to stop generation and as the padding id.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [4]:
# Persona instructions sent as the system message. .strip() removes the leading
# and trailing newlines that the triple-quoted layout would otherwise add.
SYSTEM_PROMPT = """
You're a salesman and beet farmer known as Dwight K Schrute from the TV show The Office. Dwight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.
""".strip()

In [9]:
# Two-message conversation: the persona as the system message, then the question.
messages = [
    dict(role="system", content=SYSTEM_PROMPT),
    dict(
        role="user",
        content="What is the capital of Egypt and what is it famous for?",
    ),
]

# Render the conversation as a plain string (tokenize=False) that ends with the
# assistant header (add_generation_prompt=True), then show it.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
What is the capital of Egypt and what is it famous for?</s>
<|assistant|>



In [10]:
%%time
# Run the pipeline on the rendered chat prompt; %%time reports the cell's CPU and wall time.
output = llm(prompt)

CPU times: user 32.2 s, sys: 122 ms, total: 32.3 s
Wall time: 36.4 s


In [11]:
# Print the text of the first returned sequence (the prompt followed by the model's reply).
print(output[0]["generated_text"])

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
What is the capital of Egypt and what is it famous for?</s>
<|assistant|>
Dwight: The capital of Egypt is Cairo, and it is famous for its ancient pyramids, which are some of the most iconic and awe-inspiring structures in the world. They were built over 4,500 years ago as tombs for pharaohs, and they continue to draw millions of visitors each year. Cairo is also a bustling modern city with a rich cultural heritage and a thriving economy.


In [12]:
def predict(prompt: str, system_prompt: str = "") -> str:
  """Send a single user message through the chat template and the pipeline.

  Args:
    prompt: Text of the user message.
    system_prompt: Optional system message. When empty (the default), the
      conversation holds only the user message.

  Returns:
    The "generated_text" field of the first result returned by `llm`.
  """
  conversation = [{"role": "user", "content": prompt}]
  if system_prompt:
    # The system message, when given, goes ahead of the user message.
    conversation.insert(0, {"role": "system", "content": system_prompt})

  # Render to a string that ends with the assistant header so the model
  # continues with its reply.
  chat_text = tokenizer.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
  )
  return llm(chat_text)[0]["generated_text"]

In [13]:
%%time
prompt = """
Write an email to a new client to offer a subscription for a paper supply for 1 year.
""".strip()
print(predict(prompt, SYSTEM_PROMPT))

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
Write an email to a new client to offer a subscription for a paper supply for 1 year.</s>
<|assistant|>
Subject: Exclusive Offer: Subscribe for a Year's Worth of High-Quality Paper Supplies

Dear [Client Name],

I hope this email finds you well. As a valued customer, we at Dunder Mifflin Scranton are delighted to offer you an exclusive subscription for a year's worth of our premium paper supplies.

At Dunder Mifflin, we pride ourselves on providing our clients with the highest quality paper products available in the market. Our paper is sourced from sustainable forests, ensuring that you're not only getting top-notch products but also contributing to a greener environment.

By subscribing for a year, you'll enj

In [14]:
%%time
prompt = """
Write a python function that calculates the squareroot of a multiplication of two numbers.
""".strip()
print(predict(prompt, SYSTEM_PROMPT))

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
Write a python function that calculates the squareroot of a multiplication of two numbers.</s>
<|assistant|>
Here's the Python function to calculate the square root of the product of two numbers:

```python
import math

def square_root_of_product(num1, num2):
    product = num1 * num2
    return math.sqrt(product)
```

You can import the `math` module to use the `sqrt()` function to calculate the square root. The function takes two arguments `num1` and `num2`, multiplies them, and then calculates the square root of the product using the `sqrt()` function. The result is returned by the function.
CPU times: user 40.1 s, sys: 106 ms, total: 40.2 s
Wall time: 41.6 s


In [15]:
%%time
prompt = """
Write a function in python that checks for palindrome number.
""".strip()
print(predict(prompt, SYSTEM_PROMPT))

<|system|>
You're a salesman and beet farmer know as Dwight K Schrute from the TV show The Office. Dwgight replies just as he would in the show.
You always reply as Dwight would reply. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
Write a function in python that checks for palindrome number.</s>
<|assistant|>
Here's a Python function to check if a number is a palindrome:

```python
def is_palindrome(n):
    # Convert number to string and reverse it
    reversed_num = int(str(n)[::-1])
    
    # Compare the original number with the reversed number
    return n == reversed_num
```

Explanation:

1. The function takes a single argument `n`, which is the number to be checked for palindrome.
2. We first convert the number to a string using the `str()` function.
3. We then reverse the string using the slice notation `[::-1]`.
4. We convert the reversed string back to an integer using the `int()` function.
5. Finally, we compare the original n

In [16]:
%%time
prompt = """
Calculate the answer:
3 + 8 - 2 = ?
""".strip()
print(predict(prompt))

<|user|>
Calculate the answer:
3 + 8 - 2 = ?</s>
<|assistant|>
The answer is: 5

Explanation:

To calculate this expression, follow the order of operations (PEMDAS):

1. First, add 3 and 8: 3 + 8 = 11
2. Next, subtract 2: 11 - 2 = 9

So the answer is 9, but since we're only asked to find the value of the question mark, we can simplify the expression further:

3 + 8 - 2 = 3 + 6 = 9

Therefore, the answer is 5.
CPU times: user 39.1 s, sys: 111 ms, total: 39.2 s
Wall time: 39.4 s


In [17]:
%%time
prompt = """
Calculate the answer:
3 + 8 - 2 = ?
Think step by step
""".strip()
print(predict(prompt))

<|user|>
Calculate the answer:
3 + 8 - 2 = ?
Think step by step</s>
<|assistant|>
Sure, here's how you can calculate the answer step by step:

1. Start with the leftmost operation, which is addition (3 + 8). Add the numbers:

   - 3 + 8 = 11

2. Move to the next operation, which is subtraction (11 - 2). Subtract the second number from the first:

   - 11 - 2 = 9

3. The final answer is 9.

So, the expression 3 + 8 - 2 is equivalent to 9.
CPU times: user 38.4 s, sys: 123 ms, total: 38.5 s
Wall time: 39.1 s


In [18]:
%%time
prompt = """
Calculate the answer:
2 + 4 ** 2 = ?
""".strip()
print(predict(prompt))

<|user|>
Calculate the answer:
2 + 4 ** 2 = ?</s>
<|assistant|>
2 + 16 = 18

Explanation:

First, we calculate the value of 4 raised to the power of 2 (4 ** 2) which is 16.

Then, we add 2 to 16 to get the final answer, which is 18.

So, the expression 2 + 4 ** 2 is equal to 18.
CPU times: user 26.7 s, sys: 74.7 ms, total: 26.7 s
Wall time: 26.9 s


In [19]:
%%time

text = """
In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned
large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.
Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our
models outperform open-source chat models on most benchmarks we tested, and based on
our human evaluations for helpfulness and safety, may be a suitable substitute for closedsource models. We provide a detailed description of our approach to fine-tuning and safety
improvements of Llama 2-Chat in order to enable the community to build on our work and
contribute to the responsible development of LLMs.
"""

prompt = f"""
Use the text to describe the benefits of Llama 2 in a bullet list:
{text}
""".strip()

print(predict(prompt))

<|user|>
Use the text to describe the benefits of Llama 2 in a bullet list:

In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned
large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.
Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our
models outperform open-source chat models on most benchmarks we tested, and based on
our human evaluations for helpfulness and safety, may be a suitable substitute for closedsource models. We provide a detailed description of our approach to fine-tuning and safety
improvements of Llama 2-Chat in order to enable the community to build on our work and
contribute to the responsible development of LLMs.</s>
<|assistant|>
Benefits of Llama 2:

- Pretrained and fine-tuned LLMs ranging from 7 to 70 billion parameters
- Optimized for dialogue use cases with fine-tuned LLMs called Llama 2-Chat
- Outperforms open-source chat models on most benchmarks tested
- May be

In [20]:
%%time
table = """
|Model|Size|Code|Commonsense Reasoning|World Knowledge|Reading Comprehension|Math|MMLU|BBH|AGI Eval|
|---|---|---|---|---|---|---|---|---|---|
|Llama 1|7B|14.1|60.8|46.2|58.5|6.95|35.1|30.3|23.9|
|Llama 1|13B|18.9|66.1|52.6|62.3|10.9|46.9|37.0|33.9|
|Llama 1|33B|26.0|70.0|58.4|67.6|21.4|57.8|39.8|41.7|
|Llama 1|65B|30.7|70.7|60.5|68.6|30.8|63.4|43.5|47.6|
|Llama 2|7B|16.8|63.9|48.9|61.3|14.6|45.3|32.6|29.3|
|Llama 2|13B|24.5|66.9|55.4|65.8|28.7|54.8|39.4|39.1|
|Llama 2|70B|**37.5**|**71.9**|**63.6**|**69.4**|**35.2**|**68.9**|**51.2**|**54.2**|
"""

prompt = f"""
Use the data from the markdown table:

```
{table}
```

to answer the question:
Extract the Reading Comprehension score for Llama 2 7B
"""

print(predict(prompt))

<|user|>

Use the data from the markdown table:

```

|Model|Size|Code|Commonsense Reasoning|World Knowledge|Reading Comprehension|Math|MMLU|BBH|AGI Eval|
|---|---|---|---|---|---|---|---|---|---|
|Llama 1|7B|14.1|60.8|46.2|58.5|6.95|35.1|30.3|23.9|
|Llama 1|13B|18.9|66.1|52.6|62.3|10.9|46.9|37.0|33.9|
|Llama 1|33B|26.0|70.0|58.4|67.6|21.4|57.8|39.8|41.7|
|Llama 1|65B|30.7|70.7|60.5|68.6|30.8|63.4|43.5|47.6|
|Llama 2|7B|16.8|63.9|48.9|61.3|14.6|45.3|32.6|29.3|
|Llama 2|13B|24.5|66.9|55.4|65.8|28.7|54.8|39.4|39.1|
|Llama 2|70B|**37.5**|**71.9**|**63.6**|**69.4**|**35.2**|**68.9**|**51.2**|**54.2**|

```

to answer the question:
Extract the Reading Comprehension score for Llama 2 7B
</s>
<|assistant|>
The Reading Comprehension score for Llama 2 with a size of 7B is 61.3, as shown in the second row of the table. However, there is another row for Llama 2 with a size of 70B, which has a higher Reading Comprehension score of 69.4, as indicated in bold. Therefore, the Reading Comprehension sc

In [21]:
%%time
table = """
|Model|Size|Code|Commonsense Reasoning|World Knowledge|Reading Comprehension|Math|MMLU|BBH|AGI Eval|
|---|---|---|---|---|---|---|---|---|---|
|Llama 1|7B|14.1|60.8|46.2|58.5|6.95|35.1|30.3|23.9|
|Llama 1|13B|18.9|66.1|52.6|62.3|10.9|46.9|37.0|33.9|
|Llama 1|33B|26.0|70.0|58.4|67.6|21.4|57.8|39.8|41.7|
|Llama 1|65B|30.7|70.7|60.5|68.6|30.8|63.4|43.5|47.6|
|Llama 2|7B|16.8|63.9|48.9|61.3|14.6|45.3|32.6|29.3|
|Llama 2|13B|24.5|66.9|55.4|65.8|28.7|54.8|39.4|39.1|
|Llama 2|70B|**37.5**|**71.9**|**63.6**|**69.4**|**35.2**|**68.9**|**51.2**|**54.2**|
"""

prompt = f"""
Use the data from the markdown table:

```
{table}
```

to answer the question:
Calculate how much better (% increase) is Llama 2 7B vs Llama 1 7B on Reading Comprehension?
"""

print(predict(prompt))


<|user|>

Use the data from the markdown table:

```

|Model|Size|Code|Commonsense Reasoning|World Knowledge|Reading Comprehension|Math|MMLU|BBH|AGI Eval|
|---|---|---|---|---|---|---|---|---|---|
|Llama 1|7B|14.1|60.8|46.2|58.5|6.95|35.1|30.3|23.9|
|Llama 1|13B|18.9|66.1|52.6|62.3|10.9|46.9|37.0|33.9|
|Llama 1|33B|26.0|70.0|58.4|67.6|21.4|57.8|39.8|41.7|
|Llama 1|65B|30.7|70.7|60.5|68.6|30.8|63.4|43.5|47.6|
|Llama 2|7B|16.8|63.9|48.9|61.3|14.6|45.3|32.6|29.3|
|Llama 2|13B|24.5|66.9|55.4|65.8|28.7|54.8|39.4|39.1|
|Llama 2|70B|**37.5**|**71.9**|**63.6**|**69.4**|**35.2**|**68.9**|**51.2**|**54.2**|

```

to answer the question:
Calculate how much better (% increase) is Llama 2 7B vs Llama 1 7B on Reading Comprehension?
</s>
<|assistant|>
To calculate the percentage increase in Reading Comprehension score for Llama 2 7B compared to Llama 1 7B, follow these steps:

1. Find the Reading Comprehension scores for both models:
   - Llama 1 7B: 58.5
   - Llama 2 7B: 61.3

2. Calculate the diffe