In [3]:
%%capture
!pip install langchain>=0.1.17 openai>=1.13.3 langchain_openai>=0.1.6 transformers>=4.40.1 datasets>=2.18.0 accelerate>=0.27.2 sentence-transformers>=2.5.1 duckduckgo-search>=5.2.2
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [6]:
# Load the Phi-3 model and tokenizer, wrap them in a text-generation pipeline,
# and run a first prompt.
import transformers

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Create the pipeline.
# The factory is reached through the module (`transformers.pipeline`) because
# this cell binds the name `pipeline` to the resulting generator object.
# Calling the bare name here would make the cell fail when it is re-run, since
# `pipeline` would then already refer to the generator rather than the factory.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,  # return only the newly generated text
    max_new_tokens=500,
    do_sample=False,         # deterministic (greedy) decoding by default
)

# Create a prompt in chat-message format
messages = [{"role": "user", "content": "Create a funny joke about chickens."}]

# Generate output
output = pipeline(messages)

# Print the generated reply
print(output[0]["generated_text"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Device set to use cuda
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 Why did the chicken join the band? Because it had the drumsticks!


In [8]:
# The pipeline normally formats chat messages through the tokenizer's chat
# template behind the scenes. Here the template is applied by hand, keeping the
# result as text (tokenize=False), so the formatted prompt can be inspected.
prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
)

# Show the rendered prompt, special tokens included
print(prompt)


<|user|>
Create a funny joke about chickens.<|end|>
<|endoftext|>


### Controlling Model Output: `temperature` and `top_p`

In [9]:
# Sample with a comparatively high temperature (1 leaves the logits unscaled).
# Sampling is random, so seed the generator first to make the printed joke
# reproducible across runs.
torch.manual_seed(42)
output = pipeline(messages, do_sample=True, temperature=1)
print(output[0]["generated_text"])

 Why did the chicken refuse to go to the farm? Because it heard the farm had too many fowl policies, and it wanted to buy its own chicken coop!


In [10]:
# Sample with a high top_p (1 keeps the whole token distribution in play).
# Sampling is random, so seed the generator first to make the printed joke
# reproducible across runs.
torch.manual_seed(42)
output = pipeline(messages, do_sample=True, top_p=1)
print(output[0]["generated_text"])

 Why did the chicken join the dance class? Because it wanted to learn the latest “roost-to-stomp” moves!


In [11]:
# Prompt components: every piece starts out as an empty placeholder string,
# ready to be filled in one at a time.
persona = instruction = context = data_format = audience = tone = text = data = ""

# The full prompt - remove and add pieces to view their impact on the generated output.
# NOTE(review): `text` is defined above but is not one of the concatenated
# pieces; presumably it is meant to be embedded in `data` -- confirm.
query = "".join(
    (persona, instruction, context, data_format, audience, tone, data)
)

In [13]:
# One-shot prompt: a single worked example (user question plus assistant
# answer) shows the model how to use a made-up word in a sentence, followed by
# the new question it should answer in the same way.
one_shot_prompt = [
    {
        "role": "user",
        "content": "A 'Gigamuru' is a type of Japanese musical instrument. An example of a sentence that uses the word Gigamuru is:"
    },
    {
        "role": "assistant",
        # The example answer must spell the made-up word exactly as the
        # question does ("Gigamuru"); otherwise the demonstration never
        # actually uses the word it is supposed to illustrate.
        "content": "I have a Gigamuru at home that my uncle gave me as a gift. I love to play it at home."
    },
    {
        "role": "user",
        "content": "To 'screeg' something is to swing a sword at it. An example of a sentence that uses the word screeg is:"
    }
]
# Render the conversation with the chat template to inspect the final prompt text
print(tokenizer.apply_chat_template(one_shot_prompt, tokenize=False))

<|user|>
A 'Gigamuru' is a type of Japanese musical instrument. An example of a sentence that uses the word Gigamuru is:<|end|>
<|assistant|>
I have a Gigamaru at home that my uncle gave me as a gift. I love to play it at home.<|end|>
<|user|>
To 'screeg' something is to swing a sword at it. An example of a sentence that uses the word screeg is:<|end|>
<|endoftext|>


In [14]:
# Run the one-shot conversation through the pipeline and show the first
# (and only requested) generated reply.
outputs = pipeline(one_shot_prompt)
print(
    outputs[0]["generated_text"]
)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 During the medieval reenactment, the knight skillfully screeged his opponent with a swift and precise motion, demonstrating the art of swordplay.


### Chain Prompting: Breaking up the Problem

In [15]:
# First link of the chain: ask the model for a product name and slogan.
# The reply is kept in `product_description` so a later prompt can build on it.
product_prompt = [
    dict(
        role="user",
        content="Create name and slogan for the chatbot that leverages LLMs.",
    ),
]
outputs = pipeline(product_prompt)
product_description = outputs[0]["generated_text"]
print(product_description)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 Name: ChatWise
Slogan: "Your AI Companion for Smart Conversations"


In [17]:
# Second link of the chain: embed the previously generated name and slogan
# (`product_description`) in a new prompt that asks for a sales pitch.
sales_prompt = [
    dict(
        role="user",
        content=f"Generate a very short sales pitch for the following product: {product_description}",
    ),
]
outputs = pipeline(sales_prompt)
sales_pitch = outputs[0]["generated_text"]
print(sales_pitch)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 Introducing ChatWise, your AI Companion for Smart Conversations! With ChatWise, you'll have a personalized assistant that understands your needs and helps you navigate through any conversation. Whether you're looking for information, advice, or just someone to talk to, ChatWise is here to make your life easier. Try ChatWise today and experience the power of AI-driven communication!


### Chain-of-Thought: Prompting with a ***reasoning*** example

In [18]:
# Few-shot chain-of-thought: the example answer spells out its arithmetic
# before giving the result, nudging the model to reason the same way on the
# follow-up question.
cot_prompt = [
    dict(
        role="user",
        content="Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?",
    ),
    dict(
        role="assistant",
        content="Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.",
    ),
    dict(
        role="user",
        content="The cafeteria had 23 apples. If they used 20 for lunch and bought 6 more, how many apples do they have?",
    ),
]
outputs = pipeline(cot_prompt)
cot_output = outputs[0]["generated_text"]
print(cot_output)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 The cafeteria started with 23 apples. They used 20 for lunch, so they had 23 - 20 = 3 apples left. Then they bought 6 more apples, so they now have 3 + 6 = 9 apples. The answer is 9.


In [19]:
# Zero-shot chain-of-thought: no worked example is supplied; the trailing
# phrase "Let's think step-by-step." alone asks the model to show its reasoning.
zero_cot_prompt = [
    dict(
        role="user",
        content="The cafeteria had 23 apples. If they used 20 for lunch and bought 6 more, how many apples do they have? Let's think step-by-step.",
    ),
]
outputs = pipeline(zero_cot_prompt)
zero_cot_output = outputs[0]["generated_text"]
print(zero_cot_output)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 Step 1: Start with the initial number of apples in the cafeteria, which is 23.

Step 2: Subtract the number of apples used for lunch, which is 20.
23 - 20 = 3 apples remaining.

Step 3: Add the number of apples bought, which is 6.
3 + 6 = 9 apples.

So, the cafeteria now has 9 apples.


### Tree-of-Thought: Exploring intermediate steps

In [20]:
# Zero-shot tree-of-thought: a single prompt asks the model to role-play
# several experts who reason one step at a time and compare their results.
zeroshot_tot_prompt = [
    dict(
        role="user",
        content="Imagine three different experts are answering this question. All experts will write down 1 step of their thinking, then share it with the group. Then all experts will go on to the next step, etc. If any experts realize that they are wrong at any point then they leave. The question is: 'The cafeteria had 23 apples. If they used 20 for lunch and bought 6 more, how many apples do they have?' Make sure to discuss the results.",
    ),
]

# Run the prompt and display the generated discussion
outputs = pipeline(zeroshot_tot_prompt)
zeroshot_tot_output = outputs[0]["generated_text"]
print(zeroshot_tot_output)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 Expert 1:
Step 1: Start with the initial number of apples, which is 23.

Expert 2:
Step 1: Subtract the number of apples used for lunch, which is 20, from the initial number of apples (23 - 20 = 3).
Step 2: Add the number of apples bought, which is 6, to the remaining apples (3 + 6 = 9).

Expert 3:
Step 1: Subtract the number of apples used for lunch, which is 20, from the initial number of apples (23 - 20 = 3).
Step 2: Add the number of apples bought, which is 6, to the remaining apples (3 + 6 = 9).

Discussion:
All three experts arrived at the same answer, which is 9 apples. This indicates that their calculations were correct. The cafeteria initially had 23 apples, used 20 for lunch, and then bought 6 more, resulting in a total of 9 apples.


### Verifying the generated output using Constrained Sampling

In [26]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Using cached llama_cpp_python-0.3.14.tar.gz (51.0 MB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Installing backend dependencies ... done
  Preparing metadata (pyproject.toml) ... done
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... done
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.14-cp311-cp311-linux_x86_64.whl size=4299375 sha256=fb2ab2fa42915df83cb12ac6671b7f9a1a05fb1e1ff2db6cd2021ddd1de78bba
  Stored in directory: /root/.cache/pip/wheels/3f/b6/cf/7315ec7b0149210d2d4447d9c3338b36d10e56a1ecddcd35c0
Successfully built llama-cpp-python
Installing collected packages: diskcache, llama-cpp-python

In [29]:
import gc
import torch

# Free GPU memory before loading the next model.
# The garbage collector can only reclaim the transformers model once nothing
# refers to it any more, so the notebook-level names that hold it are dropped
# first. `pop(..., None)` keeps the cell safe to re-run after the names are gone.
# Re-run the model-loading cell if these objects are needed again.
for _stale_name in ("pipeline", "model", "tokenizer"):
    globals().pop(_stale_name, None)

gc.collect()
# Hand the now-unused cached blocks back to the GPU driver
torch.cuda.empty_cache()

In [34]:
from llama_cpp.llama import Llama

# Load the GGUF build of Phi-3 from the Hugging Face Hub.
llm = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*fp16.gguf",  # glob selecting the fp16 file in the repo
    n_ctx=2048,             # context window requested for this session
    n_gpu_layers=-1,        # request that all layers be offloaded to the GPU
    verbose=False,
)

./Phi-3-mini-4k-instruct-fp16.gguf:   0%|          | 0.00/7.64G [00:00<?, ?B/s]

llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


In [35]:
# Request a chat completion whose reply is constrained to a JSON object, then
# pull the reply text out of the first choice.
chat_completion = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "Create a warrior for an RPG in JSON format."},
    ],
    response_format={"type": "json_object"},
    temperature=0,
)
output = chat_completion["choices"][0]["message"]["content"]

KeyboardInterrupt: 

In [None]:
import json

# Parsing succeeds only if `output` is valid JSON (json.loads raises otherwise);
# the parsed value is then re-serialized with indentation for display.
parsed_output = json.loads(output)
json_output = json.dumps(parsed_output, indent=4)
print(json_output)