In [5]:
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load the model with 4-bit (bitsandbytes) quantization to cut its memory footprint.
# Source: https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # Matmuls are computed in bfloat16 even though the weights are stored in 4 bits.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Device placement must be requested when the weights are loaded: `device_map`
# given to `transformers.pipeline` is only forwarded as a model-loading kwarg and
# is therefore ignored when an already-instantiated model is passed in.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# No `torch_dtype` / `device_map` here: the model object above already carries its
# quantization config, compute dtype and device placement, and the pipeline reuses
# them. (The previous `torch_dtype=torch.float16` also contradicted the bfloat16
# compute dtype configured above.)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# (Re)build the text-generation pipeline around the already-loaded `model`.
# `torch_dtype` and `device_map` are intentionally not passed: they are only
# forwarded as model-loading kwargs, so they have no effect on a model object
# that has already been instantiated (and quantized) by `from_pretrained`.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [6]:
import sys; sys.path.append("../")
from prompt import Prompt

In [7]:
# Show the raw system-prompt template used for question decomposition.
# NOTE(review): the text is printed as stored, so its "{question}" placeholder
# appears unfilled in the output.
print(Prompt.cot_prompt)

# Question Decomposition Specialist

    ## Background
    - You are an expert at analyzing problems and are good at breaking down difficult problems into simple problems.
    - A person facing the problem {question} is asking you for help. The question is hard to answer directly.

    ## Goal
    Helping the user decompose the question and tell the user at the right time that the problem can be solved.

    ## Constraint
    - Forget all the knowledge you've learned before and decide whether to continue decomposing the question based only on the user's answers.
    - To make it easier for the user to answer, only one simple question is asked at once.
    - You can only decompose the question, do not answer it directly.

    ## Workflow
    1. Analyse the original complex question and formulate a simple question based on that complex question.
    2. Receive the user's answer to the simple question at hand.
    2.1 If the user is unable to answer the current simple question, rephrase a

In [None]:
def decompose_question(question, max_new_tokens=400):
    """Ask the model to break a complex question down into simpler ones.

    Builds a two-message chat (system: the decomposition-specialist prompt,
    user: the question), renders it with the tokenizer's chat template and
    samples a single completion from the module-level ``pipeline``.

    Args:
        question: The complex question to decompose.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. Defaults to 400.

    Returns:
        The value returned by ``pipeline`` for this prompt, unmodified.
    """
    # The system prompt contains a literal "{question}" placeholder. Fill it
    # with str.replace rather than str.format so any other braces in the
    # template are left untouched.
    system_prompt = Prompt.cot_prompt.replace("{question}", str(question))

    cot_prompt = [
        {'role': 'system', 'content': system_prompt},  # Question Decomposition Specialist prompt
        {'role': 'user', 'content': f"Let's break down this complex question: {question}"},
    ]

    # Render the messages into one string with the model's control tokens.
    # Previously the Python repr of the list itself was sent to the model,
    # which is not the chat format an instruct model expects.
    cot_prompt_string = tokenizer.apply_chat_template(
        cot_prompt,
        tokenize=False,
        add_generation_prompt=True,  # end with the assistant header so the model answers
    )

    response = pipeline(
        cot_prompt_string,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        truncation=True,
        # Budget only the generated tokens. `max_length` counted the (long)
        # prompt as well, which could leave little or no room for an answer.
        max_new_tokens=max_new_tokens,
        # The chat template already emitted the special tokens; do not let the
        # tokenizer prepend them a second time.
        add_special_tokens=False,
    )

    return response