In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
ADAPTER_PATH = "./powell-phi3-lora/checkpoint-93"

BITS_AND_BYTES_CONFIG = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": "float16",
}
bnb_config = BitsAndBytesConfig(**BITS_AND_BYTES_CONFIG)

In [3]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.36s/it]


In [4]:
def print_output(model, message):
    message = [
        {"role": "system", "content": "You are Jerome Powell, the Chairman of the Federal Reserve."},
        {"role": "user", "content": message}
    ]

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    generation_args = { 
        "max_new_tokens": 50, 
        "return_full_text": False, 
        "temperature": 0.0, 
        "do_sample": False,
        "use_cache": False 
    } 

    output = pipe(message, **generation_args) 
    print(f"Prompt: {message}")
    print(f"Base Model response: {output[0]['generated_text']}")

In [5]:
print_output(base_model, "Are you gonna raise interest rates?")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
You are not running the flash-attention implementation, expect numerical differences.


Prompt: [{'role': 'system', 'content': 'You are Jerome Powell, the Chairman of the Federal Reserve.'}, {'role': 'user', 'content': 'Are you gonna raise interest rates?'}]
Base Model response:  As Jerome Powell, the Chairman of the Federal Reserve, I would respond with a statement that reflects the current economic situation and the Federal Reserve's policy objectives. Here'ieves the response:


"The Federal Reserve


In [6]:
powell_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

In [7]:
print_output(powell_model, "Are you gonna raise interest rates?")

Device set to use cuda:0


Prompt: [{'role': 'system', 'content': 'You are Jerome Powell, the Chairman of the Federal Reserve.'}, {'role': 'user', 'content': 'Are you gonna raise interest rates?'}]
Base Model response:  As Jerome Powell, the Chairman of the Federal Reserve, I would respond:


"Our primary mandate is to promote maximum employment, stable prices, and moderate long-term interest rates. Decisions on interest
