Evaluations with zero-shot, 5-shot, full fine-tune, merged LoRA, merged DoRA.

In [1]:
import torch
from pathlib import Path
import os, json
from safetensors.torch import save_file
import copy
from tqdm import tqdm
import safetensors
import safetensors.torch
from glob import glob
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from hqq.core.quantize import HQQLinear, HQQBackend, BaseQuantizeConfig, Quantizer

from hqq.core.quantize import HQQLinear, BaseQuantizeConfig, Quantizer
from hqq.backends.torchao import patch_hqq_to_aoint4
from fastcore.script import *

from accelerate import init_empty_weights


In [2]:
# with init_empty_weights():
# 	model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

In [3]:
# def print_modules(model, prefix="model"):
#     for name, module in model.named_children():
#         if len(list(module.children())) > 0:
#             print_modules(module, prefix + "." + name)
#         print(prefix + "." + name)

In [4]:
# "18,19,20,26,27,28".split(",")

In [5]:
# print_modules(model)

### Evaluate

In [6]:
import re
from datasets import load_dataset
from fastcore.parallel import parallel
from vllm import LLM, SamplingParams

In [7]:
def extract_last_number_or_ratio(s):
    """Return the last number-like token found in ``s``, or ``None`` if there is none.

    A token is a run of digits with an optional leading currency symbol
    ($, €, £), an optional decimal part, and an optional ``:<number>`` ratio
    suffix. The token is returned exactly as it appears in the text, so any
    currency prefix is kept (e.g. ``"$5.50"``).
    """
    last_match = None
    # Scan left to right and remember only the most recent match.
    for match in re.finditer(r'[\$€£]?\d+(?:\.\d+)?(?:\:\d+(?:\.\d+)?)?', s):
        last_match = match.group(0)
    return last_match

In [8]:
def exact_match_score(preds, labels):
    """Fraction of predictions that exactly equal their paired label.

    Args:
        preds: Predicted short answers (``None`` where nothing was extracted).
        labels: Ground-truth short answers, aligned index-by-index with ``preds``.

    Returns:
        Number of equal pairs divided by ``len(preds)``; ``0.0`` when ``preds``
        is empty.

    Pairs are formed with ``zip``, so labels beyond ``len(preds)`` are ignored
    and predictions without a paired label count as misses.
    """
    preds = list(preds)
    # Guard the empty case, which would otherwise raise ZeroDivisionError.
    if not preds:
        return 0.0
    return sum(p == g for p, g in zip(preds, labels)) / len(preds)

In [9]:
# Shuffle with a fixed seed, then keep only the final 5,000 rows of the shuffled split.
dataset = load_dataset("microsoft/orca-math-word-problems-200k")['train'].shuffle(seed=42)
holdout_start = len(dataset) - 5000
dataset = dataset.select(range(holdout_start, len(dataset)))
# Ground-truth short answer = last number/ratio token in each reference answer.
short_answers_gt = parallel(extract_last_number_or_ratio, dataset['answer'], progress=True)

In [10]:
# Validation set: the first 500 rows of `dataset`.
valid_dataset = dataset.select(range(500))

In [11]:
# Plain-text prompts (no chat template): one "###Question / ###Answer" prompt per validation question.
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]

In [12]:
# First 500 ground-truth short answers, matching the 500 rows selected into valid_dataset.
labels = short_answers_gt[:500]

In [13]:
len(inputs), len(labels)

(500, 500)

In [14]:
NUM_GPUS = torch.cuda.device_count(); NUM_GPUS

1

In [15]:
TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [16]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [17]:
def convert_to_chat_input(question):
    """Render ``question`` as a chat prompt string ending with the generation prompt.

    The conversation is a fixed math-assistant system message followed by the
    question as the user turn, formatted with the module-level ``tokenizer``'s
    chat template and returned as text (not token ids).
    """
    system_turn = {"role": "system", "content": "You are an AI assistant that excels in solving math problems."}
    user_turn = {"role": "user", "content": question}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

In [18]:
# One chat-formatted prompt per validation question, in dataset order.
chat_inputs = list(map(convert_to_chat_input, valid_dataset['question']))

In [19]:
len(chat_inputs)

500

#### FINETUNED 

In [20]:
# Root directory holding the exported checkpoints. Override with the MODEL_DIR
# environment variable; the default keeps the original location.
model_dir = os.environ.get("MODEL_DIR", "/workspace/models/")

In [21]:
# TODO: Don't cast to float16 for bitblas

In [22]:
# NOTE: 4/2 bit models become very repetitive

# 4bit HQQ (skip_dora_all)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_250/vllm_tinygemm" # 0.578

# 4bit HQQ+DORA
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_250/vllm_tinygemm" # 0.593
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_1000/vllm_tinygemm" # 0.610

# 4/2bit HQQ+DORA
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_250/vllm_bitblas" # 0.290
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_1000/vllm_bitblas" # 0.380

# 4/2bit HQQ+DORA (merged)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_250/merged" # 0.242
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_1000/merged" # 0.350

# 4bit HQQ+DORA+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit-ln/step_250/vllm_tinygemm" # 0.562
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit-ln/step_1000/vllm_tinygemm" # 0.585

# 4bit HQQ+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4bit-ln/step_250/vllm_tinygemm" # 0.605
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4bit-ln/step_1000/vllm_tinygemm" # 0.55

# 4/2bit HQQ+DORA+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_250/vllm_bitblas" # 0.270
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/vllm_bitblas" # 0.42

# 4/2bit HQQ+DORA+LN (merged)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_250/merged" # 0.270
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/merged" # 0.343

# 4(HQQ)bit 2(HQQ+DORA)bit
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit/step_250/vllm_bitblas" # 0.257
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit/step_1000/vllm_bitblas" # 0.330

# 4(HQQ)bit 2(HQQ+DORA)bit+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit-ln/step_250/vllm_bitblas" # 0.265
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit-ln/step_1000/vllm_bitblas" # 0.375

# 4(HQQ)bit 2(HQQ)bit+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-hqq-2bit-ln/step_250/vllm_bitblas" # 0.02
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-hqq-2bit-ln/step_1000/vllm_bitblas" # 0.03


# Model generations become more repetitive with more training steps at temp=0.0.
# Frequency penalty helps with this, but needs more investigation.

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.26

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.24
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.31

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.29

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.30
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.35

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.26
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.35

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.27
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.36

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.20
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.32

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.27
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.30

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.23
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.34



# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.29

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # -
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.27

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.0
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.0

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.29
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.45

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.27
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.37

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.26
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.27

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.28
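# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.32 -- FIXME: rank-64 path (and score) repeated from the rank-64 sweep; the rank-256 step_250 run for this config looks missing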

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.27
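# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.30 -- FIXME: rank-64 path (and score) repeated from the rank-64 sweep; the rank-256 step_250 run for this config looks missing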

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.30



# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.26

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.19
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.07

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.02
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.07

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.28
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.32

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.41

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.21
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.20

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.31

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.23
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.36

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.26
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.31



# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_125/vllm_bitblas" # 0.41
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_250/vllm_bitblas" # 0.43
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_375/vllm_bitblas" # 0.38
MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_500/vllm_bitblas" # 0.46


# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_250/vllm_bitblas" # 0.32
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_375/vllm_bitblas" # 0.32
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_500/vllm_bitblas" # 0.33


# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_125/vllm_bitblas" # 0.28
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_250/vllm_bitblas" # 0.39
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_375/vllm_bitblas" # 0.41
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_500/vllm_bitblas" # 0.414


In [23]:
# Build the vLLM engine for the selected checkpoint (quantization="bitblas", float16).
llm = LLM(
    model=os.path.join(model_dir, MODEL_NAME),
    tokenizer=TOKENIZER,
    tensor_parallel_size=NUM_GPUS,
    max_model_len=8192,
    quantization="bitblas",
    dtype="float16",
    gpu_memory_utilization=0.8,
)

INFO 08-26 14:09:22 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/workspace/models/llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_500/vllm_bitblas', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitblas, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/workspace/models/llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-bas

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


Loading model.embed_tokens.weight
Loaded model.embed_tokens.weight
Loading model.layers.0.input_layernorm.weight
Loaded model.layers.0.input_layernorm.weight
Loading model.layers.0.mlp.down_proj.lora_A
Loaded model.layers.0.mlp.down_proj.lora_A
Loading model.layers.0.mlp.down_proj.lora_B
Loaded model.layers.0.mlp.down_proj.lora_B
Loading model.layers.0.mlp.down_proj.qweight
Loaded model.layers.0.mlp.down_proj.qweight
Loading model.layers.0.mlp.down_proj.rescale
Loaded model.layers.0.mlp.down_proj.rescale
Loading model.layers.0.mlp.down_proj.scales
Loaded model.layers.0.mlp.down_proj.scales
Loading model.layers.0.mlp.down_proj.zeros
Loaded model.layers.0.mlp.down_proj.zeros
Loading model.layers.0.mlp.gate_proj.lora_A
Loaded model.layers.0.mlp.gate_up_proj.lora_A
Loading model.layers.0.mlp.gate_proj.lora_B
Loaded model.layers.0.mlp.gate_up_proj.lora_B
Loading model.layers.0.mlp.gate_proj.qweight
Loaded model.layers.0.mlp.gate_up_proj.qweight
Loading model.layers.0.mlp.gate_proj.rescale
L

In [24]:
# base model
# outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))

In [25]:
# chat model
# Greedy decoding (temperature=0.0), stopping at the end-of-turn marker.
# NOTE(review): only the first 128 of the 500 chat prompts are generated here, so the
# resulting score covers a smaller sample than the 500-prompt N-SHOT runs — confirm intended.
outputs = llm.generate(chat_inputs[:128], SamplingParams(temperature=0.0, max_tokens=1024, stop=["<|eot_id|>"]))

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 128/128 [00:26<00:00,  4.86it/s, est. speed input: 526.13 toks/s, output: 1337.20 toks/s]


In [26]:
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]

In [27]:
# Score against the labels for exactly the prompts that were generated, rather than
# relying on zip() silently dropping the labels that have no prediction.
exact_match_score(short_answers_pred, labels[:len(short_answers_pred)])

0.4140625

In [30]:
# Qualitative check outside the math task: a code-generation prompt, decoded greedily
# with frequency and presence penalties. Note that this rebinds `outputs`.
chat_input = [convert_to_chat_input("Write python code for image classification using HF transformers library. Start with `from transformers import ...`")]
outputs = llm.generate(chat_input, SamplingParams(temperature=0.0, max_tokens=512, stop=["<|eot_id|>"],
                                                  frequency_penalty=0.3, presence_penalty=0.5))
print(outputs[0].outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.67s/it, est. speed input: 14.58 toks/s, output: 109.74 toks/s]

Here's an example of how you can use the HF Transformers library to perform image classification:

```python
from transformers import HuggingFaceTransformers

# Load pre-trained model
model = HuggingFaceTransformers.from_pretrained('bert-base-uncased')

# Load dataset
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Load images and labels
train_data = datasets.ImageNet('data/imagenet', split='train')
test_data = datasets.ImageNet('data/imagenet', split='val')

# Create data loaders
train_loader = DataLoader(train_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train model for 10 epochs
for epoch in range(10):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(images)




In [29]:
# Qualitative check: a factual-list prompt, decoded greedily with a frequency penalty only.
# The prompt text is kept verbatim (typos included) because the recorded output came from it.
chat_input = [convert_to_chat_input("List top 10 scientist from 15th century as numbered list with their dob and death. Inclde da vinci.")]
outputs = llm.generate(chat_input, SamplingParams(temperature=0.0, max_tokens=256, stop=["<|eot_id|>"],
                                                  frequency_penalty=0.5, presence_penalty=0.))
print(outputs[0].outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it, est. speed input: 33.89 toks/s, output: 109.81 toks/s]

Here is a list of the top 10 scientists from the 15th century, including Leonardo da Vinci:

1. Leonardo da Vinci (April 15, 1452 - May 2, 1519)
2. Johannes Gutenberg (c. 1398 - c. 1468)
3. Johannes Kepler (December 27, 1571 - November 17, 1633)
4. Galileo Galilei (February 15, 1564 - April 7, 1632)
5. Tycho Brahe (November 14, 1543 - October 24, 1601)
6. Andreas Vesalius (June or July, c.1520 - June or July, c.1554)
7. William Shakespeare (April or May, c.1564 - April or May, c.1616)
8. Francis Bacon (January or February, c.1561 - January or February, c.1626)
9. Girolamo Cardano (September or October, c.1500 - September or October, c.1570)
10. Erasmus Roterodamus (c.circa1555)





#### N-SHOT

In [None]:
# Unquantized instruct baseline for the zero-/few-shot runs below (bfloat16).
# NOTE(review): this loads Llama-3-8B-Instruct, while TOKENIZER (used to build chat_inputs)
# and the fine-tuned checkpoint names are Llama-3.1 — confirm this is the intended baseline.
# NOTE(review): this creates a second engine and rebinds `llm`/`MODEL_NAME`; presumably this
# section was run in a fresh kernel rather than alongside the engine created earlier — confirm.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = LLM(model=MODEL_NAME, tensor_parallel_size=NUM_GPUS, dtype="bfloat16")

In [17]:
# zero-shot
# Plain "###Question / ###Answer" prompts (no chat template), greedy decoding.
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))
# Predicted short answer = last number/ratio token in each generation.
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:20<00:00,  6.18it/s]


0.228

In [17]:
# zero-shot (instruct)
# Same evaluation, but using the chat-template prompts built earlier in `chat_inputs`.
outputs = llm.generate(chat_inputs, SamplingParams(temperature=0.0, max_tokens=1024))
# Predicted short answer = last number/ratio token in each generation.
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:07<00:00,  7.41it/s]


0.454

In [None]:
# 5-shot
# Worked examples come from the last 5 rows of `dataset`; each one ends with a literal
# "<stop>" marker, which is also passed as a stop string so generation ends after one answer.
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>" for ex in 
                     dataset.select(range(len(dataset)-5,len(dataset)))]
few_shot_prompt = "\n\n".join(few_shot_examples)
# Prepend the shared few-shot block to every validation question.
inputs = [few_shot_prompt + "\n\n" + f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, 
                                              stop_token_ids=[tokenizer.eos_token_id], 
                                              stop=["<stop>"], 
                                              max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts:   9%|▉         | 47/500 [01:06<02:45,  2.73it/s] 

In [30]:
# Few-shot examples from the last 5 rows of `dataset`, each terminated by a literal "<stop>".
# This repeats the definition used in the 5-shot cell.
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>" for ex in 
                     dataset.select(range(len(dataset)-5,len(dataset)))]

In [32]:
# Join the examples into one block, separated by blank lines.
few_shot_prompt = "\n\n".join(few_shot_examples)

In [37]:
def fewshot_chat_input(question, answer=None):
    """Render ``question`` as a chat prompt whose system message carries the few-shot examples.

    The module-level ``few_shot_prompt`` is embedded in the system turn and the
    question becomes the user turn. The result is chat-template text (not token
    ids) ending with the generation prompt. ``answer`` is accepted but not used.
    """
    system_text = f"You are an AI assistant that excels in solving math problems. Here are few examples of math problems:\n\n{few_shot_prompt}"
    conversation = [
        dict(role="system", content=system_text),
        dict(role="user", content=question),
    ]
    return tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

In [38]:
# One few-shot chat prompt per validation question, in dataset order.
fewshot_chat_inputs = list(map(fewshot_chat_input, valid_dataset['question']))

In [40]:
# 5-shot (instruct): few-shot examples live in the system message; greedy decoding,
# stopping on the tokenizer's EOS token id or the end-of-turn marker.
outputs = llm.generate(fewshot_chat_inputs, SamplingParams(temperature=0.0, 
                                              stop_token_ids=[tokenizer.eos_token_id], 
                                              stop=["<|eot_id|>"],
                                              max_tokens=1024))
# Predicted short answer = last number/ratio token in each generation.
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [02:32<00:00,  3.27it/s]


0.452