Evaluations with zero-shot, 5-shot, full fine-tune, merged LoRA, merged DoRA.

### Imports

In [1]:
import torch
from pathlib import Path
import os, json
from safetensors.torch import save_file
import copy
from tqdm import tqdm
import safetensors
import safetensors.torch
from glob import glob
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from hqq.core.quantize import HQQLinear, HQQBackend, BaseQuantizeConfig, Quantizer

from hqq.core.quantize import HQQLinear, BaseQuantizeConfig, Quantizer
from hqq.backends.torchao import patch_hqq_to_aoint4
from fastcore.script import *

from accelerate import init_empty_weights
import numpy as np


In [2]:
from datasets import load_from_disk

In [3]:
# Ask CUDA to enumerate GPUs in PCI bus order.
# NOTE(review): presumably this has to be set before CUDA is first initialised to take effect — confirm.
os.environ['CUDA_DEVICE_ORDER']="PCI_BUS_ID"

### Evaluate

In [3]:
import re
from datasets import load_dataset
from fastcore.parallel import parallel
from vllm import LLM, SamplingParams

In [4]:
def extract_last_number_or_ratio(s):
    """Return the last number-like token in ``s``, or ``None`` if there is none.

    A token is an optional currency symbol (``$``, ``€`` or ``£``), an integer
    or decimal number, and optionally a ``:<number>`` suffix so that ratios
    such as ``3:4`` come back as a single token. The matched text is returned
    unchanged, currency symbol included.

    The pattern does not match thousands separators or a leading minus sign,
    so ``"1,234"`` yields ``"234"`` and ``"-5"`` yields ``"5"``.
    """
    # All groups are non-capturing, so findall yields the full matched strings.
    number_like = re.compile(r'[\$€£]?\d+(?:\.\d+)?(?:\:\d+(?:\.\d+)?)?')
    found = number_like.findall(s)
    return found[-1] if found else None

In [5]:
def exact_match_score(preds, labels):
    """Return the fraction of predictions that are exactly equal to their label.

    Predictions and labels are paired positionally with ``zip``, so when
    ``labels`` is longer than ``preds`` the surplus labels are ignored. The
    denominator is always ``len(preds)``.

    Args:
        preds: Sized sequence of predicted answers (entries may be ``None``).
        labels: Iterable of reference answers aligned with ``preds``.

    Returns:
        The exact-match accuracy as a float, or ``0.0`` when ``preds`` is empty
        (the bare division used to raise ``ZeroDivisionError`` in that case).
    """
    total = len(preds)
    if total == 0:
        return 0.0
    return sum(p == g for p, g in zip(preds, labels)) / total

In [6]:
# Shuffle the train split with a fixed seed, then keep only the last 5000 shuffled examples.
# NOTE(review): presumably these 5000 were held out from fine-tuning — confirm against the training script.
dataset = load_dataset("microsoft/orca-math-word-problems-200k")['train'].shuffle(seed=42)
dataset = dataset.select(range(len(dataset)-5000,len(dataset)))
# Reference short answers: the last number/ratio found in each 'answer' string (None when nothing matches).
short_answers_gt = parallel(extract_last_number_or_ratio, dataset['answer'], progress=True)

In [7]:
# The first 500 of the retained examples form the evaluation subset.
valid_dataset = dataset.select(range(500))

In [8]:
# Plain (non-chat) prompts in the ###Question / ###Answer format, one per evaluation question.
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]

In [9]:
# Reference answers for the evaluation subset. valid_dataset is the first
# len(valid_dataset) rows of `dataset`, so take the same prefix of the
# ground-truth list instead of repeating the subset size as a literal.
labels = short_answers_gt[:len(valid_dataset)]

In [10]:
# Sanity check: there should be exactly one label per prompt.
len(inputs), len(labels)

(500, 500)

In [11]:
# Number of CUDA devices visible to this process.
NUM_GPUS = torch.cuda.device_count()
NUM_GPUS

1

In [12]:
# Hub id of the tokenizer: loaded below to build chat prompts and also passed to vLLM as `tokenizer=`.
TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [13]:
# Tokenizer instance whose chat template and eos_token_id are used when building prompts and stop conditions.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [14]:
def convert_to_chat_input(question):
    """Render ``question`` as a single-turn chat prompt string.

    The conversation is a fixed math-assistant system message followed by the
    question as the user turn. It is formatted with the module-level
    ``tokenizer``'s chat template; the result is returned as text
    (``tokenize=False``) with the generation prompt appended.
    """
    system_turn = {"role": "system", "content": "You are an AI assistant that excels in solving math problems."}
    user_turn = {"role": "user", "content": question}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

In [15]:
# Chat-template prompts, one per evaluation question and in the same order.
chat_inputs = list(map(convert_to_chat_input, valid_dataset['question']))

In [16]:
# Sanity check: one chat prompt per evaluation question.
len(chat_inputs)

500

#### FINETUNED 

In [17]:
# Root directory that the relative checkpoint names below are joined onto.
# NOTE(review): hard-coded absolute path — adjust for other machines.
model_dir = "/workspace/models/"

In [18]:
# TODO: Don't cast to float16 for bitblas

In [19]:
# NOTE: 4/2 bit models become very repetitive


# bf16 model # 0.60

# 4bit HQQ (skip_dora_all)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_250/vllm_tinygemm" # 0.578

# 4bit HQQ+DORA
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_250/vllm_tinygemm" # 0.593
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_1000/vllm_tinygemm" # 0.610

# 4/2bit HQQ+DORA
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_250/vllm_bitblas" # 0.290
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_1000/vllm_bitblas" # 0.380

# 4/2bit HQQ+DORA (merged)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_250/merged" # 0.242
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_1000/merged" # 0.350

# 4bit HQQ+DORA+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit-ln/step_250/vllm_tinygemm" # 0.562
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit-ln/step_1000/vllm_tinygemm" # 0.585

# 4bit HQQ+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4bit-ln/step_250/vllm_tinygemm" # 0.605
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4bit-ln/step_1000/vllm_tinygemm" # 0.55

# 4/2bit HQQ+DORA+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_250/vllm_bitblas" # 0.270
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/vllm_bitblas" # 0.42

# 4/2bit HQQ+DORA+LN (merged)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_250/merged" # 0.270
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/merged" # 0.343

# 4(HQQ)bit 2(HQQ+DORA)bit
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit/step_250/vllm_bitblas" # 0.257
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit/step_1000/vllm_bitblas" # 0.330

# 4(HQQ)bit 2(HQQ+DORA)bit+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit-ln/step_250/vllm_bitblas" # 0.265
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit-ln/step_1000/vllm_bitblas" # 0.375

# 4(HQQ)bit 2(HQQ)bit+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-hqq-2bit-ln/step_250/vllm_bitblas" # 0.02
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-hqq-2bit-ln/step_1000/vllm_bitblas" # 0.03


# Model generations become more repetitive with more training steps at temp=0.0.
# Frequency penalty helps with this, but needs more investigation.

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.26

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.24
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.31

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.29

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.30
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.35

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.26
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.35

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.27
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.36

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.20
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.32

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.27
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.30

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.23
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.34



# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.29

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # -
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.27

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.0
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.0

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.29
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.45

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.27
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.37

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.26
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.27

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.28
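# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.32 (NOTE: path says lora_rank-64 inside the rank-256 group and the score repeats the rank-64 entry — verify)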

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.27
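# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-64-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.30 (NOTE: path says lora_rank-64 inside the rank-256 group and the score repeats the rank-64 entry — verify)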

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.30



# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.26

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.19
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.07

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.02
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-1e-4-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.07

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.28
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.32

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.41

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.21
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-5e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.20

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_125/vllm_bitblas" # 0.25
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-10-train_layernorms-true/step_250/vllm_bitblas" # 0.31

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_125/vllm_bitblas" # 0.23
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-3-train_layernorms-true/step_250/vllm_bitblas" # 0.36

# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_125/vllm_bitblas" # 0.26
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-512-base_lr-2e-5-lr_div_factor-1-train_layernorms-true/step_250/vllm_bitblas" # 0.31



# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_125/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_250/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_375/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_500/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_125/merged" # 0.34
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_250/merged" # 0.48
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_375/merged" # 0.
# NOTE(review): this assignment has no effect — MODEL_NAME is reassigned further down in this same cell before it is read.
MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence/step_500/merged" # 0.



# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_125/vllm_bitblas" # 0.22
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_250/vllm_bitblas" # 0.32
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_375/vllm_bitblas" # 0.32
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq/step_500/vllm_bitblas" # 0.33


# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_125/vllm_bitblas" # 0.28
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_250/vllm_bitblas" # 0.39
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_375/vllm_bitblas" # 0.41
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-loftq-block-influence/step_500/vllm_bitblas" # 0.414


# 4 (128) bit 2 (32) bit not adj 20%
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_125/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_125/merged" # 0.39
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_250/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_250/merged" # 0.45
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_375/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_375/merged" # 0.41
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_500/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_500/merged" # 0.51
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_625/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_625/merged" # 0.49
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_750/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_750/merged" # 0.57
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_875/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_875/merged" # 0.5
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_1000/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_1000/merged" # 0.53

# # 4 (128) bit 2 (32) bit adj 20%
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_125/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_125/merged" # 0.44
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_250/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_250/merged" # 0.36
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_375/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_375/merged" # 0.43
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_500/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-20pct/step_500/merged" # 0.45

# # 4 (128) bit 2 (32) bit not adj 30%
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-30pct/step_125/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-30pct/step_125/merged" # 0.41
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-30pct/step_250/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-30pct/step_250/merged" # 0.48

# # 4 (128) bit 2 (32) bit adj 30%
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-30pct/step_125/vllm_bitblas" # 0.
# MODEL_NAME = "llama-3-1-8b-instruct-dora-4-2bit-lora_rank-256-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-adj-30pct/step_125/merged" # 0.


# MODEL_NAME = "llama-3-1-8b-dora-ablations/no_adj_20_pct_step_750/merged" # 0.54
# MODEL_NAME = "llama-3-1-8b-dora-ablations/no_adj_20_pct_step_750/vllm_bitblas" # 0.53 worked

# MODEL_NAME = "llama-3-1-8b-dora-ablations/adj_20_pct_step500/merged" # 0.46
# MODEL_NAME = "llama-3-1-8b-dora-ablations/adj_20_pct_step500/vllm_bitblas" # 0.46 worked


# Checkpoint under evaluation: the relative name is joined onto the local model directory.
MODEL_NAME = os.path.join(
    model_dir,
    "llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_875/merged",
)


# MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [20]:
# Overrides the device count detected earlier with torch.cuda.device_count(); used as tensor_parallel_size below.
NUM_GPUS = 1

In [21]:
# Build the vLLM engine for the selected checkpoint. The tokenizer is passed
# explicitly rather than being read from the checkpoint directory.
# The commented-out arguments are alternative settings kept for other runs.
llm = LLM(
    model=MODEL_NAME,
    tokenizer=TOKENIZER,
    tensor_parallel_size=NUM_GPUS,
    max_model_len=8192,
    # quantization="bitblas",
    # dtype="float16",
    dtype="bfloat16",
    gpu_memory_utilization=0.8,
    # enforce_eager=True,
)

INFO 08-30 15:00:16 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/workspace/models/llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_875/merged', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/workspace/models/llama-3-1-8b-instruct-dora-4-2bit-gs-32-l

INFO 08-30 15:00:16 model_runner.py:720] Starting to load model /workspace/models/llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct/step_875/merged...


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


Loading model.embed_tokens.weight
Loaded model.embed_tokens.weight
Loading model.layers.0.input_layernorm.weight
Loaded model.layers.0.input_layernorm.weight
Loading model.layers.0.mlp.down_proj.weight
Loaded model.layers.0.mlp.down_proj.weight
Loading model.layers.0.mlp.gate_proj.weight
Loaded model.layers.0.mlp.gate_up_proj.weight
Loading model.layers.0.mlp.up_proj.weight
Loaded model.layers.0.mlp.gate_up_proj.weight
Loading model.layers.0.post_attention_layernorm.weight
Loaded model.layers.0.post_attention_layernorm.weight
Loading model.layers.0.self_attn.k_proj.weight
Loaded model.layers.0.self_attn.qkv_proj.weight
Loading model.layers.0.self_attn.o_proj.weight
Loaded model.layers.0.self_attn.o_proj.weight
Loading model.layers.0.self_attn.q_proj.weight
Loaded model.layers.0.self_attn.qkv_proj.weight
Loading model.layers.0.self_attn.v_proj.weight
Loaded model.layers.0.self_attn.qkv_proj.weight
Loading model.layers.1.input_layernorm.weight
Loaded model.layers.1.input_layernorm.weight

In [22]:
# base model
# outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))

In [35]:
# chat model
# Greedy decoding on the first 128 chat prompts only.
outputs = llm.generate(chat_inputs[:128], SamplingParams(temperature=0.0, max_tokens=1024, stop=["<|eot_id|>"]))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
# Slice the labels to the number of predictions explicitly, instead of relying
# on zip() inside exact_match_score to silently drop the surplus labels.
exact_match_score(short_answers_pred, labels[:len(short_answers_pred)])

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 128/128 [00:19<00:00,  6.43it/s, est. speed input: 695.78 toks/s, output: 1655.81 toks/s] 


0.5859375

In [36]:
# Text of the first returned completion for every prompt, in prompt order.
output_texts = [completion.outputs[0].text for completion in outputs]

In [46]:
# Inspect a single generation (index 10) by eye.
print(output_texts[10])

Let's break down the problem step by step:

1. Jake initially has 120 bitcoins.
2. He invests 40 bitcoins into a venture that doubles his investment. So, he now has 120 + 40 = 160 bitcoins.
3. He donates 25 bitcoins to charity. So, he now has 160 - 25 = 135 bitcoins.
4. He gives half of all the bitcoins in his possession to his brother. So, he gives 135 / 2 = 67.5 bitcoins to his brother. Since we can't have half a bitcoin, we'll round down to 67 bitcoins. So, he now has 135 - 67 = 68 bitcoins.
5. He takes back 5 bitcoins from his brother as payment for an outstanding debt. So, he now has 68 + 5 = 73 bitcoins.
6. He quadruples the number of bitcoins he has. So, he now has 73 * 4 = 292 bitcoins.
7. He donates another 15 bitcoins to a different charity. So, he now has 292 - 15 = 277 bitcoins.

Jake now has 277 bitcoins.


In [30]:
# Qualitative check on a non-math prompt (code generation), using the same chat template.
chat_input = [convert_to_chat_input("Write python code for image classification inference using fastai library and give a brief intro about the library. Use import `from fastai.vision.all import *`")]
# Greedy decoding (temperature=0.0) with both repetition penalties explicitly set to zero.
outputs = llm.generate(chat_input, SamplingParams(temperature=0.0, max_tokens=1024, stop=["<|eot_id|>"],
                                                  frequency_penalty=0., presence_penalty=0.0))
print(outputs[0].outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.94s/it, est. speed input: 20.57 toks/s, output: 77.46 toks/s]

Fastai is a Python library that provides a high-level interface for deep learning and computer vision tasks. It is built on top of popular deep learning frameworks such as PyTorch and TensorFlow, and provides a simple and intuitive API for training and deploying models. Fastai is particularly useful for image classification tasks, as it provides a range of pre-built models and pre-processing techniques that can be used to train models on large datasets.

Here is an example of how to use fastai to perform image classification inference on a pre-trained model:
```python
from fastai.vision.all import *
# Load the pre-trained model
model = load_model('path/to/model.h5')
# Load the input image
image = load_image('path/to/image.jpg')
# Make predictions on the image
predictions = predict(model, image)
# Print the predictions
print(predictions)
```
In this example, we first load the pre-trained model using the `load_model` function. We then load the input image using the `load_image` function.




In [32]:
# Qualitative check on a factual-recall prompt.
# NOTE(review): the prompt text contains the typo "Inclde"; it is part of the input sent to the model, so it is left as is.
chat_input = [convert_to_chat_input("List top 10 scientists from the 15th (1400-1500) century as numbered list with their dob and death. Inclde da vinci.")]
# Greedy decoding capped at 256 new tokens, penalties explicitly zero.
outputs = llm.generate(chat_input, SamplingParams(temperature=0.0, max_tokens=256, stop=["<|eot_id|>"],
                                                  frequency_penalty=0.0, presence_penalty=0.0))
print(outputs[0].outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it, est. speed input: 52.96 toks/s, output: 77.84 toks/s]

1. Leonardo da Vinci (1452-1516)
2. Galileo Galilei (1561-1632)
3. Johannes Kepler (1572-1630)
4. Isaac Newton (1642-1727)
5. Antoni van Leeuwenhoek (1632-1700)
6. Robert Boyle (1645-1661)
7. William Shakespeare (1564-1616)
8. William Harvey (1572-1657)
9. William Cullen (1770-1800)
10. Joseph Black (1723-1793)





#### N-SHOT

In [None]:
# Baseline model for the zero-/few-shot runs; rebinds both MODEL_NAME and llm.
# NOTE(review): this builds a second vLLM engine in the same kernel while the earlier `llm` may still hold GPU memory — presumably the kernel was restarted in between; confirm.
# NOTE(review): this loads Meta-Llama-3-8B-Instruct, whereas TOKENIZER (used to build chat_inputs) is Meta-Llama-3.1-8B-Instruct — verify the mismatch is intended.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = LLM(model=MODEL_NAME, tensor_parallel_size=NUM_GPUS, dtype="bfloat16")

In [17]:
# zero-shot
# Plain ###Question / ###Answer prompts (no chat template), greedy decoding.
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))
# Score the last number/ratio of each completion against the reference short answers.
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:20<00:00,  6.18it/s]


0.228

In [17]:
# zero-shot (instruct)
# Same evaluation on the chat-template prompts. No stop string is passed here,
# unlike the fine-tuned chat run, which passes stop=["<|eot_id|>"].
outputs = llm.generate(chat_inputs, SamplingParams(temperature=0.0, max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:07<00:00,  7.41it/s]


0.454

In [None]:
# 5-shot
# Demonstrations are the last 5 rows of `dataset`; the evaluated questions come from
# valid_dataset (the first 500 rows), so the demonstrations are not among them.
# Each demonstration ends with the literal "<stop>", which is also used as a stop string below.
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>" for ex in 
                     dataset.select(range(len(dataset)-5,len(dataset)))]
few_shot_prompt = "\n\n".join(few_shot_examples)
# Prepend the same five demonstrations to every evaluation question.
inputs = [few_shot_prompt + "\n\n" + f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, 
                                              stop_token_ids=[tokenizer.eos_token_id], 
                                              stop=["<stop>"], 
                                              max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts:   9%|▉         | 47/500 [01:06<02:45,  2.73it/s] 

In [30]:
# Five demonstrations taken from the last 5 rows of `dataset`, each terminated with "<stop>".
# This is the same expression as in the 5-shot cell above.
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>" for ex in 
                     dataset.select(range(len(dataset)-5,len(dataset)))]

In [32]:
# Demonstrations joined by a blank line; read as a global by fewshot_chat_input.
few_shot_prompt = "\n\n".join(few_shot_examples)

In [37]:
def fewshot_chat_input(question, answer=None):
    """Render ``question`` as a chat prompt whose system message carries the few-shot demos.

    The system message is the math-assistant instruction followed by the
    module-level ``few_shot_prompt``; ``question`` becomes the user turn. The
    conversation is formatted with the module-level ``tokenizer``'s chat
    template and returned as text with the generation prompt appended.

    ``answer`` is accepted but not used by this function.
    """
    system_prompt = f"You are an AI assistant that excels in solving math problems. Here are few examples of math problems:\n\n{few_shot_prompt}"
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=question),
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [38]:
# Few-shot chat prompts, one per evaluation question and in the same order.
fewshot_chat_inputs = list(map(fewshot_chat_input, valid_dataset['question']))

In [40]:
# Few-shot evaluation with chat-template prompts: greedy decoding, stopping on the
# tokenizer's EOS token id or the "<|eot_id|>" string.
outputs = llm.generate(fewshot_chat_inputs, SamplingParams(temperature=0.0, 
                                              stop_token_ids=[tokenizer.eos_token_id], 
                                              stop=["<|eot_id|>"],
                                              max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [02:32<00:00,  3.27it/s]


0.452

### LMSYS Generation Outputs

In [41]:
# Directory holding the saved generation JSON files that are loaded below.
# NOTE(review): hard-coded absolute path — adjust for other machines.
generations_path = Path("/workspace/git/kerem_research/evaluation_benchmarking/results/lmsys_hard_generations")

In [42]:
# Load the question file: one JSON object per line. The context manager closes
# the handle, which the bare open(...).readlines() call never did.
with open("/workspace/git/kerem_research/evaluation_benchmarking/misc/lmsys_arena_hard_question.jsonl") as f:
    questions = [json.loads(line) for line in f]

In [43]:
def _load_generations(filename):
    """Parse one JSON file from ``generations_path`` and close the file handle afterwards."""
    with open(generations_path/filename) as f:
        return json.load(f)

# gens_dict is the reference model; gens_dict1..5 are checkpoints of the same run at increasing steps.
gens_dict = _load_generations("llama_3_1_8b_instruct.json")
gens_dict1 = _load_generations("llama_3_1_8b_instruct_4_2_gs32_lora_rank64_bi20pct_step_125.json")
gens_dict2 = _load_generations("llama_3_1_8b_instruct_4_2_gs32_lora_rank64_bi20pct_step_625.json")
gens_dict3 = _load_generations("llama_3_1_8b_instruct_4_2_gs32_lora_rank64_bi20pct_step_750.json")
gens_dict4 = _load_generations("llama_3_1_8b_instruct_4_2_gs32_lora_rank64_bi20pct_step_875.json")
gens_dict5 = _load_generations("llama_3_1_8b_instruct_4_2_gs32_lora_rank64_bi20pct_step_1000.json")

In [44]:
# Pick one question by index and show the text of its first turn.
idx = 0
print(questions[idx]['turns'][0]['content'])

Use ABC notation to write a melody in the style of a folk tune.


In [45]:
# bf16 model
# Response stored at position idx in the reference generations file.
print(gens_dict[idx]['response'])

Here's a simple folk-style melody in ABC notation:

X:1
T:Simple Folk Tune
M:4/4
L:1/4
K:C

C4 E4 G4 C4 | E4 G4 A4 G4 | F4 E4 D4 C4 | E4 G4 A4 G4 |
C4 E4 G4 C4 | E4 G4 A4 G4 | F4 E4 D4 C4 | E4 G4 A4 G4 |

In this notation:

- X:1 indicates the first line of the tune.
- T:Simple Folk Tune is the title of the tune.
- M:4/4 indicates the time signature (4/4 means four beats per measure).
- L:1/4 indicates the note length (in this case, a quarter note).
- K:C indicates the key (C major).
- The lines after the key indicate the melody, with each note separated by a space.

This melody is a simple folk-style tune with a repeating pattern. You can play it on a instrument or sing it to get a feel for the melody.


In [36]:
# 4/2 bit step 125
# NOTE(review): the response printed for this index is a C++ program, not ABC notation,
# so this file's entries may not be index-aligned with `questions` — verify before comparing.
print(gens_dict1[idx]['response'])

Here's a C++ solution to the problem:

```cpp
#include <iostream>
#include <string>
using namespace std;

int main() {
    int t;
    cin >> t;
    for (int i = 0; i < t; i++) {
        string s;
        cin >> s;
        string ans = "NO";
        for (int j = 0; j < 3; j++) {
            if (s[j] == 'a') {
                ans = "YES";
                break;
            }
            else if (s[j] == 'b') {
                ans = "YES";
                break;
            }
            else if (s[j] == 'c') {
                ans = "YES";
                break;
            }
        }
        cout << ans << endl;
    }
    return 0;
}
```

The code uses the `cin` statement to read the input string `s` from the user. It then checks if the character at the current index `j` is equal to 'a', 'b', or 'c'. If it is, it sets the `ans` variable to "YES" and breaks out of the loop. If none of the conditions are met, it sets `ans` to "NO". Finally, it outputs the value of `ans` to the console.


In [37]:
# 4/2 bit step 625
# NOTE(review): this cell's execution count (In [37]) predates the `idx = 0` cell (In [44]),
# and the saved output answers a C++ task rather than the ABC-notation prompt shown for idx 0,
# so the stored output is likely stale -- re-run top to bottom to confirm.
print(gens_dict2[idx]['response'])

#include <iostream>
#include <string>
#include <algorithm>

using namespace std;

int main() {
    int t;
    cin >> t;
    for (int i = 0; i < t; i++) {
        string s;
        cin >> s;
        if (s == "abc") {
            cout << "YES" << endl;
        } else {
            int a = 0, b = 0, c = 0;
            for (int j = 0; j < 3; j++) {
                char ch = s[j];
                if (ch == 'a') {
                    a++;
                } else if (ch == 'b') {
                    b++;
                } else if (ch == 'c') {
                    c++;
                }
            }
            if (a == 1 && b == 1 && c == 1) {
                cout << "YES" << endl;
            } else {
                cout << "NO" << endl;
            }
        }
    }
    return 0;
}


In [38]:
# 4/2 bit step 750
# NOTE(review): this cell's execution count (In [38]) predates the `idx = 0` cell (In [44]),
# and the saved output answers a C++ task rather than the ABC-notation prompt shown for idx 0,
# so the stored output is likely stale -- re-run top to bottom to confirm.
print(gens_dict3[idx]['response'])

#include <iostream>
#include <string>
#include <algorithm>

using namespace std;

int main() {
    int t;
    cin >> t;

    for (int i = 0; i < t; i++) {
        string s;
        cin >> s;

        if (s == "abc") {
            cout << "YES\n";
        } else {
            if (s == "acb") {
                cout << "YES\n";
            } else if (s == "bac") {
                cout << "YES\n";
            } else if (s == "bca") {
                cout << "YES\n";
            } else if (s == "cab") {
                cout << "YES\n";
            } else if (s == "cba") {
                cout << "YES\n";
            } else {
                cout << "NO\n";
            }
        }
    }

    return 0;
}


In [39]:
# 4/2 bit step 875
# NOTE(review): this cell's execution count (In [39]) predates the `idx = 0` cell (In [44]),
# and the saved output answers a C++ task rather than the ABC-notation prompt shown for idx 0,
# so the stored output is likely stale -- re-run top to bottom to confirm.
print(gens_dict4[idx]['response'])

#include <iostream>
#include <string>
using namespace std;

int main() {
    int t;
    cin >> t;
    for (int i = 0; i < t; i++) {
        string s;
        cin >> s;
        if (s == "abc") {
            cout << "YES" << endl;
        } else if (s == "acb") {
            cout << "YES" << endl;
        } else if (s == "bac") {
            cout << "YES" << endl;
        } else if (s == "bca") {
            cout << "YES" << endl;
        } else if (s == "cab") {
            cout << "YES" << endl;
        } else if (s == "cba") {
            cout << "YES" << endl;
        } else {
            cout << "NO" << endl;
        }
    }
    return 0;
}


In [40]:
# 4/2 bit step 1000
# NOTE(review): this cell's execution count (In [40]) predates the `idx = 0` cell (In [44]),
# and the saved output answers a C++ task rather than the ABC-notation prompt shown for idx 0,
# so the stored output is likely stale -- re-run top to bottom to confirm.
print(gens_dict5[idx]['response'])

#include <iostream>
#include <string>
#include <algorithm>

using namespace std;

int main() {
    int t;
    cin >> t;
    for (int i = 0; i < t; i++) {
        string s;
        cin >> s;
        if (s == "abc") {
            cout << "YES" << endl;
        } else if (s == "acb") {
            cout << "YES" << endl;
        } else if (s == "bac") {
            cout << "YES" << endl;
        } else if (s == "bca") {
            cout << "NO" << endl;
        } else if (s == "cab") {
            cout << "YES" << endl;
        } else if (s == "cba") {
            cout << "YES" << endl;
        } else {
            cout << "NO" << endl;
        }
    }
    return 0;
}


### Per Token Accuracy

In [1]:
import torch
import safetensors
import shutil
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, load_from_disk
import numpy as np
from fastcore.all import *
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)


@patch
def ls_names(self:Path):
    "Names (the `.name` attribute) of the entries returned by `self.ls()`, wrapped in an `L`."
    entry_names = [entry.name for entry in self.ls()]
    return L(entry_names)
@patch
def glob_names(self:Path, pattern:str):
    "Names (the `.name` attribute) of the paths matched by `self.glob(pattern)`, wrapped in an `L`."
    matched_names = [match.name for match in self.glob(pattern)]
    return L(matched_names)

In [2]:
# Root directory of locally stored models.
models_dir = Path("/workspace/models")
# Training-run directory; the listing in the next cell shows it contains one `step_<N>` sub-directory per checkpoint.
model_prefix = models_dir/"llama-3-1-8b-instruct-dora-4-2bit-gs-32-lora_rank-64-base_lr-5e-5-lr_div_factor-10-train_layernorms-true-block-influence-no-adj-20pct"

In [3]:
# List the entry names inside the run directory.
model_prefix.ls_names()

(#8) ['step_125','step_250','step_375','step_500','step_625','step_750','step_875','step_1000']

In [4]:
# Hub identifier of the original model; used below to locate its cached snapshot and to load the tokenizer.
original_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [5]:
# Locate the snapshot directory of the original model inside the local Hugging Face cache.
# NOTE(review): `.ls()[0]` takes whichever snapshot is listed first; if the cache holds
# more than one revision the choice is arbitrary -- confirm only one snapshot exists.
original_model_dir = Path(f"/workspace/.cache/huggingface/hub/models--{original_model_name.replace('/', '--')}/snapshots").ls()[0]
# list json files in original model directory.
original_model_dir.glob_names("*.json")

(#6) ['model.safetensors.index.json','config.json','tokenizer_config.json','tokenizer.json','special_tokens_map.json','generation_config.json']

In [6]:
# Load the tokenizer
# It is loaded by the original model's name, not from a fine-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(original_model_name)

In [9]:
# The JSON files (config, tokenizer, generation config, ...) come from the original model snapshot.
json_files_source_dir = original_model_dir

# Make each checkpoint loadable with `AutoModelForCausalLM.from_pretrained`:
# every shard is rewritten in place with `format: pt` metadata and the JSON files are copied next to it.
STEPS = [125]
for step in STEPS:
	model_path = model_prefix/f"step_{step}/merged_hqq_only"
	weight_files = [*model_path.glob("*.safetensors")]
	print(f"Found {len(weight_files)} weight files.")
	for shard_path in weight_files:
		# Round-trip the shard so it is saved again with the metadata attached.
		shard_tensors = safetensors.torch.load_file(shard_path)
		safetensors.torch.save_file(shard_tensors, model_path/shard_path.name, metadata={'format': 'pt'})
	# Copy all JSON files from source to destination
	for source_json in json_files_source_dir.glob("*.json"):
		shutil.copy(source_json, model_path)
	print("Done.")

Found 4 weight files.
Done.


In [13]:
# Checkpoint step to evaluate (set independently of the STEPS list used in the conversion cell).
step = 125
# Load the `merged_hqq_only` checkpoint for that step in bfloat16 and move it to the GPU.
model = AutoModelForCausalLM.from_pretrained(model_prefix/f"step_{step}/merged_hqq_only", torch_dtype="bfloat16").cuda()
# Alternative: load the `merged` checkpoint directory instead.
# model = AutoModelForCausalLM.from_pretrained(model_prefix/f"step_{step}/merged", torch_dtype="bfloat16").cuda()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# original_model = AutoModelForCausalLM.from_pretrained(original_model_name, torch_dtype="bfloat16").cuda()

In [50]:
# Prompt variants tried earlier, kept for reference:
# input_text = "Use ABC notation to write a melody in the style of a folk tune."
# input_text = "Use ABC notation to write a melody in the style of a folk tune. Don't be repetitive."
# input_text = "Use ABC notation to write a melody in the style of a folk tune. Don't be repetitive. You can do it in 128 characters."  # Replace with your actual prompt
input_text = "Write python code for image classification inference using fastai library and give a brief intro about the library. Use import `from fastai.vision.all import *`"
system_prompt = "You are useful AI assistant."
# NOTE(review): the saved output renders the prefill as "```python " without the newline,
# so the chat template appears to trim message content -- confirm if the trailing "\n" matters.
assistant_prefill = "```python\n"
# Apply chat template to the input
messages = [{"role":"user", "content":input_text}]
if system_prompt is not None: messages = [{"role":"system", "content":system_prompt}] + messages
if assistant_prefill is not None: messages += [{"role":"assistant", "content":assistant_prefill}]
# With a prefill the assistant turn is already part of `messages`, so no generation prompt is added.
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=assistant_prefill is None, return_tensors="pt", tokenize=True)
if assistant_prefill is not None: inputs = inputs[0][:-1].unsqueeze(0) # exclude eos token.
inputs = inputs.to(model.device)
# Single unpadded sequence: every position is a real token, so the mask is all ones.
# Passing it (and an explicit pad token) removes the "attention mask and the pad token id
# were not set" warning and makes the generation inputs unambiguous.
attention_mask = torch.ones_like(inputs)

# Greedy decoding; sampling parameters are explicitly cleared.
output_sequences = model.generate(inputs, attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id, max_length=512, do_sample=False, temperature=None, top_p=None, top_k=None)  # top_k=0, top_p=0.95, temperature=0.8
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=False)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are useful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Write python code for image classification inference using fastai library and give a brief intro about the library. Use import `from fastai.vision.all import *`<|eot_id|><|start_header_id|>assistant<|end_header_id|>

```python 1.8 * 1.9 = 1.9 * 1.8; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1.9; 1.9 = 1.8 * 1


In [45]:
def prepare_batch(eval_ds, bidx, batch_size):
	"""Tokenize and right-pad one batch of prompt/answer examples.

	Args:
		eval_ds: indexable collection (supporting `len`) of dicts with
			'input_text' (already chat-templated prompt) and 'output_text' (answer).
		bidx: index of the first example of the batch.
		batch_size: maximum number of examples to take; a trailing batch that
			runs past the end of `eval_ds` is returned short.

	Returns:
		input_ids: LongTensor (n_examples, max_len), padded on the right with
			`tokenizer.eos_token_id`.
		answer_lengths: per-example answer token count, including the appended EOS.
		token_lengths: per-example unpadded length (prompt + answer + EOS).

	Raises:
		IndexError: if `bidx` is at or beyond the end of `eval_ds`.
	"""
	if bidx >= len(eval_ds):
		raise IndexError(f"batch start {bidx} is out of range for a dataset of size {len(eval_ds)}")
	input_ids = []
	answer_lengths = []
	token_lengths = []
	# Clamp the end so the last, partial batch does not index past the dataset.
	batch_end = min(bidx + batch_size, len(eval_ds))
	for i in range(bidx, batch_end):
		# Fetch the row once; each lookup may be costly for dataset-backed collections.
		example = eval_ds[i]
		# chat templated input and answer.
		user_prompt, answer = example['input_text'], example['output_text']
		prompt_tokens = tokenizer.encode(user_prompt, add_special_tokens=False)
		# EOS is appended so the model is also scored on ending the answer.
		answer_tokens = tokenizer.encode(answer, add_special_tokens=False) + [tokenizer.eos_token_id]
		tokens = torch.tensor(prompt_tokens + answer_tokens)
		input_ids.append(tokens)
		token_lengths.append(len(tokens))
		answer_lengths.append(len(answer_tokens))
	input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.eos_token_id)
	return input_ids, answer_lengths, token_lengths

In [46]:
#| test
mock_eval_ds = [
	{'input_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
  	 'output_text': 'AI stands for Artificial Intelligence.'},
	{'input_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nDefine machine learning.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
  	 'output_text': 'Machine learning is a subset of AI.'},
	{'input_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExplain deep learning.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
  	 'output_text': 'Deep learning is a type of machine learning.'}]

# Collate the three mock examples into a single batch.
input_ids, answer_lengths, token_lengths = prepare_batch(mock_eval_ds, 0, 3)

# Expected answer length: the answer's tokens plus one appended EOS.
gt_answer_lengths = []
for ex in mock_eval_ds:
	gt_answer_lengths.append(len(tokenizer.encode(ex['output_text'], add_special_tokens=False)) + 1)
assert answer_lengths == gt_answer_lengths

# Expected total length: prompt and answer tokenized together, plus one EOS.
gt_token_lengths = []
for ex in mock_eval_ds:
	gt_token_lengths.append(len(tokenizer.encode(ex['input_text']+ex['output_text'], add_special_tokens=False)) + 1)
assert token_lengths == gt_token_lengths

# The batch is padded to the longest example.
assert max(token_lengths) == input_ids.shape[1]

# Dropping the padding and keeping the answer tail must decode back to answer + EOS.
for row_ids, n_tokens, n_answer, ex in zip(input_ids, token_lengths, answer_lengths, mock_eval_ds):
	label_ids = row_ids[:n_tokens][-n_answer:]
	decoded_answer = tokenizer.decode(label_ids)
	assert decoded_answer == (ex['output_text'] + tokenizer.eos_token)

In [47]:
import torch
import torch.nn.functional as F

def eval_model(model, eval_ds, batch_size=16):
    """Score a causal LM on the answer tokens of each evaluation example.

    For every example the model is run on prompt + answer (teacher forcing), and
    only the answer positions (including the appended EOS) are scored.

    Args:
        model: causal LM whose forward call returns an object with `.logits`.
        eval_ds: dataset accepted by `prepare_batch`.
        batch_size: number of examples per forward pass. Examples beyond the
            last full batch are skipped.

    Returns:
        accuracies: list of per-example fractions of answer tokens whose greedy
            prediction matches the reference token.
        perplexities: list of per-example perplexities, i.e. exp of the mean
            negative log-likelihood over that example's answer tokens.
    """
    # make num samples divisible by batch size
    num_samples = (len(eval_ds) // batch_size) * batch_size
    model.eval()
    # Send inputs to wherever the model lives rather than assuming CUDA.
    device = next(model.parameters()).device
    accuracies = []
    perplexities = []
    for bidx in tqdm(range(0, num_samples, batch_size)):
        input_ids, answer_lengths, token_lengths = prepare_batch(eval_ds, bidx, batch_size)
        with torch.no_grad():
            logits = model(input_ids.to(device), labels=None, attention_mask=None, past_key_values=None).logits

        for row, (input_id, answer_len, token_len) in enumerate(zip(input_ids, answer_lengths, token_lengths)):
            # Reference tokens: the answer tail of the unpadded sequence.
            label = input_id[:token_len][-answer_len:]
            # The logit at position i predicts token i+1, so the window is shifted left by one.
            # Slice before moving/upcasting so only the answer positions are transferred, and
            # upcast to float32 because log-softmax over the vocabulary is imprecise in bfloat16.
            answer_logits = logits[row, :(token_len-1)][-answer_len:].float().cpu()
            pred = answer_logits.argmax(dim=-1)
            log_prob = F.log_softmax(answer_logits, dim=-1)

            # Compute per token accuracy.
            acc = (pred == label).float().mean()
            accuracies.append(acc.item())

            # Compute perplexity.
            token_log_probs = log_prob[torch.arange(len(label)), label]
            avg_neg_log_prob = -token_log_probs.mean()
            perplexity = torch.exp(avg_neg_log_prob)
            perplexities.append(perplexity.item())

    return accuracies, perplexities

In [48]:
# Load the saved evaluation split from disk and show how many examples it holds.
eval_ds = load_from_disk("/workspace/data/llama_large_mix_dataset_v0_dedup_eval"); len(eval_ds)

736

In [49]:
# Both returned values are per-example lists (`perplexity` holds one value per example).
accuracies, perplexity = eval_model(model, eval_ds)
# Reported figures are plain means over examples; the second is the mean of per-example
# perplexities, not a corpus-level perplexity.
np.mean(accuracies), np.mean(perplexity)

  0%|          | 0/46 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 46/46 [02:23<00:00,  3.12s/it]


(0.6123960490702934, 48.19222815091843)