In [1]:
import argparse
import json
import os
import time
import math
import numpy as np
import pandas as pd
import re

import pandas as pd
import tensor_parallel as tp
import torch
from torch.distributions import Categorical
from tqdm import tqdm, tqdm_notebook
from scipy.stats import entropy
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM
from  transformers.generation.logits_process import LogitsProcessorList
from  transformers.generation.stopping_criteria import StoppingCriteriaList, MaxLengthCriteria
import transformers
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# MMLU subject identifiers; the driver loop appends "_dev.csv" / "_test.csv"
# to each one to locate its data files.
TASKS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]

# Letter labels for the answer options; label j goes with option column j + 1.
choices = ["A", "B", "C", "D"]

def compute_metric(output_filename):
    """Print per-task and overall accuracy for a saved results file.

    Args:
        output_filename: Path to a JSON file mapping each task name to a dict
            holding parallel ``pred_answers`` and ``gold_answers`` lists.

    Returns:
        The overall accuracy as a float, or ``nan`` when the file contains no
        gold answers at all.
    """
    with open(output_filename, 'r') as f:
        run_results = json.load(f)

    total_correct = 0
    total_num = 0
    for task, result in run_results.items():
        gold_answers = result['gold_answers']
        correct = sum(
            pred == gold
            for pred, gold in zip(result['pred_answers'], gold_answers)
        )
        # A task without gold answers would otherwise raise ZeroDivisionError.
        task_acc = correct / len(gold_answers) if gold_answers else float('nan')
        print("ACC-%s: %.4f" % (task, task_acc))
        total_correct += correct
        total_num += len(gold_answers)

    overall = total_correct / total_num if total_num else float('nan')
    print("ACC-all: %.4f" % overall)
    return overall


def format_subject(subject):
    """Turn an underscore-separated subject id into space-prefixed words.

    Every word, including the first, is preceded by a single space, so
    ``"abstract_algebra"`` becomes ``" abstract algebra"``.
    """
    return "".join(" " + word for word in subject.split("_"))

def format_example(df, idx, include_answer=True):
    """Render row ``idx`` of ``df`` as one multiple-choice question.

    Column 0 supplies the question text, the next ``df.shape[1] - 2`` columns
    the options (labelled from ``choices``) and the last column the answer.

    Args:
        df: Table indexed positionally through ``df.iloc``.
        idx: Row position to render.
        include_answer: When true, append the answer and a blank line after
            ``"Answer:"``; otherwise the text ends right after ``"Answer:"``.

    Returns:
        The formatted question string.
    """
    n_options = df.shape[1] - 2
    pieces = [df.iloc[idx, 0]]
    for col in range(1, n_options + 1):
        pieces.append("\n{}. {}".format(choices[col - 1], df.iloc[idx, col]))
    pieces.append("\nAnswer:")
    if include_answer:
        pieces.append(" {}\n\n".format(df.iloc[idx, n_options + 1]))
    return "".join(pieces)

def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot prefix: a subject header followed by solved examples.

    Args:
        train_df: Table of solved examples, rendered with ``format_example``.
        subject: Subject id, turned into words by ``format_subject``.
        k: Number of leading rows to include; ``-1`` means every row.

    Returns:
        The header line plus ``k`` formatted examples, concatenated.
    """
    n_shots = train_df.shape[0] if k == -1 else k
    header = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject))
    shots = (format_example(train_df, row) for row in range(n_shots))
    return header + "".join(shots)


# def custom_stopping_criteria(input_ids, score, **kwargs):
#     stop_ids = [29871, 13, 13] # \n\n 
#     return input_ids[-len(stop_ids)]

def prepare_input(tokenizer, prompts, device='cuda'):
    """Tokenize a batch of prompts and move the resulting tensors to ``device``.

    Args:
        tokenizer: Object exposing ``batch_encode_plus``; it is called with
            ``return_tensors="pt"`` and ``padding=True``.
        prompts: The prompt strings to encode together.
        device: Target device for the tensors. Defaults to ``'cuda'``, the
            value that was previously hard-coded.

    Returns:
        A plain dict with only the ``input_ids`` and ``attention_mask``
        entries of the encoding.
    """
    encoded = tokenizer.batch_encode_plus(prompts, return_tensors="pt", padding=True)
    return {
        key: encoded[key].to(device) if torch.is_tensor(encoded[key]) else encoded[key]
        for key in encoded
        if key in ("input_ids", "attention_mask")
    }

def _set_default_special_tokens(tokenizer):
    """Use pad id 0 when the tokenizer has no pad token, and force BOS id to 1."""
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = 0
    tokenizer.bos_token_id = 1


def load(ckpt_dir, model_type):
    """Load a model/tokenizer pair for ``model_type`` and switch the model to eval mode.

    Args:
        ckpt_dir: Checkpoint directory or hub id passed to ``from_pretrained``.
            The ``'guanaco'`` branch ignores it and uses fixed names.
        model_type: One of ``'llama'``, ``'flan'``, ``'falcon'``, ``'moss'``,
            ``'guanaco'``, ``'vicuna'`` or ``'starcoder'``; any other value
            takes the generic ``AutoModelForCausalLM`` path.

    Returns:
        ``(model, tokenizer)``.
    """
    n_gpus = torch.cuda.device_count()

    if model_type == 'llama':
        # we use tensor parallel for loading llama
        tokenizer = LlamaTokenizer.from_pretrained(ckpt_dir, use_fast=False, padding_side="left")
        # NOTE(review): attention_dropout presumably only matters in train mode
        # (used by the ensemble confidence) - confirm.
        model = LlamaForCausalLM.from_pretrained(ckpt_dir, attention_dropout=.1, low_cpu_mem_usage=True, torch_dtype=torch.float16)
        model = tp.tensor_parallel(model, list(range(n_gpus)))
        _set_default_special_tokens(tokenizer)
    elif model_type == 'flan':
        # Imported here because the file-level imports do not provide it.
        from transformers import AutoModelForSeq2SeqLM

        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=False, padding_side="left")
        model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
        model = tp.tensor_parallel(model, list(range(n_gpus)))
        _set_default_special_tokens(tokenizer)
    elif model_type == 'falcon':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', torch_dtype=torch.bfloat16, trust_remote_code=True)
        _set_default_special_tokens(tokenizer)
    elif model_type == 'moss':
        from transformers import AutoConfig

        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        config = AutoConfig.from_pretrained(ckpt_dir, trust_remote_code=True)
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16, trust_remote_code=True)
        model.tie_weights()
        # The checkpoint location is ckpt_dir; the former `model_path` name was
        # never defined and raised NameError.
        model = load_checkpoint_and_dispatch(model, ckpt_dir, device_map="auto", no_split_module_classes=["MossBlock"], dtype=torch.float16)
        _set_default_special_tokens(tokenizer)
    elif model_type == 'guanaco':
        # Both names were used without being imported anywhere in the file.
        from transformers import BitsAndBytesConfig
        from peft import PeftModel

        model_name = "llama-65b"
        adapters_name = 'guanaco-65b'

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_4bit=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            max_memory={i: '24000MB' for i in range(torch.cuda.device_count())},
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
            ),
        )
        model = PeftModel.from_pretrained(model, adapters_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    elif model_type == 'vicuna':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', revision="main", trust_remote_code=False)
        _set_default_special_tokens(tokenizer)
    elif model_type == 'starcoder':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', trust_remote_code=True)
        _set_default_special_tokens(tokenizer)
    else:
        # mpt-30b's tokenizer only has the fast version
        use_fast = "mosaicml/mpt-30b" in ckpt_dir
        # tensor parallel is not used on this path (noted as buggy with falcon)
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=use_fast, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', torch_dtype=torch.bfloat16, trust_remote_code=True)
        if tokenizer.pad_token_id is None:
            # Prefer EOS as the padding token, falling back to id 0.
            if tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.pad_token_id = 0

    model.eval()

    return model, tokenizer

def batch_split(prompts, batch_num):
    """Group ``prompts`` into consecutive lists of ``batch_num`` items.

    The final list may be shorter; an empty input yields an empty result.
    """
    batches = [[]]
    for item in prompts:
        batches[-1].append(item)
        if len(batches[-1]) == batch_num:
            # Current batch is full: open a fresh one for the next item.
            batches.append([])
    if not batches[-1]:
        batches.pop()
    return batches


def batch_infer(model, tokenizer, prompts):
    """Generate one new token per prompt, in batches of 8.

    Returns:
        A list with the last character of each decoded output, in prompt order.
    """
    batch_size = 8
    decoded = []
    for batch_input in tqdm(batch_split(prompts, batch_size)):
        encode_inputs = prepare_input(tokenizer, batch_input)
        generated = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)
        decoded += tokenizer.batch_decode(generated, skip_special_tokens=True)
    # Only the final character of each decoded sequence is kept as the answer.
    return [text[-1] for text in decoded]

def confidence_infer(model, tokenizer, prompts, token_confidence_funcs, confidence_aggregation_funcs, sequence_confidence_funcs):
    """Answer every prompt with a single generated token and collect its confidence scores.

    Args:
        model: Model handed to ``generate_with_confidence``.
        tokenizer: Tokenizer handed to ``generate_with_confidence``.
        prompts: Iterable of prompt strings, processed one at a time.
        token_confidence_funcs: Per-token confidence functions.
        confidence_aggregation_funcs: Functions that aggregate per-token scores.
        sequence_confidence_funcs: Whole-sequence confidence functions.

    Returns:
        ``(answers, confidences)``: the decoded answers and, aligned with
        them, one ``{score_name: value}`` dict per prompt.
    """
    answers = []
    confidences = []
    for prompt in tqdm(prompts):
        answer, confidence_dict = generate_with_confidence(
            model, tokenizer, prompt, 1,
            token_confidence_funcs, confidence_aggregation_funcs, sequence_confidence_funcs,
        )
        answers.extend(answer)
        # list.extend(dict) would add only the keys and drop every score, so
        # append one dict per prompt. Tensors are turned into Python numbers
        # so the result can be written with json.dump.
        confidences.append({
            name: value.tolist() if torch.is_tensor(value) else value
            for name, value in confidence_dict.items()
        })

    return answers, confidences



def min_confidence_agg(values, all_ids, model):
    """Return ``('min', smallest element of values)``; ``all_ids`` and ``model`` are unused."""
    smallest = min(values)
    return 'min', smallest
def max_confidence_agg(values, all_ids, model):
    """Return ``('max', largest element of values)``; ``all_ids`` and ``model`` are unused."""
    largest = max(values)
    return 'max', largest
def avg_confidence_agg(values, all_ids, model):
    """Return ``('avg', mean over the last axis of the stacked values)``.

    ``values`` is a sequence of tensors that are stacked along a new leading
    axis; ``all_ids`` and ``model`` are unused.
    """
    stacked = torch.stack(list(values))
    return 'avg', stacked.mean(dim=-1)
def attention_weighted_agg(values, all_ids, model):
    """Attention-weighted average of the per-token scores.

    Based on Guan et al. 2023, "Shifting Attention to Relevance".

    Args:
        values: Sequence of scalar tensors, one per generated token.
        all_ids: Token ids fed to the model for the attention pass.
        model: Model exposing ``prepare_inputs_for_generation`` and returning
            ``attentions`` when called with ``output_attentions=True``.

    Returns:
        ``('attention_weighted', weighted mean)`` where the weights are
        normalised by their own sum.
    """
    with torch.no_grad():
        model_inputs = model.prepare_inputs_for_generation(all_ids)
        outputs = model(
            **model_inputs,
            return_dict=True,
            output_attentions=True,
            output_hidden_states=False,
        )
    # First entry of `attentions`, last index on the three leading axes, and
    # the trailing len(values) positions of the final axis.
    attn_weights = outputs['attentions'][0][-1, -1, -1, -len(values):]
    # Follow the attention dtype/device instead of forcing float16, so
    # torch.dot also works for bfloat16 or float32 models.
    scores = torch.stack(list(values)).to(device=attn_weights.device, dtype=attn_weights.dtype)
    return 'attention_weighted', torch.dot(scores, attn_weights) / torch.sum(attn_weights)

def logit_confidence(next_tokens_scores, next_token, all_ids, model):
    """Return ``('logit', max of next_tokens_scores[-1, -1, :])``; other arguments are unused."""
    last_scores = next_tokens_scores[-1, -1, :]
    return 'logit', last_scores.max()
    
def softmax_confidence(next_tokens_scores, next_token, all_ids, model):
    """Return ``('softmax', highest softmax probability of next_tokens_scores[-1, -1, :])``."""
    probs = torch.softmax(next_tokens_scores[-1, -1, :], dim=-1)
    return 'softmax', probs.max()
    
def entropy_confidence(next_tokens_scores, next_token, all_ids, model):
    """Return ``('entropy', entropy of the softmax over next_tokens_scores[-1, -1, :])``."""
    probs = torch.softmax(next_tokens_scores[-1, -1, :], dim=-1)
    distribution = Categorical(probs=probs)
    return 'entropy', distribution.entropy()

def ensemble_entropy_confidence(next_tokens_scores, next_token, all_ids, model, n_ensemble=5):
    """Entropy of the greedy next-token votes from ``n_ensemble`` train-mode passes.

    The model is switched to train mode for the passes (presumably so dropout
    makes them differ - confirm) and restored to its previous mode afterwards,
    even if a pass raises.

    Args:
        next_tokens_scores: Scores whose last axis gives the vocabulary size.
        next_token: Unused.
        all_ids: Token ids fed to the model on every pass.
        model: Model exposing ``prepare_inputs_for_generation``, ``training``
            and ``train``; calling it must return ``logits``.
        n_ensemble: Number of forward passes; must be at least 1.

    Returns:
        ``('ensemble_entropy', entropy of the vote distribution)``.

    Raises:
        ValueError: If ``n_ensemble`` is smaller than 1.
    """
    if n_ensemble < 1:
        raise ValueError("n_ensemble must be at least 1")

    votes = torch.zeros(next_tokens_scores.shape[-1], dtype=torch.float32, device=next_tokens_scores.device)
    was_training = model.training
    model.train()
    try:
        for _ in range(n_ensemble):
            with torch.no_grad():
                model_inputs = model.prepare_inputs_for_generation(all_ids)
                outputs = model(
                    **model_inputs,
                    return_dict=True,
                    output_attentions=False,
                    output_hidden_states=False,
                )
            # Greedy pick at the last position; the empty logits-processor
            # list used before was a no-op, so the raw logits are used directly.
            sampled_token = torch.argmax(outputs['logits'][:, -1, :])
            votes[sampled_token.item()] += 1
    finally:
        # Restore the caller's mode rather than always forcing eval.
        model.train(was_training)

    # Normalise the counts so the probabilities sum to 1.
    return 'ensemble_entropy', Categorical(probs=votes / n_ensemble).entropy()

def MMLU_self_reflection_confidence_promptv1(n_prompt_tokens,all_ids, model, tokenizer, prompt):
    """Ask the model whether its own proposed answer is true.

    Args:
        n_prompt_tokens: Number of leading tokens in ``all_ids`` that belong
            to the prompt.
        all_ids: 2-D tensor of prompt plus generated token ids; the last row
            is used.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to decode the answer and encode the
            follow-up prompt.
        prompt: The original prompt text.

    Returns:
        ``('self_reflection_promptv1', 1)`` if the single generated token
        equals the last id of ``tokenizer.encode('A')``, otherwise ``0`` as
        the second element.
    """
    # Everything after the prompt. Slicing from the front stays empty when
    # nothing was generated, where a negative-length slice would return the
    # whole sequence.
    proposed_answer = tokenizer.decode(all_ids[-1, n_prompt_tokens:])
    suffix = "Answer:"
    if prompt.endswith(suffix):
        # Drop the trailing "Answer:" only when it is really there.
        prompt = prompt[:-len(suffix)]
    prompt = prompt + (
        f"Proposed Answer: {proposed_answer}\n"
        "Is the proposed answer:\n"
        "(A) True\n"
        "(B) False\n"
        "The proposed answer is: "
    )
    encode_inputs = prepare_prompt(tokenizer, prompt)
    outputs = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)

    said_true = outputs[-1, -1].item() == tokenizer.encode('A')[-1]
    return 'self_reflection_promptv1', 1 if said_true else 0

def prepare_prompt(tokenizer, prompt, device='cuda'):
    """Tokenize a single prompt and move the resulting tensors to ``device``.

    Args:
        tokenizer: Object exposing ``encode_plus``; it is called with
            ``return_tensors="pt"`` and ``padding=True``.
        prompt: The prompt string to encode.
        device: Target device for the tensors. Defaults to ``'cuda'``, the
            value that was previously hard-coded.

    Returns:
        A plain dict with only the ``input_ids`` and ``attention_mask``
        entries of the encoding.
    """
    encoded = tokenizer.encode_plus(prompt, return_tensors="pt", padding=True)
    return {
        key: encoded[key].to(device) if torch.is_tensor(encoded[key]) else encoded[key]
        for key in encoded
        if key in ("input_ids", "attention_mask")
    }




def generate_with_confidence(model, tokenizer, prompt, max_new_tokens, token_confidence_funcs, confidence_aggregation_funcs, sequence_confidence_funcs ):
    """Greedy-decode a continuation of ``prompt`` and score its confidence.

    At least one token is always generated. Decoding stops once
    ``max_new_tokens`` tokens were added or the model's configured EOS token
    is produced.

    Args:
        model: Model exposing ``device``, ``generation_config``,
            ``prepare_inputs_for_generation`` and returning ``logits`` when
            called.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        prompt: Prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.
        token_confidence_funcs: Callables
            ``f(scores, next_token, all_ids, model) -> (name, value)`` run at
            every step.
        confidence_aggregation_funcs: Callables
            ``f(values, all_ids, model) -> (name, value)`` run once per
            token-level score.
        sequence_confidence_funcs: Callables
            ``f(n_prompt_tokens, all_ids, model, tokenizer, prompt) -> (name, value)``.

    Returns:
        ``([answer], sequence_confidences)``: the decoded continuation in a
        one-element list and a dict keyed by ``"<agg>|<token score>"`` plus
        the sequence-level score names.
    """
    all_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    n_prompt_tokens = all_ids.shape[-1]
    max_length = n_prompt_tokens + max_new_tokens

    # The configured EOS id may be missing, a single int, or a list of ints.
    eos_token_id = model.generation_config.eos_token_id
    if eos_token_id is None:
        eos_ids = ()
    elif isinstance(eos_token_id, int):
        eos_ids = (eos_token_id,)
    else:
        eos_ids = tuple(eos_token_id)

    all_token_confidences = {}
    sequence_confidences = {}

    while True:
        with torch.no_grad():
            model_inputs = model.prepare_inputs_for_generation(all_ids)
            outputs = model(
                **model_inputs,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
            )

        # The token-level functions receive the full logits tensor, which is
        # what the previous (empty, no-op) logits-processor call passed through.
        next_tokens_scores = outputs['logits']
        next_token = torch.argmax(next_tokens_scores[:, -1, :])

        step_confidences = dict(
            func(next_tokens_scores, next_token, all_ids, model)
            for func in token_confidence_funcs
        )
        for name, val in step_confidences.items():
            all_token_confidences.setdefault(name, []).append(val)

        all_ids = torch.cat([all_ids, next_token.reshape(1, 1)], dim=-1)

        if all_ids.shape[-1] >= max_length or next_token.item() in eos_ids:
            break

    for func in confidence_aggregation_funcs:
        for token_confidence_id, values in all_token_confidences.items():
            agg_id, val = func(values, all_ids, model)
            sequence_confidences[agg_id + '|' + token_confidence_id] = val

    for func in sequence_confidence_funcs:
        seq_confidence_id, val = func(n_prompt_tokens, all_ids, model, tokenizer, prompt)
        sequence_confidences[seq_confidence_id] = val

    # Slice from the end of the prompt so an early EOS stop (or a
    # non-positive max_new_tokens) never pulls prompt tokens into the answer.
    answer = tokenizer.decode(all_ids[-1, n_prompt_tokens:])
    return [answer], sequence_confidences




In [None]:
# Load the model and tokenizer once so later cells can reuse them.
ckpt_dir = 'models/Llama-2-7b-hf/'
model_type = 'llama'
model, tokenizer = load(ckpt_dir, model_type)

In [None]:
# Evaluation driver: builds few-shot prompts per task, runs confidence-aware
# inference, writes the results to JSON and prints the accuracies.
ckpt_dir = 'models/Llama-2-7b-hf/'
param_size = '7'
model_type = 'llama'
data_dir = 'benchmarks/MMLU/'
benchmark = 'MMLU'
ntrain = 5

run_results = {}
output_filename = 'testrun_results_%s_%s_%sb.json' % (benchmark, model_type, param_size)

# Registries consumed by confidence_infer / generate_with_confidence.
confidence_aggregation_funcs = [min_confidence_agg,max_confidence_agg,avg_confidence_agg,attention_weighted_agg]
token_confidence_funcs = [logit_confidence,softmax_confidence,entropy_confidence, ensemble_entropy_confidence]
sequence_confidence_funcs = [MMLU_self_reflection_confidence_promptv1, ]


# `model` and `tokenizer` are expected to exist already (loaded in an earlier cell).
#model, tokenizer = load(ckpt_dir, model_type)
start_time = time.time()
# Only the first task is evaluated in this test run.
for task in TASKS[:1]:
    print('Testing %s ...' % task)
    records = []
    # The first `ntrain` dev rows serve as few-shot examples.
    dev_df = pd.read_csv(os.path.join(data_dir, "dev", task + "_dev.csv"), header=None)[:ntrain]
    test_df = pd.read_csv(os.path.join(data_dir, "test", task + "_test.csv"), header=None)
    for i in range(test_df.shape[0]):
        # get prompt and make sure it fits
        k = ntrain
        prompt_end = format_example(test_df, i, include_answer=False)
        train_prompt = gen_prompt(dev_df, task, k)
        prompt = train_prompt + prompt_end
        # Shrink the prompt until it fits in 2048 tokens: index 0 of the split
        # is the header, so pop(1) drops the first remaining chunk after it.
        # NOTE(review): raises IndexError if fewer than two chunks remain - confirm
        # this cannot happen for the data in use.
        while len(tokenizer.tokenize(prompt)) + 1> 2048: # bos token
            prompt_split = prompt.split("\n\n")
            prompt_split.pop(1)
            prompt = '\n\n'.join(prompt_split)
        # The gold label is the last column of the test row.
        label = test_df.iloc[i, test_df.shape[1]-1]
        records.append({'prompt':prompt, 'answer':label})

    pred_answers, confidences = confidence_infer(model, tokenizer, [record['prompt'] for record in records],token_confidence_funcs, confidence_aggregation_funcs, sequence_confidence_funcs)
    gold_answers = [record['answer'] for record in records]
    run_results[task] = {'pred_answers':pred_answers, 'gold_answers':gold_answers, 'confidences' :confidences}
# Results are written only after every task has finished.
with open(output_filename, 'w') as f:
    json.dump(run_results, f, ensure_ascii=False, indent=2)

compute_metric(output_filename)
# The reported time includes writing the file and computing the metric.
end_time = time.time()
print("total run time %.2f" % (end_time - start_time))





Testing abstract_algebra ...


 74%|███████▍  | 74/100 [01:58<00:40,  1.54s/it]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Debugging cell: the body of generate_with_confidence run at top level on the
# `prompt` left over from the evaluation cell, so intermediates can be inspected.
max_new_tokens = 1
all_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device) 
n_prompt_tokens = all_ids.shape[-1]
logits_processor = LogitsProcessorList()
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=n_prompt_tokens+max_new_tokens)])
# NOTE(review): pad_token_id and eos_token_id are read but not used below.
pad_token_id = model.generation_config.pad_token_id
eos_token_id = model.generation_config.eos_token_id

all_token_confidences = {}
sequence_confidences = {}


# Greedy decoding loop; exits once the stopping criteria report completion.
while True:

    with torch.no_grad():
        model_inputs = model.prepare_inputs_for_generation(all_ids, )
    

        outputs = model(
            **model_inputs,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
        )
    
    # The processor list is empty; the result is indexed as a 3-D tensor below.
    next_tokens_scores = logits_processor(outputs['logits'][:,-1,:], outputs['logits'])
    next_token = torch.argmax(next_tokens_scores[:,-1,:])

    # One (name, value) pair per token-level confidence function.
    token_confidences = { id: val for id, val in [func(next_tokens_scores, next_token, all_ids, model) for func in token_confidence_funcs]}

    for id, val in token_confidences.items():
        all_token_confidences[id] = all_token_confidences.get(id,[])
        all_token_confidences[id].append(val)
    
    # Append the chosen token as a new column of the (1, length) id tensor.
    all_ids = torch.cat([all_ids,next_token.unsqueeze(0).unsqueeze(0)],axis = -1)

    if stopping_criteria(all_ids, next_tokens_scores):
        break


# Aggregate every token-level score with every aggregation function.
for func in confidence_aggregation_funcs:
    for token_confidence_id , values in all_token_confidences.items():
        agg_id , val = func(values, all_ids, model)
        sequence_confidences[agg_id+'|'+token_confidence_id] = val

# Sequence-level scores are stored under their own names.
for func in sequence_confidence_funcs:
    seq_confidence_id , val = func(n_prompt_tokens,all_ids, model, tokenizer, prompt)
    sequence_confidences[seq_confidence_id] = val       


# Decode the last max_new_tokens ids of the final row as the answer.
answer = tokenizer.decode(all_ids[-1,-1*max_new_tokens:])
print( [answer], sequence_confidences)






['B'] {'min|logit': tensor(23.6562, device='cuda:0'), 'min|softmax': tensor(0.5079, device='cuda:0'), 'min|entropy': tensor(1.2472, device='cuda:0'), 'min|ensemble_entropy': tensor(1.1921e-07, device='cuda:0'), 'max|logit': tensor(23.6562, device='cuda:0'), 'max|softmax': tensor(0.5079, device='cuda:0'), 'max|entropy': tensor(1.2472, device='cuda:0'), 'max|ensemble_entropy': tensor(1.1921e-07, device='cuda:0'), 'avg|logit': tensor(23.6562, device='cuda:0'), 'avg|softmax': tensor(0.5079, device='cuda:0'), 'avg|entropy': tensor(1.2472, device='cuda:0'), 'avg|ensemble_entropy': tensor(1.1921e-07, device='cuda:0'), 'attention_weighted|logit': tensor(23.6406, device='cuda:0', dtype=torch.float16), 'attention_weighted|softmax': tensor(0.5078, device='cuda:0', dtype=torch.float16), 'attention_weighted|entropy': tensor(1.2471, device='cuda:0', dtype=torch.float16), 'attention_weighted|ensemble_entropy': tensor(0., device='cuda:0', dtype=torch.float16), 'self_reflection_promptv1': 0}


In [141]:
tokenizer.decode(all_ids.squeeze())

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [135]:
print(1)

1


In [102]:
proposed_answer = tokenizer.decode(all_ids[-1*(all_ids.shape[-1] - n_prompt_tokens):])


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [114]:
tokenizer.decode(all_ids[:,-1*(all_ids.shape[-1] - n_prompt_tokens):])

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [130]:
all_ids

tensor([[    1,   450,  1494,   526,  2999,  7348,  5155,   313,  2541,  6089,
         29897,  1048, 29871,  9846,  9623, 29889,    13,    13, 12542,   599,
           274,   297,   796, 29918, 29941,  1316,   393,   796, 29918, 29941,
         29961, 29916, 29962, 14571, 29916, 29985, 29906,   718,   274, 29897,
           338,   263,  1746, 29889,    13, 29909, 29889, 29871, 29900,    13,
         29933, 29889, 29871, 29896,    13, 29907, 29889, 29871, 29906,    13,
         29928, 29889, 29871, 29941,    13, 22550, 29901,   350,    13,    13,
         14473, 29871, 29896,   891,   960,   263, 29950,   338,   385,  1543,
           310,   263,  7329,  2318, 29892,   769,   891, 29874, 29950, 29989,
          1933,  2247,   891, 29874, 29989, 29889,  6666,   882, 29871, 29906,
           891,   960,   379,   322,   476,   526,  1014, 13155,   310,   402,
           769,   379, 29968,   338,   263, 24410,   310,   402, 29889,    13,
         29909, 29889,  5852, 29892,  5852,    13, 2

In [127]:
tokenizer.decode(all_ids)

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [33]:
torch.sum(next_tokens_scores[-1,-1,:])

tensor(93634.0234, device='cuda:0')

In [6]:
def _set_default_special_tokens(tokenizer):
    """Use pad id 0 when the tokenizer has no pad token, and force BOS id to 1."""
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = 0
    tokenizer.bos_token_id = 1


def load(ckpt_dir, model_type):
    """Load a model/tokenizer pair for ``model_type`` and switch the model to eval mode.

    Args:
        ckpt_dir: Checkpoint directory or hub id passed to ``from_pretrained``.
            The ``'guanaco'`` branch ignores it and uses fixed names.
        model_type: One of ``'llama'``, ``'flan'``, ``'falcon'``, ``'moss'``,
            ``'guanaco'``, ``'vicuna'`` or ``'starcoder'``; any other value
            takes the generic ``AutoModelForCausalLM`` path.

    Returns:
        ``(model, tokenizer)``.
    """
    n_gpus = torch.cuda.device_count()

    if model_type == 'llama':
        # we use tensor parallel for loading llama
        tokenizer = LlamaTokenizer.from_pretrained(ckpt_dir, use_fast=False, padding_side="left")
        model = LlamaForCausalLM.from_pretrained(ckpt_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
        model = tp.tensor_parallel(model, list(range(n_gpus)))
        _set_default_special_tokens(tokenizer)
    elif model_type == 'flan':
        # Imported here because the file-level imports do not provide it.
        from transformers import AutoModelForSeq2SeqLM

        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=False, padding_side="left")
        model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
        model = tp.tensor_parallel(model, list(range(n_gpus)))
        _set_default_special_tokens(tokenizer)
    elif model_type == 'falcon':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', torch_dtype=torch.bfloat16, trust_remote_code=True)
        _set_default_special_tokens(tokenizer)
    elif model_type == 'moss':
        from transformers import AutoConfig

        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        config = AutoConfig.from_pretrained(ckpt_dir, trust_remote_code=True)
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16, trust_remote_code=True)
        model.tie_weights()
        # The checkpoint location is ckpt_dir; the former `model_path` name was
        # never defined and raised NameError.
        model = load_checkpoint_and_dispatch(model, ckpt_dir, device_map="auto", no_split_module_classes=["MossBlock"], dtype=torch.float16)
        _set_default_special_tokens(tokenizer)
    elif model_type == 'guanaco':
        # Both names were used without being imported anywhere in the file.
        from transformers import BitsAndBytesConfig
        from peft import PeftModel

        model_name = "llama-65b"
        adapters_name = 'guanaco-65b'

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_4bit=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            max_memory={i: '24000MB' for i in range(torch.cuda.device_count())},
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
            ),
        )
        model = PeftModel.from_pretrained(model, adapters_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    elif model_type == 'vicuna':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', revision="main", trust_remote_code=False)
        _set_default_special_tokens(tokenizer)
    elif model_type == 'starcoder':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', trust_remote_code=True)
        _set_default_special_tokens(tokenizer)
    else:
        # mpt-30b's tokenizer only has the fast version
        use_fast = "mosaicml/mpt-30b" in ckpt_dir
        # tensor parallel is not used on this path (noted as buggy with falcon)
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=use_fast, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map='balanced_low_0', torch_dtype=torch.bfloat16, trust_remote_code=True)
        if tokenizer.pad_token_id is None:
            # Prefer EOS as the padding token, falling back to id 0.
            if tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.pad_token_id = 0

    model.eval()

    return model, tokenizer

def token_level_confidence():
    """Placeholder with no behaviour yet; returns ``None``."""
    return None


def infer_with_confidence(model, tokenizer, prompts, token_confidence_funcs, confidence_aggregation_funcs):
    """Generate one new token per prompt, one prompt at a time.

    ``token_confidence_funcs`` and ``confidence_aggregation_funcs`` are
    accepted but not used by the current body.

    Returns:
        A list with the last character of each decoded output, in prompt order.
    """
    batch_size = 1
    decoded = []
    for batch_input in tqdm(batch_split(prompts, batch_size)):
        encode_inputs = prepare_input(tokenizer, batch_input)

        generated = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)
        decoded += tokenizer.batch_decode(generated, skip_special_tokens=True)
    return [text[-1] for text in decoded]



In [7]:
# Run configuration for the cells below; `ntrain` is the few-shot example count
# used by the evaluation driver.
model_type = 'llama'
model_dir = 'models/Llama-2-7b-hf'
param_size = '7'
data_dir = 'benchmarks/MMLU/'
ntrain = 5
# Loading through load() is disabled; the next cells load the pieces by hand.
#model, tokenizer = load(model_dir, model_type)

In [8]:
# Slow (non-fast) Llama tokenizer with left-side padding.
tokenizer = LlamaTokenizer.from_pretrained(model_dir, use_fast=False, padding_side="left")
        


In [9]:
# Load the fp16 model on a single GPU with attention dropout configured, then
# apply the same pad/BOS defaults that load() uses.
model = LlamaForCausalLM.from_pretrained(model_dir, attention_dropout=.1,  low_cpu_mem_usage = True, torch_dtype=torch.float16)
model.to('cuda')
tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
tokenizer.bos_token_id = 1
# NOTE(review): this leaves the model in train mode for every later cell.
model.train() # turns on attention dropout, alternatively, we could set training = True for all the attention layers

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Lin

In [12]:
model.eval()
print(model.training)
model.train()
print(model.training)


False
True


In [13]:
x = torch.zeros(1,1,10, dtype = torch.float32, device = 'cuda')
x

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]], device='cuda:0')

In [16]:
x = x/2
x

tensor([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]]], device='cuda:0')

In [76]:

def min_confidence_agg(values, all_ids, model):
    """Return the minimum of tensor ``values`` along its last axis.

    The result is the ``(values, indices)`` pair produced by ``torch.min``
    with ``dim=-1``. NOTE(review): unlike the earlier definition in this file,
    no ``'min'`` label is returned.
    """
    return values.min(dim=-1)
def max_confidence_agg(values, all_ids, model):
    """Return the maximum of tensor ``values`` along its last axis.

    The result is the ``(values, indices)`` pair produced by ``torch.max``
    with ``dim=-1``. NOTE(review): unlike the earlier definition in this file,
    no ``'max'`` label is returned.
    """
    return values.max(dim=-1)
def avg_confidence_agg(values, all_ids, model):
    """Return the mean of tensor ``values`` along its last axis (no label attached)."""
    return values.mean(dim=-1)
def attention_weighted_agg(values, all_ids, model):
    """Dot product of ``values`` with one row of first-entry attention weights.

    Based on Guan et al. 2023, "Shifting Attention to Relevance". The row is
    taken at the last index of the three leading axes of
    ``outputs['attentions'][0]`` and is not normalised.
    """
    with torch.no_grad():
        inputs = model.prepare_inputs_for_generation(all_ids)
        outputs = model(
            **inputs,
            return_dict=True,
            output_attentions=True,
            output_hidden_states=False,
        )
    first_attention = outputs['attentions'][0]
    attn_row = first_attention[-1, -1, -1, :]
    return torch.dot(values, attn_row)

def logit_confidence(next_tokens_scores, next_token, all_ids, model):
    """Return the ``(values, indices)`` maximum of ``next_tokens_scores`` along its last axis."""
    return next_tokens_scores.max(dim=-1)
    
def softmax_confidence(next_tokens_scores, next_token, all_ids, model):
    """Return the highest softmax probability of ``next_tokens_scores[-1, -1, :]``."""
    probs = torch.softmax(next_tokens_scores[-1, -1, :], dim=-1)
    return probs.max()
    
def entropy_confidence(next_tokens_scores, next_token, all_ids, model):
    """Return the entropy of the softmax over ``next_tokens_scores[-1, -1, :]``."""
    probs = torch.softmax(next_tokens_scores[-1, -1, :], dim=-1)
    distribution = Categorical(probs=probs)
    return distribution.entropy()

def ensemble_entropy_confidence(next_tokens_scores, next_token, all_ids, model, n_ensemble=5):
    """Entropy of the greedy next-token votes from ``n_ensemble`` train-mode passes.

    The model is switched to train mode for the passes (presumably so dropout
    makes them differ - confirm) and restored to its previous mode afterwards,
    even if a pass raises.

    Args:
        next_tokens_scores: Scores whose last axis gives the vocabulary size.
        next_token: Unused.
        all_ids: Token ids fed to the model on every pass.
        model: Model exposing ``prepare_inputs_for_generation``, ``training``
            and ``train``; calling it must return ``logits``.
        n_ensemble: Number of forward passes; must be at least 1.

    Returns:
        The entropy of the vote distribution (no label attached).

    Raises:
        ValueError: If ``n_ensemble`` is smaller than 1.
    """
    if n_ensemble < 1:
        raise ValueError("n_ensemble must be at least 1")

    votes = torch.zeros(next_tokens_scores.shape[-1], dtype=torch.float32, device=next_tokens_scores.device)
    was_training = model.training
    model.train()
    try:
        for _ in range(n_ensemble):
            with torch.no_grad():
                model_inputs = model.prepare_inputs_for_generation(all_ids)
                outputs = model(
                    **model_inputs,
                    return_dict=True,
                    output_attentions=False,
                    output_hidden_states=False,
                )
            # Use the logits directly instead of a `logits_processor` global
            # that this function never defined.
            sampled_token = torch.argmax(outputs['logits'][:, -1, :])
            votes[sampled_token.item()] += 1
    finally:
        # Restore the caller's mode rather than always forcing eval.
        model.train(was_training)

    # Normalise the counts so the probabilities sum to 1.
    return Categorical(probs=votes / n_ensemble).entropy()

def MMLU_self_reflection_confidence_promptv1(all_ids, n_prompt_tokens, model, tokenizer, prompt):
    """Ask the model whether its own proposed answer is true.

    The tokens after the first `n_prompt_tokens` positions of `all_ids` are
    decoded as the proposed answer, appended to `prompt` (with its trailing
    "Answer:" removed) together with a True/False question, and the model
    generates exactly one more token.

    Args:
        all_ids: tensor of prompt + generated token ids, either 1-D or with
            a leading batch axis (the last row is used).
        n_prompt_tokens: number of leading ids that belong to the prompt.
        model: model exposing `generate`.
        tokenizer: tokenizer exposing `decode`, `encode` and `pad_token_id`.
        prompt: the original text prompt.

    Returns:
        1 if the generated token id equals the last id of
        `tokenizer.encode('A')`, otherwise 0.
    """
    # Slice along the token axis so this works for (n,) and (batch, n) inputs
    # and yields an empty answer when nothing was generated.
    generated_ids = all_ids[..., n_prompt_tokens:]
    if generated_ids.dim() > 1:
        generated_ids = generated_ids[-1]
    proposed_answer = tokenizer.decode(generated_ids)

    answer_marker = "Answer:"
    if prompt.endswith(answer_marker):
        prompt = prompt[:-len(answer_marker)]  # remove the "Answer:" at the end of the prompt
    prompt = prompt + f'''Proposed Answer: {proposed_answer}
Is the proposed answer:
(A) True
(B) False
The proposed answer is: '''
    # prepare_input expects a batch (list) of prompts.
    encode_inputs = prepare_input(tokenizer, [prompt])
    outputs = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)

    # return 1 if model outputs 'A' else 0
    return 1 if outputs[-1, -1].item() == tokenizer.encode('A')[-1] else 0


# Function registries, named after the matching parameters of
# generate_with_confidence_funcs.
confidence_aggregation_funcs = [
    min_confidence_agg,
    max_confidence_agg,
    avg_confidence_agg,
    attention_weighted_agg,
]
token_confidence_funcs = [
    logit_confidence,
    softmax_confidence,
    entropy_confidence,
    ensemble_entropy_confidence,
]
sequence_confidence_funcs = [
    MMLU_self_reflection_confidence_promptv1,
]


def prepare_prompt(tokenizer, prompt):
    """Encode a single prompt and move its tensors to the 'cuda' device.

    Only the "input_ids" and "attention_mask" entries of the encoding are
    kept; non-tensor entries are passed through unchanged.
    """
    encoding = tokenizer.encode_plus(prompt, return_tensors="pt", padding=True)
    model_inputs = {}
    for key in encoding:
        if key not in ["input_ids", "attention_mask"]:
            continue
        entry = encoding[key]
        model_inputs[key] = entry.to('cuda') if torch.is_tensor(entry) else entry
    return model_inputs

def generate_with_confidence_funcs(model, tokenizer, prompt, max_new_tokens, token_confidence_funcs, confidence_aggregation_funcs, sequence_confidence_funcs ):
    """Greedily extend `prompt` and score the result with confidence functions.

    Tokens are generated one at a time by taking the arg-max of the logits
    at the last position until the sequence reaches
    `n_prompt_tokens + max_new_tokens` ids. At least one token is always
    generated because the length check runs after each step.

    Args:
        model: model exposing `device`, `prepare_inputs_for_generation` and
            a forward call returning `logits`.
        tokenizer: tokenizer exposing `encode` and `decode`.
        prompt: text to continue.
        max_new_tokens: number of tokens to generate.
        token_confidence_funcs: callables invoked once per generated token
            as `f(next_tokens_scores, next_token, all_ids, model)`.
        confidence_aggregation_funcs: callables invoked once per collected
            token-confidence series as `f(values, all_ids, model)`.
        sequence_confidence_funcs: callables invoked once at the end as
            `f(all_ids, n_prompt_tokens, model, tokenizer, prompt)`.

    Every callable may return either an `(id, value)` pair whose id is a
    string, or a bare value, in which case the callable's `__name__` is used
    as the id.

    Returns:
        `([answer], sequence_confidences)`: `answer` is the last character
        of the decoded sequence (special tokens skipped, as in `batch_infer`)
        and `sequence_confidences` maps `"<agg id>|<token id>"` and sequence
        ids to their values.
    """
    def _named(func, result):
        # Reconcile the two return conventions described in the docstring.
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[0], str):
            return result
        return func.__name__, result

    def _as_series(values):
        # Aggregators call tensor ops, so stack scalar tensors into one 1-D
        # tensor; anything else is handed over unchanged.
        if values and all(torch.is_tensor(v) and v.dim() == 0 for v in values):
            return torch.stack(values)
        return values

    all_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    n_prompt_tokens = all_ids.shape[-1]
    logits_processor = LogitsProcessorList()
    stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=n_prompt_tokens+max_new_tokens)])

    all_token_confidences = {}
    sequence_confidences = {}

    while True:
        with torch.no_grad():
            model_inputs = model.prepare_inputs_for_generation(all_ids)
            outputs = model(
                **model_inputs,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
            )

        next_tokens_scores = logits_processor(outputs['logits'][:, -1, :], outputs['logits'])
        # Arg-max over the vocabulary axis keeps the batch axis, so the new
        # token can be concatenated onto the 2-D `all_ids` below.
        next_token = torch.argmax(next_tokens_scores[:, -1, :], dim=-1)

        for func in token_confidence_funcs:
            conf_id, val = _named(func, func(next_tokens_scores, next_token, all_ids, model))
            all_token_confidences.setdefault(conf_id, []).append(val)

        all_ids = torch.cat([all_ids, next_token.unsqueeze(-1)], dim=-1)

        if stopping_criteria(all_ids, next_tokens_scores):
            break

    # Decode one row of ids (decode expects a flat sequence) and keep the
    # last character; `[-1:]` yields '' instead of raising on empty text.
    answer = tokenizer.decode(all_ids[0], skip_special_tokens=True)[-1:]

    for func in confidence_aggregation_funcs:
        for token_confidence_id, values in all_token_confidences.items():
            agg_id, val = _named(func, func(_as_series(values), all_ids, model))
            sequence_confidences[agg_id+'|'+token_confidence_id] = val

    for func in sequence_confidence_funcs:
        seq_confidence_id, val = _named(func, func(all_ids, n_prompt_tokens, model, tokenizer, prompt))
        sequence_confidences[seq_confidence_id] = val

    return [answer], sequence_confidences


    



In [27]:
# Arbitrary scratch string bound to `prompt` for manual experimentation.
prompt = 'ARLAKJDFLJKASDJKL:DASJ:KLDLjk;'


In [75]:
# Check whether the final id of the encoded input equals the last id the
# tokenizer produces for the string 'B'.
encode_inputs['input_ids'][-1,-1].item() == tokenizer.encode('B')[-1]

True

In [71]:
# Decode the final id of the encoded input back to text.
tokenizer.decode(encode_inputs['input_ids'][-1,-1])

'B'

In [74]:
# Last token id produced when encoding the string 'B'.
tokenizer.encode('B')[-1]

350

In [77]:
# Echo the current value of `outputs`.
outputs

tensor([[    1,   319,  2241, 22311, 29967,  4037, 29931, 29967, 29968,  3289,
         29928, 29967, 29968, 29931, 29901, 29928,  3289, 29967, 29901, 29968,
         10249, 29931, 25467, 29936,   350, 29936]], device='cuda:0')

In [17]:
# Generate exactly one new token for the already-encoded inputs.
outputs = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)


NameError: name 'next_token' is not defined

In [65]:
# Scratch check: encode a prompt that ends in ' B', generate one more token,
# and echo the resulting id tensor.
prompt = 'ARLAKJDFLJKASDJKL:DASJ:KLDLjk; B'
encode_inputs = prepare_prompt(tokenizer,prompt)
outputs = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)
outputs

tensor([[    1,   319,  2241, 22311, 29967,  4037, 29931, 29967, 29968,  3289,
         29928, 29967, 29968, 29931, 29901, 29928,  3289, 29967, 29901, 29968,
         10249, 29931, 25467, 29936,   350, 29936]], device='cuda:0')

In [61]:
# Decode the final id of the encoded input back to text.
tokenizer.decode(encode_inputs['input_ids'][-1,-1])

';'

In [18]:
# Scratch cell: run one forward pass over a fixed prompt and derive the
# greedy next token, an attention summary and the next-token entropy.
prompt = 'THIS IS MY PROMPT MY PROMPT IS GOOD'
all_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda') 
n_tokens = all_ids.shape[-1]

# Empty processor list; the length criterion is set to the prompt length.
logits_processor = LogitsProcessorList()
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=n_tokens)])
# Read from the generation config; not used further in this cell.
pad_token_id = model.generation_config.pad_token_id
eos_token_id = model.generation_config.eos_token_id





# Forward pass without gradients, requesting attentions and hidden states.
with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(all_ids, )

    outputs = model(
        **model_inputs,
            return_dict=True,
            output_attentions=True,
            output_hidden_states=True,
    )

# The last-position logits are passed as the first argument and the full
# logits as the second; the result is indexed with three axes below.
next_tokens_scores = logits_processor(outputs['logits'][:,-1,:], outputs['logits'])

# argmax without a dim argument returns a single flattened index.
next_token = torch.argmax(next_tokens_scores[:,-1,:])
generated_token = tokenizer.decode(next_token)
# First attentions entry, squeezed, last index on axis 1, averaged over axis 0.
mean_attn_weight = outputs['attentions'][0].squeeze()[:,-1,:].mean(axis = 0)
# NOTE(review): rebinding `entropy` here hides the `entropy` imported from
# scipy.stats at the top of the file.
entropy = Categorical(probs=torch.nn.functional.softmax(next_tokens_scores[-1,-1,:], dim = -1)).entropy()


In [27]:
# Arg-max over the last axis of the final-position scores, which keeps the
# leading axis (unlike the dim-less argmax used elsewhere).
next_token = torch.argmax(next_tokens_scores[:,-1,:],axis = -1)


In [32]:
# Preview `all_ids` with the new token appended along the last axis; the
# result is only displayed, `all_ids` is not reassigned.
torch.cat([all_ids,next_token.unsqueeze(0)],axis = -1)

tensor([[    1,  3446,  3235,  8519, 19519,   349,  3491,  7982, 19519,   349,
          3491,  7982,  8519, 21947, 13668,    13]], device='cuda:0')

In [25]:
# Size of the last axis of the scores (presumably the vocabulary size).
next_tokens_scores.shape[-1]

32000

In [74]:
# Decode the arg-max token (over axis 2) at every position of the scores.
tokenizer.batch_decode(torch.argmax(next_tokens_scores,axis = 2))

['TagsIS IS A FERI FOR WROMPT IS AING M']

In [48]:
# Same attention slice as `mean_attn_weight`, but averaged over axis 1
# instead of axis 0.
outputs['attentions'][0].squeeze()[:,-1,:].mean(axis = 1)

tensor([0.1174, 0.0569, 0.0568, 0.0371, 0.0476, 0.0797, 0.0546, 0.0548, 0.0486,
        0.0847, 0.0648, 0.0690, 0.0579, 0.0759, 0.0941], device='cuda:0',
       dtype=torch.float16)

In [55]:
# Logits at the final sequence position.
outputs['logits'][:,-1,:]\

torch.Size([1, 32000])

In [16]:
# Evaluation driver: for every task, build few-shot prompts from the dev
# split, predict one answer per test row, then save and score all results.
# `model_type`, `param_size`, `data_dir` and `ntrain` are defined elsewhere.
run_results = {}
output_filename = 'run_results_%s_%sb.json' % (model_type, param_size)

start_time = time.time()
for task in TASKS:
    print('Testing %s ...' % task)
    records = []
    # First `ntrain` dev rows serve as few-shot examples.
    dev_df = pd.read_csv(os.path.join(data_dir, "dev", task + "_dev.csv"), header=None)[:ntrain]
    test_df = pd.read_csv(os.path.join(data_dir, "test", task + "_test.csv"), header=None)
    for i in range(test_df.shape[0]):
        # get prompt and make sure it fits
        k = ntrain
        prompt_end = format_example(test_df, i, include_answer=False)
        train_prompt = gen_prompt(dev_df, task, k)
        prompt = train_prompt + prompt_end
        # While the prompt is too long, drop the second "\n\n"-separated
        # segment (index 1) and re-join.
        while len(tokenizer.tokenize(prompt)) + 1> 2048: # bos token
            prompt_split = prompt.split("\n\n")
            prompt_split.pop(1)
            prompt = '\n\n'.join(prompt_split)
        # The gold label is read from the last column of the test row.
        label = test_df.iloc[i, test_df.shape[1]-1]
        records.append({'prompt':prompt, 'answer':label})

    pred_answers = batch_infer(model, tokenizer, [record['prompt'] for record in records])
    gold_answers = [record['answer'] for record in records]
    run_results[task] = {'pred_answers':pred_answers, 'gold_answers':gold_answers}
# Results are written once, after every task has finished.
with open(output_filename, 'w') as f:
    json.dump(run_results, f, ensure_ascii=False, indent=2)

compute_metric(output_filename)
end_time = time.time()
print("total run time %.2f" % (end_time - start_time))



Testing abstract_algebra ...


 21%|██        | 21/100 [00:04<00:16,  4.70it/s]


KeyboardInterrupt: 

In [12]:


def compute_metric(output_filename):
    """Print per-task and overall accuracy for a saved results file.

    The file must contain a JSON object mapping each task name to an object
    with `pred_answers` and `gold_answers` lists. Predictions and gold
    labels are compared pairwise; each task's accuracy is the number of
    matches divided by the number of gold answers.

    A task without gold answers, or a file without any answers at all, is
    reported as 0.0000 instead of raising ZeroDivisionError.

    Args:
        output_filename: path of the JSON results file to read.
    """
    with open(output_filename, 'r') as f:
        run_results = json.load(f)

    total_acc = 0
    total_num = 0
    for task, task_result in run_results.items():
        gold_answers = task_result['gold_answers']
        pred_answers = task_result['pred_answers']
        acc = sum(1 for pred, gold in zip(pred_answers, gold_answers) if pred == gold)
        task_accuracy = acc / len(gold_answers) if gold_answers else 0.0
        print("ACC-%s: %.4f" % (task, task_accuracy))
        total_acc += acc
        total_num += len(gold_answers)

    overall_accuracy = total_acc / total_num if total_num else 0.0
    print("ACC-all: %.4f" % overall_accuracy)


def format_subject(subject):
    """Turn an underscore-separated subject into space-prefixed words.

    Every part is preceded by a single space, so the result always starts
    with a space (e.g. "a_b" becomes " a b").
    """
    return "".join(" " + word for word in subject.split("_"))

def format_example(df, idx, include_answer=True):
    """Render row `idx` of `df` as a multiple-choice question block.

    Column 0 is the question text, the last column is the answer, and the
    columns in between are options, each labelled with the matching entry
    of the module-level `choices`. The block ends with "Answer:", followed
    by the answer and a blank line when `include_answer` is true.
    """
    n_options = df.shape[1] - 2
    option_lines = [
        "\n{}. {}".format(choices[pos], df.iloc[idx, pos+1])
        for pos in range(n_options)
    ]
    question_block = df.iloc[idx, 0] + "".join(option_lines) + "\nAnswer:"
    if not include_answer:
        return question_block
    return question_block + " {}\n\n".format(df.iloc[idx, n_options + 1])

def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot prefix: a header line plus `k` solved examples.

    With the default `k=-1`, every row of `train_df` is used.
    """
    header = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject))
    n_examples = train_df.shape[0] if k == -1 else k
    solved_examples = [format_example(train_df, row) for row in range(n_examples)]
    return header + "".join(solved_examples)


# def custom_stopping_criteria(input_ids, score, **kwargs):
#     stop_ids = [29871, 13, 13] # \n\n 
#     return input_ids[-len(stop_ids)]

def prepare_input(tokenizer, prompts):
    """Batch-encode `prompts` with padding and move tensors to 'cuda'.

    Only the "input_ids" and "attention_mask" entries of the encoding are
    returned; non-tensor entries are passed through unchanged.
    """
    encoded_batch = tokenizer.batch_encode_plus(prompts, return_tensors="pt", padding=True)
    wanted_keys = ["input_ids", "attention_mask"]
    model_inputs = {name: encoded_batch[name] for name in encoded_batch if name in wanted_keys}
    for name, entry in list(model_inputs.items()):
        if not torch.is_tensor(entry):
            continue
        model_inputs[name] = entry.to('cuda')
    return model_inputs

def load(ckpt_dir, model_type):
    """Build the model and tokenizer for the requested model family.

    Args:
        ckpt_dir: checkpoint directory or model id handed to the
            `from_pretrained` / checkpoint-loading calls. The 'guanaco'
            branch ignores it and uses hard-coded names.
        model_type: one of 'llama', 'flan', 'falcon', 'moss', 'guanaco',
            'vicuna' or 'starcoder'; any other value takes the generic
            `AutoModelForCausalLM` branch.

    Returns:
        `(model, tokenizer)`, with the model switched to eval mode.
    """
    def _default_pad_and_bos(tokenizer):
        # Keep an existing pad id, otherwise fall back to 0; bos id is forced to 1.
        tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
        tokenizer.bos_token_id = 1

    n_gpus = torch.cuda.device_count()

    if model_type == 'llama':
        # we use tensor parallel for loading llama
        tokenizer = LlamaTokenizer.from_pretrained(ckpt_dir, use_fast=False, padding_side="left")

        model = LlamaForCausalLM.from_pretrained(ckpt_dir, low_cpu_mem_usage = True, torch_dtype=torch.float16)
        model = tp.tensor_parallel(model, list(range(n_gpus)))

        _default_pad_and_bos(tokenizer)
    elif model_type == 'flan':
        # flan is a seq2seq model, also sharded with tensor parallel
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=False, padding_side="left")

        # Referenced through the `transformers` module, which the file imports.
        model = transformers.AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir, low_cpu_mem_usage = True, torch_dtype=torch.float16)
        model = tp.tensor_parallel(model, list(range(n_gpus)))

        _default_pad_and_bos(tokenizer)
    elif model_type == 'falcon':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map = 'balanced_low_0', torch_dtype=torch.bfloat16, trust_remote_code=True)

        _default_pad_and_bos(tokenizer)
    elif model_type == 'moss':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        config = transformers.AutoConfig.from_pretrained(ckpt_dir, trust_remote_code=True)
        # Instantiate without allocating weights, then stream the checkpoint in.
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16, trust_remote_code=True)
        model.tie_weights()
        # The checkpoint lives in `ckpt_dir`, the same location the tokenizer
        # and config are read from (the previous `model_path` was undefined).
        model = load_checkpoint_and_dispatch(model, ckpt_dir, device_map="auto", no_split_module_classes=["MossBlock"], dtype=torch.float16)

        _default_pad_and_bos(tokenizer)
    elif model_type == 'guanaco':
        # Base model and adapter names are hard-coded; `ckpt_dir` is not used.
        model_name = "llama-65b"
        adapters_name = 'guanaco-65b'

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_4bit=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            max_memory={i: '24000MB' for i in range(n_gpus)},
            quantization_config=transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
            ),
        )
        # NOTE(review): PeftModel is not among the imports visible at the top
        # of the file - confirm it is imported before this branch is used.
        model = PeftModel.from_pretrained(model, adapters_name)
        # This branch leaves the tokenizer's pad/bos ids untouched.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    elif model_type == 'vicuna':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map = 'balanced_low_0', revision="main", trust_remote_code=False)

        _default_pad_and_bos(tokenizer)
    elif model_type == 'starcoder':
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir,device_map = 'balanced_low_0', trust_remote_code=True)

        _default_pad_and_bos(tokenizer)
    else:
        # mpt-30b's tokenizer only has the fast version
        use_fast = "mosaicml/mpt-30b" in ckpt_dir
        # per the original note, tensor parallel causes bugs when running
        # falcon, so device_map placement is used here instead
        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast = use_fast, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map = 'balanced_low_0', torch_dtype=torch.bfloat16, trust_remote_code=True)
        # Prefer eos as the pad token; fall back to id 0 when there is no eos.
        if tokenizer.pad_token_id is None:
            if tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.pad_token_id = 0

    model.eval()

    return model, tokenizer

def batch_split(prompts, batch_num):
    """Group `prompts` into consecutive lists of `batch_num` items.

    The final list holds whatever is left over and may be shorter. A batch
    is closed only when its length equals `batch_num` exactly.
    """
    batches = []
    current = []
    for item in prompts:
        current.append(item)
        if len(current) != batch_num:
            continue
        batches.append(current)
        current = []
    if current:
        batches.append(current)
    return batches

def batch_infer(model, tokenizer, prompts, batch_size=1):
    """Generate one token per prompt and return the last decoded character.

    Args:
        model: model exposing `generate`.
        tokenizer: tokenizer exposing `batch_decode` and `pad_token_id`.
        prompts: list of prompt strings.
        batch_size: number of prompts encoded and generated together;
            defaults to 1, the value that was previously hard-coded.

    Returns:
        One string per prompt: the last character of the decoded sequence
        (special tokens skipped), or '' if the decoded text is empty.
    """
    answers = []
    for batch_input in tqdm(batch_split(prompts, batch_size)):
        encode_inputs = prepare_input(tokenizer, batch_input)
        outputs = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)
        decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # `[-1:]` keeps the last character but yields '' for empty text
        # instead of raising IndexError.
        answers.extend(text[-1:] for text in decoded_batch)
    return answers

