In [4]:
import os
import gc
os.environ['HF_ENDPOINT']='https://hf-mirror.com'
import torch
import wandb
import argparse
import numpy as np
from utils import *
from tqdm import tqdm 
from datasets import load_dataset
from scipy.special import softmax
from torch.utils.data import DataLoader

def parse_args():
    parser = get_parser()
    parser.add_argument("--f")
    parser.add_argument("--generate_method", default=False, type = bool,help="True use model.generate(), otherwise use model.__call__()")
    parser.add_argument("--gammas", default=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], type=list,help="emergency index gamma")
    parser.add_argument("--log_image_interval", default=10, type=int, help="Step interval to log Image")
    parser.add_argument("--entropy_normalize",default=True, type=bool, help="Entropy compution need to divide log(k)")
    
    args = parser.parse_args()
    return args

args = parse_args()


In [22]:
args.model_name = "llama_2"
args.model_type = "13b"
models_cfg = load_config(args.models_cfg)

if args.lora: # Load finetune model by LORA
    model_cfg = models_cfg[args.model_name][args.model_type]
    model, tokenizer = load_lora_model_tokenizer(model_cfg[0],args.lora_model_dir,args.lora_model_name)
else: # Load original model
    model, tokenizer = load_model_tokenizer(models_cfg[args.model_name][args.model_type])

Loading checkpoint shards: 100%|██████████| 3/3 [02:09<00:00, 43.28s/it]


In [12]:
prompt = "Answer the following question as briefly as possible.\n"
dataset = [{"input_tokens":prompt + "9.11 and 9.9, which one is bigger?"},
            {"input_tokens":prompt + "which one is bigger? 9.11 and 9.9"}]
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=2)

In [23]:
step = 0
with torch.no_grad():
    for batch_idx, batch_data in enumerate(dataloader):
        gc.collect()
        torch.cuda.empty_cache()
        # Flitering out some larger data due to CUDA memeory
        num_input_tokens = get_num_input_tokens(tokenizer=tokenizer,input_tokens=batch_data["input_tokens"])
        if num_input_tokens > args.max_num_input_tokens:
            continue
        else:
            print(f"{args.model_name}_{args.model_type}_{args.lora_model_name}_{args.dataset}:[{step}/{args.dataset_size}]")
        if step >= args.dataset_size:
            break
        for input_type in ["input_tokens"]:
            # Model output
            model_output = generate_model_output(model=model,tokenizer=tokenizer,
                                        input_tokens=batch_data[input_type],
                                        generate_method=args.generate_method) # dict = ["input_ids","attentions","hidden_states", "logits"]
            
            # Model logits
            logits = model_output["logits"] # shape = (bs, num_tokens, vocab_size)
            
            # Predict probabilities
            pred_probs = softmax(logits,axis=-1) # shape = (bs, num_tokens, vocab_size)
            del logits
            
            # Naive entropy
            naive_entropys = calculate_naive_entropy(pred_probs,normalize=args.entropy_normalize) # shape = (num_tokens) value belong to [0,1]
            del pred_probs
            
            # Model generate output
            gen_config = GenerationConfig(do_sample=False, 
                                    num_beams=1,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.eos_token_id,
                                    max_new_tokens=10, 
                                    return_dict_in_generate=True,
                                    output_attentions=True, 
                                    output_hidden_states=True,
                                    output_logits=True)
            inputs = tokenizer(batch_data[input_type], padding=False, return_tensors='pt')
            input_ids = inputs['input_ids'].cuda()
            attention_mask = inputs['attention_mask'].cuda()
            output = model.generate(input_ids, attention_mask=attention_mask, generation_config=gen_config)
            sequences = output["sequences"]
            output_text = tokenizer.decode(sequences[0])
            print(f"input: {batch_data[input_type][0]}") 
            print(f"output: {output_text}")
            
            print(f"naive_entropy: {np.mean(naive_entropys)}")
            
        step += 1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


llama_2_13b__Xsum:[0/200]
input: Answer the following question as briefly as possible.
9.11 and 9.9, which one is bigger?
output: <s> Answer the following question as briefly as possible.
9.11 and 9.9, which one is bigger?
9.11 is bigger than 9
naive_entropy: 0.29602377402247737
llama_2_13b__Xsum:[1/200]
input: Answer the following question as briefly as possible.
which one is bigger? 9.11 and 9.9
output: <s> Answer the following question as briefly as possible.
which one is bigger? 9.11 and 9.99999999999
naive_entropy: 0.28383490625174695
