In [1]:
import logging
from tqdm import tqdm
from utils.process_data import get_model_generate
from data_loader.base_loader import BaseLoader
# from data_processor.base_processor import BaseProcessor
from data_loader.cot_loader import CotLoader
from utils.load_config import load_config
import argparse
from utils.load_model import load_model_tokenizer
import data_loader
# import data_processor
from utils.meter import AverageMeter
from utils.process_data import *
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# total_entropy_token = AverageMeter()
def process_data_token_level(model_generate):
    """Return the mean token-level entropy of one generation result.

    Args:
        model_generate: Mapping with an ``'entropy'`` entry that supports
            two-dimensional slicing, ``.mean()`` and ``.item()`` (for example
            a 2-D torch tensor).

    Returns:
        The mean of every entry of ``model_generate['entropy']`` outside
        column 0, converted to a plain Python number with ``.item()``.
    """
    entropy = model_generate['entropy']
    # Column 0 is excluded from the average
    # (presumably the leading/BOS position -- TODO confirm with the producer).
    entropy_without_first_column = entropy[:, 1:]
    return entropy_without_first_column.mean().item()

In [3]:
# total_entropy = AverageMeter()
def process_data_sen_level(model, tokenizer, data, model_generate, split_words=None):
    """Compute the mean sentence-level attention entropy for one sample.

    Steps, in order: obtain an encoder via ``get_encoder_k(model, -1)``, split
    the input into sentences, split the attention matrix by sentence, build
    weighted hidden states, compute an attention matrix over those hidden
    states, then take the entropy of that matrix and average it over dim 0.

    Args:
        model: Model handed to ``get_encoder_k`` and ``split_attn_matrix``.
        tokenizer: Tokenizer forwarded to ``split_sentence``.
        data: The sample, forwarded to ``split_sentence`` as ``question``.
        model_generate: Mapping with ``'generate'`` and ``'input_ids'`` entries.
        split_words: Optional value forwarded to ``split_sentence``. When it is
            falsy the argument is omitted so ``split_sentence`` uses its own
            default.

    Returns:
        The entropy averaged over dim 0 and squeezed, converted with
        ``.tolist()``.
    """
    generated = model_generate['generate']
    prompt_ids = model_generate['input_ids']
    # -1 presumably selects the last encoder layer -- TODO confirm in get_encoder_k.
    encoder = get_encoder_k(model, -1)

    # Split the input. split_words is only passed through when it is truthy.
    split_kwargs = {'tokenizer': tokenizer, 'question': data, 'input_ids': prompt_ids}
    if split_words:
        split_kwargs['split_words'] = split_words
    sentence_tokens = split_sentence(**split_kwargs)

    # Split the attention matrix by sentence: `weights` holds the weights and
    # `token_ids` the token indices those weights belong to.
    weights, token_ids = split_attn_matrix(model, generated, sentence_tokens, soft_max=True)

    # Weighted combination of the embeddings gives the hidden states.
    hidden_states = weighted_hidden_states(weights, token_ids, generated)

    # Compute the attention matrix over the hidden states.
    raw_attention = get_attention_matrix(encoder, hidden_states)
    attn_matrix = raw_attention.squeeze(0).to(torch.float32)

    # Compute the entropy and average it over dim 0.
    with torch.no_grad():
        sentence_entropy = get_attention_entropy(attn_matrix.cpu())
        mean_sentence_entropy = sentence_entropy.mean(dim=0).squeeze()
    return mean_sentence_entropy.tolist()

In [4]:
class Pipeline:
    """Measure token-level entropy of one model over a list of data loaders.

    For every loader, samples whose tokenised length lies within
    ``[min_input_token, max_input_token]`` are passed to ``get_model_generate``
    and the mean token-level entropy of each is printed. At most ``max_sample``
    samples are processed per loader.
    """

    def __init__(self, model, tokenizer, model_config, data_loaders:list[BaseLoader],
                 min_input_token=100, max_input_token=2000, max_sample=50):
        """Store the model, tokenizer, config, loaders and sampling limits.

        Args:
            model: Model forwarded to ``get_model_generate``.
            tokenizer: Tokenizer used to count input tokens and forwarded to
                ``get_model_generate``.
            model_config: Configuration entry the model was loaded from
                (stored, not read by ``run``).
            data_loaders: Loaders exposing ``name``, ``load_data()`` and
                ``split_words()``.
            min_input_token: Samples with fewer tokens are skipped.
            max_input_token: Samples with more tokens are skipped.
            max_sample: Maximum number of samples processed per loader.
        """
        logging.info("Init Pipeline")
        self.model = model
        self.tokenizer = tokenizer
        self.model_config = model_config
        self.data_loaders = data_loaders
        self.min_input_token = min_input_token
        self.max_input_token = max_input_token
        self.max_sample = max_sample

    def run(self):
        """Process every loader and print one entropy value per accepted sample."""
        logging.info("Pipeline start")
        # `loader` (not `data_loader`) so the imported `data_loader` module
        # name is not shadowed inside this method.
        for loader in self.data_loaders:
            logging.info(f"Data loader {loader.name}")
            load_data = loader.load_data()
            split_words = loader.split_words()
            # Number of samples actually processed for this loader.
            index = 0
            for data in load_data:
                # Check the cap BEFORE processing so that exactly `max_sample`
                # samples are handled (checking `index > max_sample` after the
                # work processed one sample too many).
                if index >= self.max_sample:
                    break

                # Tokenise only to measure the input length for filtering.
                inputs = self.tokenizer(data, padding=False, return_tensors='pt')
                num_input_token = inputs['input_ids'].shape[1]
                if num_input_token < self.min_input_token or num_input_token > self.max_input_token:
                    logging.info(f"num_input_token {num_input_token} less than min_input_token {self.min_input_token} or greater than max_input_token {self.max_input_token}")
                    continue

                # NOTE(review): get_model_generate receives max_input_token=400
                # while the filter above admits inputs of up to
                # self.max_input_token tokens -- confirm this is intended.
                model_generate = get_model_generate(self.tokenizer,self.model,data,max_new_tokens=1,max_input_token=400,split_words=split_words)
                index += 1

                total_entropy = process_data_token_level(model_generate)
                print(f"{index} total_entropy:",total_entropy)


In [11]:
# Run options. parse_known_args() is used instead of parse_args() so that
# unrecognised arguments (such as those present in sys.argv inside a notebook
# kernel) are ignored rather than aborting.
parser = argparse.ArgumentParser()
parser.add_argument("--cfg", default="./config/qwen.yaml", help="config file path")
parser.add_argument("--start", default=2, type=int, help="index of the first model config to run")
parser.add_argument("--num_models", default=2, type=int, help="number of consecutive model configs to run")
parser.add_argument("--model_cfg", default="./config/models_pz.yaml", help="model config file path")
args = parser.parse_known_args()[0]

# Logging is left unconfigured here; uncomment to see the logging.info output.
# log_f = '%(asctime)s | %(filename)s[line:%(lineno)d] | %(levelname)s | %(message)s'
# logging.basicConfig(level="INFO", format=log_f)

# Load the experiment config and the per-family model path config.
config = load_config(args.cfg)
model_cfg = load_config(args.model_cfg)

# Flatten the model configs of every selected family into one list,
# in the order the families are listed.
model_familys = config['model_familys']
model_configs = []
for family in model_familys:
    model_configs.extend(model_cfg[f"paths_{family}"])

# Run the pipeline for `num_models` consecutive model configs from `start`.
for model_config in model_configs[args.start:args.start + args.num_models]:
    model, tokenizer = load_model_tokenizer(model_config=model_config)

    # Only the LAST loader named in config['data_loaders'] is instantiated;
    # it is looked up by class name on the `data_loader` package.
    loader_name = config['data_loaders'][-1]
    data_loaders = [getattr(data_loader, loader_name)()]

    pipeline = Pipeline(model, tokenizer, model_config, data_loaders)
    pipeline.run()



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1 total_entropy: 0.3948315382003784
2 total_entropy: 0.3926246166229248
3 total_entropy: 0.3896627724170685
4 total_entropy: 0.3960888087749481
5 total_entropy: 0.39916566014289856
6 total_entropy: 0.40624886751174927
7 total_entropy: 0.3913097679615021
8 total_entropy: 0.37745583057403564
9 total_entropy: 0.3804439306259155
10 total_entropy: 0.391831636428833
11 total_entropy: 0.38374462723731995
12 total_entropy: 0.3948315382003784
13 total_entropy: 0.37445247173309326
14 total_entropy: 0.3627113103866577
15 total_entropy: 0.3896627724170685
16 total_entropy: 0.3721194267272949


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1 total_entropy: 0.4672941565513611
2 total_entropy: 0.4699353277683258
3 total_entropy: 0.46484386920928955
4 total_entropy: 0.4744057357311249
5 total_entropy: 0.47569918632507324
6 total_entropy: 0.47659623622894287
7 total_entropy: 0.4679994583129883
8 total_entropy: 0.44949495792388916
9 total_entropy: 0.45339128375053406
10 total_entropy: 0.467337965965271
11 total_entropy: 0.45842763781547546
12 total_entropy: 0.4672941565513611
13 total_entropy: 0.4467427134513855
14 total_entropy: 0.4470313787460327
15 total_entropy: 0.46484386920928955
16 total_entropy: 0.4554743766784668
