In [1]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm.notebook import tqdm
import math
from collections import Counter
import re
import spacy

from transformers import AutoTokenizer, AutoModel
import os
from sentence_transformers.util import cos_sim

# Set the HTTP/HTTPS proxy environment variables for this kernel process so that
# libraries honouring them send their traffic through the local proxy.
# NOTE(review): assumes a proxy is listening on 127.0.0.1:7890 — confirm, or
# drop these two lines when running on another machine.
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

import sys
# Add the local semantic_uncertainty checkout to the module search path so the
# `uncertainty.*` imports that follow can be resolved.
# NOTE(review): machine-specific absolute path; adjust per environment.
sys.path.append("/home/liujunhui/workspace/proj/semantic_uncertainty/semantic_uncertainty")

import argparse
from uncertainty.utils import utils
from uncertainty.uncertainty_measures.semantic_entropy import predictive_entropy

In [2]:
# Define the run parameters.
# The settings are collected in a plain dict (insertion order is kept) and then
# expanded into an argparse.Namespace, which is what `utils.init_model` is
# handed later in this notebook.
_run_settings = {
    # Dataset / model selection
    "dataset": "trivia_qa",
    "model_name": "Llama-2-7b-chat-8bit",
    "model_max_new_tokens": 512,
    # General run options
    "debug": False,
    "entity": None,
    "random_seed": 10,
    "metric": 'squad',
    "compute_accuracy_at_all_temps": True,
    "experiment_lot": 'Unnamed Experiment',
    "recompute_accuracy": False,
    "train_wandb_runid": None,
    "num_eval_samples": 10000000000000000000,
    # Which uncertainty quantities to compute
    "compute_predictive_entropy": True,
    "compute_p_ik": True,
    "compute_p_ik_answerable": False,
    "compute_context_entails_response": False,
    "analyze_run": True,
    "assign_new_wandb_id": False,
    "restore_entity_eval": None,
    "restore_entity_train": None,
    # Entailment / generation handling
    "condition_on_question": True,
    "strict_entailment": True,
    "use_all_generations": True,
    "use_num_generations": -1,
    "entailment_model": 'deberta',
    "entailment_cache_id": None,
    "entailment_cache_only": False,
    "compute_p_true_in_compute_stage": False,
    "reuse_entailment_model": False,
    "use_mc_options": False,
}
args = argparse.Namespace(**_run_settings)

In [3]:
# Initialize model.
# The model object is built by the project helper `utils.init_model` from the
# run settings held in `args`.
model = utils.init_model(args)
# Temperature for first generation is always `0.1`.
# NOTE(review): `temperature` is not referenced again in the visible part of
# this notebook — confirm whether it is still needed.
temperature = 0.1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Shannon entropy
def calculate_entropy(sentence):
    """Return the Shannon entropy (in bits) of the token distribution of ``sentence``.

    The sentence is lower-cased and split with a regular expression that yields
    runs of word characters and individual whitespace characters, so every
    space, tab and newline counts as a token of its own. Characters matching
    neither (e.g. punctuation) are dropped.

    Args:
        sentence: Text to analyse.

    Returns:
        The entropy as a float; ``0.0`` when the sentence yields no tokens.
    """
    # Tokenise: word runs plus single whitespace characters.
    tokens = re.findall(r'\w+|\s|\n|\t', sentence.lower())
    n_tokens = len(tokens)

    # Accumulate -p * log2(p) over the relative frequency of each distinct token.
    bits = 0.0
    for occurrences in Counter(tokens).values():
        prob = occurrences / n_tokens
        bits -= prob * math.log2(prob)
    return bits

# Perplexity
def calculate_perplexity(sentence):
    """Return ``2 ** H`` where ``H`` is ``calculate_entropy(sentence)``.

    This is the perplexity of the sentence's own token-frequency distribution
    as computed by ``calculate_entropy``; no language model is involved.

    Args:
        sentence: Text to analyse.

    Returns:
        Two raised to the sentence's Shannon entropy.
    """
    return 2 ** calculate_entropy(sentence)

# Local checkpoint directories for the three scoring models.
# NOTE(review): machine-specific absolute paths; adjust per environment.
FLUENCY_MODEL_DIR = "/home/liujunhui/workspace/models/parrot_fluency_model"
TOXICITY_MODEL_DIR = "/home/liujunhui/workspace/models/roberta_toxicity_classifier"
EMBEDDING_MODEL_DIR = '/home/css/models/NV-Embed-v2'

# fluency: tokenizer + sequence classifier loaded from the same directory
tokenizer_fluency = AutoTokenizer.from_pretrained(FLUENCY_MODEL_DIR)
model_fluency = AutoModelForSequenceClassification.from_pretrained(FLUENCY_MODEL_DIR)

# toxicity: tokenizer + sequence classifier loaded from the same directory
tokenizer_toxicity = AutoTokenizer.from_pretrained(TOXICITY_MODEL_DIR)
model_toxicity = AutoModelForSequenceClassification.from_pretrained(TOXICITY_MODEL_DIR)

# semantic_similarity: embedding model used by calculate_semantic_similarity.
# NOTE(review): trust_remote_code=True lets the checkpoint's own modelling code
# run — only use it with checkpoints you trust.
model_semantic_similarity = AutoModel.from_pretrained(
    EMBEDDING_MODEL_DIR,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype='bfloat16',
)

def _positive_class_probability(tokenizer, classifier, sentence):
    """Return the softmax probability of class index 1 for ``sentence``.

    Shared implementation behind ``analyze_fluency`` and ``analyze_toxicity``,
    which differ only in the tokenizer/classifier pair they use.

    Args:
        tokenizer: Tokenizer matching ``classifier``.
        classifier: Sequence-classification model whose output exposes ``.logits``.
        sentence: Text to score.

    Returns:
        Probability of class index 1 for the first (only) input, as a float.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    # Keep the inputs on the classifier's device so scoring still works if the
    # classifier is moved to a GPU; this is a no-op when both are on the CPU.
    inputs = inputs.to(classifier.device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = classifier(**inputs).logits

    # Row 0 is the single input sentence; column 1 is the class being reported.
    return torch.softmax(logits, dim=1)[0][1].item()


def analyze_fluency(sentence):
    """Return the fluency score of ``sentence``.

    The score is the class-index-1 softmax probability from ``model_fluency``.
    NOTE(review): assumes index 1 is the "fluent" label — confirm against the
    model card.
    """
    return _positive_class_probability(tokenizer_fluency, model_fluency, sentence)


def analyze_toxicity(sentence):
    """Return the toxicity score of ``sentence``.

    The score is the class-index-1 softmax probability from ``model_toxicity``.
    NOTE(review): assumes index 1 is the "toxic" label — confirm against the
    model card.
    """
    return _positive_class_probability(tokenizer_toxicity, model_toxicity, sentence)

def calculate_semantic_similarity(sentence1, sentence2, max_length=32768):
    """Return the cosine similarity between the embeddings of two sentences.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        max_length: Forwarded to ``model_semantic_similarity.encode`` as its
            ``max_length`` argument.

    Returns:
        The similarity as a Python number.
    """
    # Embed both sentences in a single encode call, with an empty instruction.
    pair_embeddings = model_semantic_similarity.encode(
        [sentence1, sentence2],
        instruction="",
        max_length=max_length,
    )
    first_vec = pair_embeddings[0]
    second_vec = pair_embeddings[1]

    # Cosine similarity of the pair, unwrapped to a plain Python value.
    return cos_sim(first_vec, second_vec).item()

Some weights of the model checkpoint at /home/liujunhui/workspace/models/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [5]:
# Read the JFLEG JSON file and pull out the two sentence lists that the
# evaluation loop pairs up by position.
# NOTE(review): 'wsc' presumably holds the corrected sentences — confirm
# against the code that produced dataset/jfleg.json.
with open("dataset/jfleg.json", "r", encoding="utf-8") as jfleg_file:
    data = json.load(jfleg_file)

lst_original, lst_wsc = data['original'], data['wsc']

## Generate the evaluation-metrics table (saved as `SHAP/jfleg_data_all1.xlsx` and `SHAP/jfleg_data_all1.csv`)

In [None]:
# Output locations for the metrics table.
excel_file_path = 'SHAP/jfleg_data_all1.xlsx'  # Excel output path
csv_file_path = 'SHAP/jfleg_data_all1.csv'  # CSV output path

# Create the output directories up front so that a missing folder cannot make
# the save fail only after the long scoring loop has finished.
for out_path in (excel_file_path, csv_file_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# The two sentence lists are paired by position, so they must be equally long.
# Fail fast instead of part-way through (or after) the expensive loop.
if len(lst_original) != len(lst_wsc):
    raise ValueError(
        f"lst_original has {len(lst_original)} sentences but lst_wsc has {len(lst_wsc)}"
    )

flu_lst = []
tox_lst = []
entropy_lst = []
ppl_lst = []
ss_lst = []
semantic_entropy_lst = []

for ori_text, cor_text in tqdm(zip(lst_original, lst_wsc), total=len(lst_original)):

    # Each metric is stored as (corrected - original), except semantic
    # similarity, which is computed directly between the two sentences.
    flu_lst.append(analyze_fluency(cor_text) - analyze_fluency(ori_text))
    tox_lst.append(analyze_toxicity(cor_text) - analyze_toxicity(ori_text))
    entropy_lst.append(calculate_entropy(cor_text) - calculate_entropy(ori_text))
    ppl_lst.append(calculate_perplexity(cor_text) - calculate_perplexity(ori_text))
    ss_lst.append(calculate_semantic_similarity(ori_text, cor_text))
    # Stored under the 'semantic entropy' column: difference of
    # predictive_entropy over the model's token log-likelihoods.
    semantic_entropy_lst.append(predictive_entropy(model.get_token_log_likelihoods(cor_text)) -
                                 predictive_entropy(model.get_token_log_likelihoods(ori_text)))

# Build a dictionary holding the two sentence lists and the six metric lists.
# NOTE(review): this rebinds `data`, replacing the JSON dict loaded earlier in
# the notebook; the name is kept because a later cell displays `data`.
data = {
    'ori_text': lst_original,
    'cor_text': lst_wsc,
    'fluency': flu_lst,
    'toxicity': tox_lst,
    'Shannon entropy': entropy_lst,
    'perplexity': ppl_lst,
    'semantic entropy': semantic_entropy_lst,
    'semantic similarity': ss_lst
}

# Convert the dictionary to a DataFrame.
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file.
df.to_excel(excel_file_path, index=False)

# Save the DataFrame to a CSV file.
df.to_csv(csv_file_path, index=False, encoding='utf-8')

  0%|          | 0/100 [00:00<?, ?it/s]

  'input_ids': torch.tensor(batch_dict.get('input_ids').to(batch_dict.get('input_ids')).long()),
  self.gen = func(*args, **kwds)


In [8]:
# Display the metrics dictionary that the previous cell bound to `data`.
# NOTE(review): this echoes every sentence and score in full; showing a small
# slice (for example the first rows of `df`) would keep the output short.
data

{'ori_text': ['So I think we would not be live if our ancestors did not develop siences and tecnologies .',
  "Imagine yourself you are working in factory just to do one thing like put air a on car if they fire you you will be destroyed , becouse you do n't know more than to put air a in car .",
  'For example , they can play football whenever they want , but the olders can not .',
  'While It is true that consumers preffer to buy products with lower prices , when international companies that are already certified begin to send their products to market , people will preffer to consume those goods because the difference in price will probbably not affect them too much .',
  'And young people spend more time on ther lifestyles .',
  'Students can focus on only a few subjects they are intwerested in and they will become experts in those areas .',
  'He thinks differently than others and he has succeded .',
  'These activities make the community a better place to live and include these val