## 生成评估指标excel

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm.notebook import tqdm
import math
from collections import Counter
import re
import spacy

from transformers import AutoTokenizer, AutoModel
import os
from sentence_transformers.util import cos_sim

os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

import sys
sys.path.append("/home/liujunhui/workspace/proj/semantic_uncertainty/semantic_uncertainty")

import argparse
from uncertainty.utils import utils
from uncertainty.uncertainty_measures.semantic_entropy import predictive_entropy

In [2]:
# Run settings, written as a dict literal and expanded into an
# argparse.Namespace so they are available as attributes (args.dataset, ...).
# No command line is parsed here; every value is fixed in this cell.
args = argparse.Namespace(**{
    # Dataset and model selection.
    "dataset": "trivia_qa",
    "model_name": "Llama-2-7b-chat-8bit",
    "model_max_new_tokens": 512,
    # General run / bookkeeping options.
    "debug": False,
    "entity": None,
    "random_seed": 10,
    "metric": 'squad',
    "compute_accuracy_at_all_temps": True,
    "experiment_lot": 'Unnamed Experiment',
    "recompute_accuracy": False,
    "train_wandb_runid": None,
    "num_eval_samples": 10000000000000000000,
    # Which uncertainty-related quantities are switched on.
    "compute_predictive_entropy": True,
    "compute_p_ik": True,
    "compute_p_ik_answerable": False,
    "compute_context_entails_response": False,
    "analyze_run": True,
    "assign_new_wandb_id": False,
    "restore_entity_eval": None,
    "restore_entity_train": None,
    # Entailment-related options.
    "condition_on_question": True,
    "strict_entailment": True,
    "use_all_generations": True,
    "use_num_generations": -1,
    "entailment_model": 'deberta',
    "entailment_cache_id": None,
    "entailment_cache_only": False,
    "compute_p_true_in_compute_stage": False,
    "reuse_entailment_model": False,
    "use_mc_options": False,
})

In [3]:
# The first generation always uses a temperature of `0.1`.
temperature = 0.1
# Build the model object from the run settings held in `args`.
model = utils.init_model(args)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Shannon entropy
def calculate_entropy(sentence):
    """Return the Shannon entropy, in bits, of the token distribution of `sentence`.

    The text is lower-cased and split into tokens: each run of word characters
    is one token and each single whitespace character is a token of its own.
    Characters matched by neither alternative (e.g. punctuation) are ignored.
    A text that produces no tokens yields 0.0.
    """
    tokens = re.findall(r'\w+|\s|\n|\t', sentence.lower())
    token_total = len(tokens)

    bits = 0.0
    # Accumulate -p * log2(p) over the distinct tokens, in first-seen order.
    for occurrences in Counter(tokens).values():
        probability = occurrences / token_total
        bits = bits - probability * math.log2(probability)

    return bits

# Perplexity
def calculate_perplexity(sentence):
    """Return 2 raised to the Shannon entropy of `sentence` (see calculate_entropy)."""
    return 2 ** calculate_entropy(sentence)

# Local checkpoint directories; each one is named once and shared by the
# tokenizer/model pair loaded from it.
_FLUENCY_CHECKPOINT = "/home/liujunhui/workspace/models/parrot_fluency_model"
_TOXICITY_CHECKPOINT = "/home/liujunhui/workspace/models/roberta_toxicity_classifier"
_SIMILARITY_CHECKPOINT = '/home/css/models/NV-Embed-v2'

# Fluency classifier.
tokenizer_fluency = AutoTokenizer.from_pretrained(_FLUENCY_CHECKPOINT)
model_fluency = AutoModelForSequenceClassification.from_pretrained(_FLUENCY_CHECKPOINT)

# Toxicity classifier.
tokenizer_toxicity = AutoTokenizer.from_pretrained(_TOXICITY_CHECKPOINT)
model_toxicity = AutoModelForSequenceClassification.from_pretrained(_TOXICITY_CHECKPOINT)

# Embedding model used for semantic similarity. It is loaded with
# trust_remote_code=True and device_map='auto'.
model_semantic_similarity = AutoModel.from_pretrained(
    _SIMILARITY_CHECKPOINT,
    trust_remote_code=True,
    device_map='auto',
)

def analyze_fluency(sentence):
    """Score `sentence` with the fluency classifier.

    Returns, as a Python float, the softmax probability of class index 1 for
    the first row of the model output.
    NOTE(review): index 1 is presumably the "fluent" label; confirm against
    the checkpoint's label mapping.
    """
    encoded = tokenizer_fluency(sentence, return_tensors="pt", truncation=True, padding=True)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model_fluency(**encoded).logits

    class_probabilities = torch.softmax(logits, dim=1)
    return class_probabilities[0][1].item()

def analyze_toxicity(sentence):
    """Score `sentence` with the toxicity classifier.

    Returns, as a Python float, the softmax probability of class index 1 for
    the first row of the model output.
    NOTE(review): index 1 is presumably the "toxic" label; confirm against
    the checkpoint's label mapping.
    """
    model_inputs = tokenizer_toxicity(sentence, return_tensors="pt", truncation=True, padding=True)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        prediction = model_toxicity(**model_inputs)

    first_row = torch.softmax(prediction.logits, dim=1)[0]
    return first_row[1].item()


def calculate_semantic_similarity(sentence1, sentence2, max_length=32768):
    """Return the cosine similarity of the two sentences' embeddings as a float.

    Both sentences are encoded in a single call with an empty instruction
    string; `max_length` is forwarded unchanged to the encoder.
    """
    sentence_vectors = model_semantic_similarity.encode(
        [sentence1, sentence2],
        instruction="",
        max_length=max_length,
    )
    return cos_sim(sentence_vectors[0], sentence_vectors[1]).item()

Some weights of the model checkpoint at /home/liujunhui/workspace/models/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [5]:
# Paths of the original JFLEG text files.
file_path_1 = "/home/liujunhui/workspace/proj/WSC/dataset/jfleg/jfleg/sources.txt"
file_path_2 = "/home/liujunhui/workspace/proj/WSC/dataset/jfleg/jfleg/corrections.txt"
# Paths of the files under jfleg_corrected (same file names as above).
file_path_3 = "/home/liujunhui/workspace/proj/WSC/dataset/jfleg/jfleg_corrected/sources.txt"
file_path_4 = "/home/liujunhui/workspace/proj/WSC/dataset/jfleg/jfleg_corrected/corrections.txt"


def _read_stripped_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    # An explicit encoding keeps the result independent of the machine's locale.
    with open(path, "r", encoding="utf-8") as handle:
        return [line.strip() for line in handle]


# Each list is sources.txt followed by corrections.txt, in that order.
jfleg_lst = _read_stripped_lines(file_path_1) + _read_stripped_lines(file_path_2)
jfleg_lst_wsc = _read_stripped_lines(file_path_3) + _read_stripped_lines(file_path_4)

# The two lists are paired by position later in the notebook, so fail here,
# before any expensive scoring starts, if they do not line up.
assert len(jfleg_lst) == len(jfleg_lst_wsc), (
    f"line count mismatch: {len(jfleg_lst)} original vs "
    f"{len(jfleg_lst_wsc)} corrected"
)

In [6]:
# Show the size of both text lists, one number per line.
print(len(jfleg_lst), len(jfleg_lst_wsc), sep="\n")

3202
3202


In [7]:
# One accumulator per metric; entry k belongs to the k-th text pair.
flu_lst, tox_lst, entropy_lst = [], [], []
ppl_lst, ss_lst, semantic_entropy_lst = [], [], []

for i, ori_text in enumerate(tqdm(jfleg_lst)):
    # The texts are paired by position across the two lists.
    cor_text = jfleg_lst_wsc[i]

    # Every difference below is "corrected minus original".
    flu_lst.append(analyze_fluency(cor_text) - analyze_fluency(ori_text))
    tox_lst.append(analyze_toxicity(cor_text) - analyze_toxicity(ori_text))
    entropy_lst.append(calculate_entropy(cor_text) - calculate_entropy(ori_text))
    ppl_lst.append(calculate_perplexity(cor_text) - calculate_perplexity(ori_text))

    # Similarity is computed between the two texts rather than as a difference.
    ss_lst.append(calculate_semantic_similarity(ori_text, cor_text))

    # NOTE(review): the 'semantic entropy' column is filled from
    # predictive_entropy() applied to the model's token log-likelihoods.
    semantic_entropy_lst.append(
        predictive_entropy(model.get_token_log_likelihoods(cor_text))
        - predictive_entropy(model.get_token_log_likelihoods(ori_text))
    )

# Column name -> values: the two text lists plus the six metric lists.
data = dict([
    ('ori_text', jfleg_lst),
    ('cor_text', jfleg_lst_wsc),
    ('fluency', flu_lst),
    ('toxicity', tox_lst),
    ('Shannon entropy', entropy_lst),
    ('perplexity', ppl_lst),
    ('semantic entropy', semantic_entropy_lst),
    ('semantic similarity', ss_lst),
])

# Turn the column mapping into a DataFrame.
df = pd.DataFrame(data)

  0%|          | 0/3202 [00:00<?, ?it/s]

  'input_ids': torch.tensor(batch_dict.get('input_ids').to(batch_dict.get('input_ids')).long()),
  self.gen = func(*args, **kwds)
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [None]:
# Preview the first five rows of the metrics table.
df.head(n=5)

Unnamed: 0,ori_text,cor_text,fluency,toxicity,shannon_entropy,perplexity,semantic_entropy,semantic_similarity
0,So I think we would not be live if our ancesto...,So I think we would not belive if our ancestor...,0.302740,5.091497e-07,0.014341,0.079090,0.540797,0.858357
1,Imagine yourself you are working in factory ju...,Imagine yourself you are working in factory ju...,-0.004786,-4.645365e-04,0.062624,0.463773,0.044193,0.980495
2,"For example , they can play football whenever ...","For example, they can play football whenever t...",0.002559,-8.590570e-06,0.202850,0.941190,0.668214,0.877560
3,While It is true that consumers preffer to buy...,While It is true that consumers preffer to buy...,0.000000,4.936774e-07,0.083739,0.691510,0.138582,0.980103
4,And young people spend more time on ther lifes...,And young people spend more time on ther lifes...,0.000000,4.362009e-07,0.090736,0.389478,0.167218,0.959664
...,...,...,...,...,...,...,...,...
3197,"The person takes the bike , goes where he wish...","The person takes the bike, goes where he wishe...",0.000000,2.817902e-06,0.108515,0.582923,0.483987,0.925970
3198,And I am going to another country .,And I am going to another country.,0.000000,-2.797158e-05,0.103703,0.394363,0.807210,0.978307
3199,The youth today are aware of their responsibil...,The youth today are aware of their responsibil...,0.000000,-3.850218e-07,0.085420,0.385776,0.110892,0.966446
3200,But I disagree with this opinion because often...,But I disagree with this opinion because often...,0.247741,8.938405e-07,0.017884,0.131764,0.364977,0.957427


In [9]:
# Write the DataFrame to an Excel workbook, leaving out the row index.
excel_file_path = 'jfleg_data_all.xlsx'  # output Excel file path
df.to_excel(excel_writer=excel_file_path, index=False)

In [None]:
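# Disabled one-off snippet: reload the saved workbook, rename the snake_case
# metric columns to the spaced column names, and write the workbook back.
# df = pd.read_excel("jfleg_data_all.xlsx")
# df = df.rename(columns={'shannon_entropy': 'Shannon entropy','semantic_similarity':'semantic similarity','semantic_entropy':'semantic entropy'})
# excel_file_path = 'jfleg_data_all.xlsx'  # output Excel file path
# df.to_excel(excel_file_path, index=False)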