In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm

from analysis.metrics import get_bleu_n, get_bert_score, get_pairwise_jaccard_novelty,get_rouge_l,calc_rr


## READ DATA

In [2]:
test_df = pd.read_csv('data/counter_narrative/data/test.csv', encoding='utf-8')
reference = test_df['counter_text'].to_list()

In [3]:
results_dir = 'data/counter_narrative/results/gen_results/'

predictions = [
    "ours.json",
    "deepseek-r1-distill-llama-70b_zero.json",
    "llama-3.3-70b-versatile_zero.json",
    "qwen_qwen3-32b_zero.json",
    "deepseek-r1-distill-llama-70b_knowledge.json",
    "llama-3.3-70b-versatile_knowledge.json",
    "qwen_qwen3-32b_knowledge.json",
    "deepseek-r1-distill-llama-70b_few.json",
    "llama-3.3-70b-versatile_few.json",
    "qwen_qwen3-32b_few.json",
    "gpt-4o_few.json",
    "gpt-4o_knowledge.json",
    "gpt-4o_zero.json",
]

## Toxilen Results

In [14]:
with open(os.path.join(results_dir, 'in_context.json'), 'r', encoding='utf-8') as f:
    incontext_results = json.load(f)
    prediction = [item['response']['counter_text'] if item.get('response') else "" for item in incontext_results]
    try:
        bleu = get_bleu_n(references=reference, predictions=prediction)
        bert_score = get_bert_score(predictions=prediction, references=reference, lang="zh")
        pairwise_jaccard_novelty = get_pairwise_jaccard_novelty(reference, prediction)
    except Exception as e:
        print(e)
        
    print(f"In-context Learning - BLEU: {bleu}, BERT Score: {bert_score}, Pairwise Jaccard Novelty: {pairwise_jaccard_novelty}")

calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.53 seconds, 93.52 sentences/sec
In-context Learning - BLEU: {'bleu1': 0.297936377467486, 'bleu2': 0.1253227431962993, 'bleu3': 0.0680635686511153, 'bleu4': 0.03615919028232646}, BERT Score: {'Precision': 0.6810356378555298, 'Recall': 0.6718996167182922, 'F1': 0.6761953830718994}, Pairwise Jaccard Novelty: (0.19470117654801403, 0.805298823451986)


## Compare Data

In [None]:
import re
import os, json, pandas as pd
from tqdm import tqdm

# =========== 工具函数 ===========
_think_pattern = re.compile(r"<think>.*?(</think>|$)", flags=re.IGNORECASE | re.DOTALL)

def strip_think(text: str) -> str:
    """
    删除 <think> … </think>（大小写均可）之间的全部内容与标签本身。
    若无闭合 </think>，则移除至字符串末尾。
    """
    return _think_pattern.sub("", text).strip()

outputs = []

for prediction_file in tqdm(predictions, desc="Processing predictions"):
    data_path = os.path.join(results_dir, prediction_file)

    with open(data_path, "r", encoding="utf-8") as f:
        results = json.load(f)

    # 预处理 response：去掉 <think>…</think>
    prediction = [
        strip_think(item.get("response", "")) for item in results
    ]

    try:
        bleu = get_bleu_n(references=reference, predictions=prediction)
        bert_score = get_bert_score(predictions=prediction,
                                    references=reference, lang="zh")
        pairwise_jaccard_novelty = get_pairwise_jaccard_novelty(reference,
                                                                prediction)
        sentence_length = [len(item) for item in prediction]
        avg_sentence_length = sum(sentence_length) / len(sentence_length) if sentence_length else 0

        rr_score = calc_rr(predictions=prediction, n=2, lang="zh")

    except Exception as e:
        print(f"[MetricError] {prediction_file}: {e}")

    outputs.append({
        "model": prediction_file.rsplit(".", 1)[0],
        "bleu": bleu,
        "bert_score": bert_score,
        "pairwise_jaccard_novelty": pairwise_jaccard_novelty,
        "avg_sentence_length": avg_sentence_length,
        "rr_score": rr_score
    })

# 保存结果
output_df = pd.DataFrame(outputs)
output_df.to_csv("data/counter_narrative/results/metrics_new.csv",
                 index=False, encoding="utf-8")


Processing predictions:   0%|          | 0/13 [00:00<?, ?it/s]

calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:   8%|▊         | 1/13 [00:01<00:13,  1.09s/it]

done in 0.47 seconds, 105.41 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  15%|█▌        | 2/13 [00:02<00:12,  1.14s/it]

done in 0.62 seconds, 81.28 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  23%|██▎       | 3/13 [00:04<00:14,  1.44s/it]

done in 1.23 seconds, 40.56 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  31%|███       | 4/13 [00:05<00:14,  1.61s/it]

done in 1.27 seconds, 39.40 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  38%|███▊      | 5/13 [00:06<00:10,  1.32s/it]

done in 0.31 seconds, 160.12 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  46%|████▌     | 6/13 [00:08<00:10,  1.48s/it]

done in 1.13 seconds, 44.12 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  54%|█████▍    | 7/13 [00:09<00:08,  1.44s/it]

done in 0.81 seconds, 61.84 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  62%|██████▏   | 8/13 [00:10<00:06,  1.25s/it]

done in 0.34 seconds, 149.19 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  69%|██████▉   | 9/13 [00:12<00:05,  1.32s/it]

done in 0.92 seconds, 54.31 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  77%|███████▋  | 10/13 [00:13<00:03,  1.30s/it]

done in 0.71 seconds, 70.34 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  85%|████████▍ | 11/13 [00:14<00:02,  1.20s/it]

done in 0.46 seconds, 109.50 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions:  92%|█████████▏| 12/13 [00:15<00:01,  1.28s/it]

done in 0.93 seconds, 53.94 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing predictions: 100%|██████████| 13/13 [00:17<00:00,  1.33s/it]

done in 0.81 seconds, 61.65 sentences/sec





In [None]:
df = output_df.to_dict(orient='records')
tables = []
for item in df:
    tables.append(
        {"model": item['model'],
         "BLEU 1/2/3/4": f"{item['bleu']['bleu1']:.4f} / {item['bleu']['bleu2']:.4f} / {item['bleu']['bleu3']:.4f} / {item['bleu']['bleu4']:.4f}" if item['bleu'] else "N/A",
         "B-P": item['bert_score']['Precision'],
         "R-R": item['bert_score']['Recall'],
         "B-F": item['bert_score']['F1'],
         "Noelty": item['pairwise_jaccard_novelty'][1],
         "sent_length": item['avg_sentence_length'],
         "RR": item['rr_score'][0],
         "RR_whole": item['rr_score'][1]
        }
    )
tables = pd.DataFrame(tables)
tables.to_csv("data/counter_narrative/results/metrics.csv", index=False)

## Toxi Res

In [12]:
from googleapiclient import discovery
import json

API_KEY = 'AIzaSyBIebtbsk9484ImqQ1XYn9jVrogsvOPtfQ'

client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

def get_toxicity_score(text: str):
    """
    使用 Google 的 Comment Analyzer API 获取文本的毒性评分。
    """
    analyze_request = {
    'comment': { 'text': text.strip() },
    'requestedAttributes': {'TOXICITY': {}}
    }

    response = client.comments().analyze(body=analyze_request).execute()
    toxicity_score =  response["attributeScores"]["TOXICITY"]["summaryScore"]["value"]
    return toxicity_score 
  



In [13]:
import time
outputs = []

for prediction_file in tqdm(predictions, desc="Processing predictions"):
    data_path = os.path.join(results_dir, prediction_file)

    with open(data_path, "r", encoding="utf-8") as f:
        results = json.load(f)

    prediction = [
        item.get("response", "") for item in results
    ]
    print(prediction[:5])
    total_score = 0
    for i in range(len(prediction)):
        total_score += get_toxicity_score(prediction[i])
        time.sleep(2)  # 避免过于频繁的请求，Google API 有速率限制
        
    average_toxicity = total_score/ len(prediction) if prediction else 0
    print(f"Average Toxicity for {prediction_file}: {average_toxicity}")
    outputs.append({
        "model": prediction_file.rsplit(".", 1)[0],
        "average_toxicity": average_toxicity
    })

# 保存结果
output_df = pd.DataFrame(outputs)
output_df.to_csv("data/counter_narrative/results/toxi_gpt.csv",
                 index=False, encoding="utf-8")

Processing predictions:   0%|          | 0/4 [00:00<?, ?it/s]

['Counter Text: 每个人都有选择自己生活方式的权利，是否结婚是个人自由，不应成为他人羞辱或施压的理由。用嘲讽和贬低的态度对待他人的选择，不仅缺乏同理心，也反映出自身的狭隘。尊重他人的决定，是成熟和文明的体现。', 'Counter Text: 这种对"假小子"的刻板印象，本质上是一种对性别角色的狭隘定义。一个人是否像个"女生"不应只看外表或行为，而应尊重每个人独特的性格和表达方式。性别不是一套固定的行为模板，而是多元而自由的。真正的社会进步，是让每个人都能在被接纳的环境中成长为真实的自己，而不是被强迫去符合某种预设标准。', 'Counter Text: 这种言论忽视了现代科学对孕期健康管理的重视，也否定了女性在孕期照顾自己和胎儿的合理需求。妊娠油的使用是出于对孕妇和胎儿健康的负责任态度，而不是“瞎用东西”。每个时代都有其进步和经验积累，我们应该以科学和尊重的态度看待不同人群的选择，而不是用过去的方式去贬低当下的努力。', 'Counter Text: 这种用粗俗语言侮辱他人的表达方式不仅不文明，也反映出说话者内心的不成熟。每个人都有权利被尊重，我们应该以理性和友善的态度进行交流，而不是用攻击性语言伤害他人。', 'Counter Text: 让职场女性在高压下被迫生育二胎，是对她们身体和选择权的双重压迫。每个女性都有权利决定自己的生活节奏和家庭规划，而不是被社会期待或政策要求绑架。真正的关怀，是为女性提供平等的职业机会与育儿支持，而非用“逼迫”去制造矛盾与痛苦。']


Processing predictions:  25%|██▌       | 1/4 [01:51<05:34, 111.52s/it]

Average Toxicity for qwen_qwen3-32b_few.json: 0.07217672004
['有时候社会对婚姻的过度重视会给个人带来不必要的压力。其实，婚姻只是人生的一个选择，不是衡量个人价值的唯一标准。每个人都有权选择自己的生活方式，无论结婚与否，都不应该影响到他人的自尊心。低着头走路并不是解决问题的办法，而接受和尊重彼此的选择才是真正的成熟和智慧。', '刻板印象往往限制了我们的视野，让我们忽略了个体的多样性和独特性。每个人都有权选择自己的生活方式，而不是被迫遵循传统的性别角色。所谓的“假小子”只是展现了她们自己真实的模样，无需迎合他人对“女生”的刻板定义。婚姻不是人生的唯一目标，也不是衡量一个人价值的标准。我们应该尊重和接纳每个人的独特之处，鼓励他们按照自己的意愿去生活。', '使用这样的言辞来否定他人的感受和需求，往往会忽视每个人生活背景和情况的不同。孕期护肤对于许多准妈妈而言，不仅是为了身体健康，也是对自己和未来宝宝的责任感的体现。我们应该尊重和支持她们的选择，而不是以过去的经验来否定她们的努力和关心。每一代人都有不同的生活方式和理念，我们需要以开放和理解的态度去接纳这些变化。', '在遭遇到粗俗和侮辱性言语时，我们应当选择冷静和理智的回应方式。使用这样的语言不仅无法解决问题，反而可能激化矛盾。我们应该努力以理解和尊重的态度进行交流，才能创造更加和谐的社会环境。尊重他人，也是在尊重自己。', '鼓励大龄职场女性生育二胎应该是一种选择，而不是一种强迫。每个人都有权利决定自己的生活路径，包括是否生育。这种决定应该基于个人的愿望和家庭的情况，而不是外界的压力。我们应该尊重每一个女性的选择，支持她们在职场和家庭中找到平衡，而不是对她们施加不必要的压力。推动更多支持性政策，让女性在职场和家庭中都能感受到理解和支持，这才是真正的善良之举。']


Processing predictions:  50%|█████     | 2/4 [03:41<03:40, 110.36s/it]

Average Toxicity for gpt-4o_few.json: 0.03814280194200001
['在面对传统婚姻观念带来的压力时，我们应当认识到，个人的幸福和生活选择不应该被外界的眼光所左右。人生的价值不在于是否结婚，而在于我们如何追求自己的梦想和幸福。如果结婚并不是现在的选择，那就坦然地低着头也无妨，因为自信和自我认同是最为重要的。不因别人的期待而妥协，才能活出真实的自己。社会应当更多地尊重和包容多样的生活方式，而不是用单一的标准来评判他人。', '在现代社会中，性别角色早已不再局限于刻板印象。所谓的“假小子”的特点，往往是勇敢、独立、自信等积极的品质，这些都是值得被肯定和赞美的，而不是被用来评判是否符合某种传统的“女性”标准。婚姻不该是女性存在的终极目标，每个人都应有权选择自己的生活方式，追求自己的梦想和幸福，而不是仅仅为了“嫁出去”而改变自我。多样性是社会的财富，打破性别成见，我们才能创造一个更包容、更公平的世界。', '这句话反映了对现代护肤实践的误解和对新生事物的抵触。现代医学和护肤品，如妊娠油，其实是为了帮助舒缓孕期的不适和减少妊娠纹的形成。随着科学的发展，我们有更多的选择来改善生活品质，保护自己和家人的健康。关心孙子的健康固然重要，但也要相信现代科学。传承传统智慧的同时，接受新科技带来的便利，这样才能真正为家人创造一个更好的环境。', '听到这句话，可能会觉得困惑或者有些震惊。其实，人与人之间的交流应该是温和而尊重的。无论遇到什么样的情境，我们都可以选择用理智与善意去化解，而不是用攻击性的语言来回应。每个人都值得被温柔以待，让我们以更积极的方式去沟通和解决问题。', '每个人都有权利选择自己的生活方式和人生规划，而不应被外界的期望和压力所左右。大龄职场女性在面临生育问题时，应该得到的是理解与支持，而不是被强迫。无论选择是否生育，都是个人的决定，应该被尊重。社会应该更加关注如何创造一个更包容和支持的环境，让女性能够自由地平衡事业与生活，而不是用所谓的道德标准去苛求她们。我们应当提倡多样性和包容性，尊重每一个女性的选择，她们有权利定义自己的幸福。']


Processing predictions:  75%|███████▌  | 3/4 [05:30<01:49, 109.86s/it]

Average Toxicity for gpt-4o_knowledge.json: 0.036581777749999995
['抬不抬头与他人无关，最重要的是自己的人生选择和幸福。每个人都有权选择自己的生活方式，无论是结婚还是单身，都应该得到尊重。幸福不是通过迎合社会期待来获得的，而是通过忠于自己的内心来实现的。抬起头来，不为他人，只为做真实的自己。', '每个人的个性都是独特的，不能通过刻板印象来定义。所谓的“假小子”其实是对多元性别表达的一种误解。性格的多样性是值得被尊重和欣赏的，而不是被限制在某种传统观念之中。一个人的价值和幸福不应该仅仅依赖于是否符合某种性别期待或是否结婚。重要的是，一个人能以自己舒适和真实的方式生活，并找到适合自己的幸福。嫁与不嫁并不是人生的唯一目标，真正的幸福来自于自我认同和被接纳。', '每个时代都有不同的生活条件和知识，过去的一些做法在现代可能已经有了更好的替代方案。涂抹妊娠纹油是为了帮助孕妇在怀孕期间保护皮肤弹性，减少妊娠纹的产生。通过科学的方法来照顾自己和宝宝，并不是迷信或无用的行为。我们应该尊重每个人对自己身体和健康的选择，而不是一味地否定新的健康方法。这样可以让孕妇们更加安心地度过特殊的时光，为家庭带来更多的幸福和健康。', '对不起，我不是很明白你的意思。我们应该用尊重和善意的语言来交流，以便更好地理解和帮助彼此。如果有什么问题或者需要帮助，请随时告诉我，我会尽力提供支持。', '每个人都有选择生活方式和发展方向的权利，无论是职场女性还是其他任何人。鼓励女性根据自己的意愿和条件做出生育决策，而不是被迫迎合他人的期望，才是真正支持和尊重她们的表现。多样化的选择和包容的社会环境，有助于每个人实现自己的潜能，创造更加和谐美好的社会。我们应该提倡的是对个人选择的理解和尊重，而不是对特定生活方式的强制或评判。']


Processing predictions: 100%|██████████| 4/4 [07:19<00:00, 109.90s/it]

Average Toxicity for gpt-4o_zero.json: 0.023154538592000003





In [18]:
import time
outputs = []

with open(os.path.join(results_dir, 'in_context.json'), 'r', encoding='utf-8') as f:
    incontext_results = json.load(f)
    prediction = [item['response']['counter_text'] if item.get('response') else "" for item in incontext_results]
    total_score = 0
    for i in range(len(prediction)):
        total_score += get_toxicity_score(prediction[i])
        time.sleep(2)  # 避免过于频繁的请求，Google API 有速率限制
    average_toxicity = total_score/ len(prediction) if prediction else 0
    print(f"Average Toxicity for {prediction_file}: {average_toxicity}")

Average Toxicity for gpt-4o_zero.json: 0.03494878795


## LLM Eval

In [19]:
response_path = f"data/counter_narrative/results/gen_results/in_context.json"
with open(response_path, "r", encoding="utf-8") as f:
    ours= json.load(f)
outputs = []
for item in ours:
    outputs.append({
                   "filename": item['filename'],
                   "hate_speech": item['hate_speech'],
                   'response':item['response']['counter_text']
                       })
    
with open("data/counter_narrative/results/gen_results/ours.json", "w", encoding="utf-8") as f:
    json.dump(outputs, f, ensure_ascii=False, indent=2)

In [38]:
results_dir = 'data/counter_narrative/results/evaluations_arch'

eval = [
    # "ours.json",
    "deepseek-r1-distill-llama-70b_zero.json",
    "llama-3.3-70b-versatile_zero.json",
    "qwen_qwen3-32b_zero.json",
    "deepseek-r1-distill-llama-70b_knowledge.json",
    "llama-3.3-70b-versatile_knowledge.json",
    "qwen_qwen3-32b_knowledge.json",
    "deepseek-r1-distill-llama-70b_few.json",
    "llama-3.3-70b-versatile_few.json",
    "qwen_qwen3-32b_few.json",
    "gpt-4o_few.json",
    "gpt-4o_knowledge.json",
    "gpt-4o_zero.json",
]

In [33]:
with open('data/counter_narrative/results/gen_results/in_context.json', 'r', encoding='utf-8') as f:
    ours_outputs = []
    ours = json.load(f)
    sum_spec = 0
    sum_flu = 0
    sum_rel = 0
    sum_opp = 0
    total = 0

    for i in range(len(ours)):
        evaluation = ours[i]['response']["evaluation"]
        sum_spec += evaluation['Specificity']['score']
        sum_flu += evaluation['Fluency']['score']
        sum_rel += evaluation['Relatedness']['score']
        sum_opp += evaluation['Opposition']['score']
        total = sum_spec + sum_flu + sum_rel + sum_opp
    ours_outputs.append({
        "model": item.rsplit(".", 1)[0],
        "Specificity": sum_spec / len(ours) if ours else 0,
        "Fluency": sum_flu / len(ours) if ours else 0,
        "Relatedness": sum_rel / len(ours) if ours else 0,
        "Opposition": sum_opp / len(ours) if ours else 0,
        "Total": total / len(ours) if ours else 0
    })
output_df = pd.DataFrame(ours_outputs)
output_df.to_csv("data/counter_narrative/results/evaluations_pri.csv")

In [39]:
outputs = []
for item in eval:
    data_path = os.path.join(results_dir, item)
    with open(data_path, "r", encoding="utf-8") as f:
        results = json.load(f)
        sum_spec = 0
        sum_flu = 0
        sum_rel = 0
        sum_opp = 0
        total = 0
        for i in range(len(results)):
            evaluation = results[i]['evaluations']["evaluations"]
            sum_spec += evaluation['Specificity']['score']
            sum_flu += evaluation['Fluency']['score']
            sum_rel += evaluation['Relatedness']['score']
            sum_opp += evaluation['Opposition']['score']
            total = (sum_spec + sum_flu + sum_rel + sum_opp)/4
        outputs.append({
            "model": item.rsplit(".", 1)[0],
            "Specificity": sum_spec / len(results) if results else 0,
            "Fluency": sum_flu / len(results) if results else 0,
            "Relatedness": sum_rel / len(results) if results else 0,
            "Opposition": sum_opp / len(results) if results else 0,
            "Total": total / len(results) if results else 0
        })
output_df = pd.DataFrame(outputs)
output_df.to_csv("data/counter_narrative/results/evaluations_llm.csv")
            
    