In [None]:
!pip install pandas openpyxl requests openai langchain indic-nlp-library

This metric uses Google's Perspective API to calculate toxicity scores for each hate-speech text and its corresponding counter-narrative across multiple datasets. The scores are added as new columns (`HS_Toxicity`, `CN_Toxicity`) and averaged per dataset, enabling quantitative comparison of the toxicity reduction achieved by different approaches.

In [None]:
import pandas as pd
import requests
import time
import os

# Read the Perspective API key from the PERSPECTIVE_API_KEY environment variable
# when it is set, so the real key does not have to be stored in the notebook.
# The original placeholder is kept as the fallback value.
API_KEY = os.environ.get("PERSPECTIVE_API_KEY", "<PERSPECTIVE_API>")
API_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"

def get_toxicity_score(text, timeout=30):
    """
    Requests the TOXICITY summary score for a piece of text from the Perspective API.

    Parameters:
    - text (str): Comment to score. The request declares the language as 'en'.
    - timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
    - The value found at attributeScores.TOXICITY.summaryScore.value, or None when
      the text is blank, the API answers with a non-200 status, or the request or
      response parsing raises.
    """
    # Nothing to score: skip the HTTP round trip for missing or blank text.
    if text is None or not str(text).strip():
        return None

    headers = {'Content-Type': 'application/json'}
    data = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}}
    }
    try:
        # Without a timeout a stalled connection would block the whole batch loop.
        response = requests.post(API_URL, json=data, headers=headers, timeout=timeout)
        if response.status_code == 200:
            result = response.json()
            return result['attributeScores']['TOXICITY']['summaryScore']['value']
        print(f"API error: {response.status_code}, {response.text}")
        return None
    except Exception as e:
        # Best effort: network errors and unexpected payloads are reported, not raised.
        print(f"Exception: {e}")
        return None

def process_excel_file(file_path):
    """
    Scores every hate-speech / counter-narrative pair of an Excel file for toxicity.

    Parameters:
    - file_path (str): Path to an Excel file with the columns 'Hate speech' and
      'fact_counter_narratives'.

    Returns:
    - The loaded DataFrame with two added columns, 'HS_Toxicity' and 'CN_Toxicity',
      or None when a required column is missing. A score is missing for a row when
      the cell is empty or get_toxicity_score returned None.
    """
    df = pd.read_excel(file_path)

    required_columns = ('Hate speech', 'fact_counter_narratives')
    if any(column not in df.columns for column in required_columns):
        print(f"Required columns not found in {file_path}")
        return None

    def _score_cell(value):
        # Empty cells are NaN/None; str() would turn them into the text "nan"/"None"
        # and that text would be sent for scoring.
        if pd.isna(value):
            return None
        return get_toxicity_score(str(value))

    # Add toxicity score columns
    hs_toxicities = []
    cn_toxicities = []

    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        hs_score = _score_cell(row['Hate speech'])
        cn_score = _score_cell(row['fact_counter_narratives'])

        hs_toxicities.append(hs_score)
        cn_toxicities.append(cn_score)

        # A failed request yields None, which cannot be formatted with ".3f".
        cn_display = "n/a" if cn_score is None else f"{cn_score:.3f}"
        print(f"{file_path} | Row {row_number} | CN: {cn_display}")
        # Pause between rows to pace the API requests.
        time.sleep(1)

    df['HS_Toxicity'] = hs_toxicities
    df['CN_Toxicity'] = cn_toxicities

    return df

dataset_files = ["<datasets_from_all_approaches>"]

# Score each dataset and report its average toxicity (nothing is written to disk here)
for file in dataset_files:
    if not os.path.exists(file):
        print(f"File not found: {file}")
        continue

    updated_df = process_excel_file(file)
    # process_excel_file returns None when the required columns are missing;
    # it has already printed the reason, so move on to the next file.
    if updated_df is None:
        continue

    # Rows without a counter-narrative score are excluded from both averages.
    updated_df = updated_df.dropna(subset=["CN_Toxicity"])
    avg_hs_toxicity = updated_df["HS_Toxicity"].mean()
    avg_cn_toxicity = updated_df["CN_Toxicity"].mean()
    print(f"Average HS Toxicity: {avg_hs_toxicity:.3f}")
    print(f"Average CN Toxicity: {avg_cn_toxicity:.3f}")


This metric uses GPT-based evaluation to automatically score counter-narratives on Factuality, Persuasiveness, Informativeness, and Linguistic Quality. The results are appended to the dataset and saved, providing structured metrics for comparative analysis.

In [None]:
import openai
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import json

import os

# Set your OpenAI API key through the OPENAI_API_KEY environment variable so it is
# not stored in the notebook. When the variable is unset the value is the same
# empty placeholder as before.
api_key = os.environ.get("OPENAI_API_KEY", "")
client = OpenAI(api_key=api_key)

# Tamil-language instruction prompt for the LLM judge. It is filled in with
# str.format using the fields {context}, {hs} and {cn}; the doubled braces
# {{ }} are format escapes that render as the literal braces of the JSON answer
# skeleton. The JSON keys named here ("Factuality", "Persuasiveness",
# "Informativeness", "Linguistic Quality") are the keys evaluate_cn returns.
eval_prompt_template = """
நீங்கள் ஒரு மொழி மதிப்பீட்டுக் கருவியாக செயல்படுகிறீர்கள்.
தரவுத்தொகுப்பில் உள்ள ஒவ்வொரு எடுத்துக்காட்டையும் பின்வரும் கூறுகளின் அடிப்படையில் 0 முதல் 1 வரை மதிப்பீடு செய்யவும்:

உண்மைத்தன்மை (Factuality): எதிர்பேச்சு, வழங்கப்பட்ட 'context' தகவல்களுக்கு ஏற்பவே உள்ளதா என்பதை மதிப்பீடு செய்யவும்.
நம்ப வைக்கும் திறன் (Persuasiveness): இந்த எதிர்பேச்சு, வாசகரின் கருத்தை மாற்ற வைக்கும் வகையில் நம்ப வைக்கும் தன்மை உள்ளதா?
தகவலளிப்பு (Informativeness): எதிர்பேச்சு எந்த அளவுக்கு பயனுள்ள, தகவலளிக்கும் தன்மை கொண்டது?
மொழித்திறன் (Linguistic Quality): எதிர்பேச்சு, இலக்கண ரீதியாக தெளிவாகவும் ஒழுங்காகவும் உள்ளதா?

தகவல் (Context):
{context}

வெறுப்புப் பேச்சு (Hate Speech):
{hs}

எதிர்பேச்சு (Counter Narrative):
{cn}

மதிப்பீட்டு முடிவுகளை கீழே உள்ள JSON வடிவத்தில் மட்டும் அளிக்கவும்:
{{
"Factuality": <score>,
"Persuasiveness": <score>,
"Informativeness": <score>,
"Linguistic Quality": <score>
}}
"""

def evaluate_cn(hs, cn, context):
    """
    Asks the chat model to rate one counter-narrative and returns the parsed scores.

    Parameters:
    - hs: Hate-speech text inserted into the prompt.
    - cn: Counter-narrative text inserted into the prompt.
    - context: Background text inserted into the prompt.

    Returns:
    - dict with the keys "Factuality", "Persuasiveness", "Informativeness" and
      "Linguistic Quality". A key is None when the model did not supply it; all
      four are None when the request fails or the reply holds no JSON object.
    """
    empty_scores = {"Factuality": None, "Persuasiveness": None, "Informativeness": None, "Linguistic Quality": None}
    prompt = eval_prompt_template.format(context=context, hs=hs, cn=cn)

    try:
        # The request is inside the try so that one failed call does not abort a
        # long scoring run whose results are only saved at the very end.
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=[{"role": "user", "content": prompt}]
        )
        content = (response.choices[0].message.content or "").strip()

        # Replies may wrap the JSON in a Markdown code fence or add surrounding
        # text; keep only the outermost {...} span before parsing.
        start, end = content.find("{"), content.rfind("}")
        if start != -1 and end > start:
            content = content[start:end + 1]

        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            raise ValueError("response JSON is not an object")

        # Start from the None defaults so every expected key is always present.
        return {**empty_scores, **parsed}
    except Exception as e:
        print(f"Error parsing response: {e}")
        return empty_scores

df = pd.read_excel("<dataset>.xlsx")

# Ask the LLM judge for one dict of metric scores per dataset row
scores = [
    evaluate_cn(
        context=row["Context"],
        hs=row["Hate speech"],
        cn=row["fact_counter_narratives"],
    )
    for _, row in tqdm(df.iterrows(), total=len(df))
]

# Place the score columns next to the original columns and write the workbook
df_scores = pd.DataFrame(scores)
df_final = pd.concat([df, df_scores], axis=1)
df_final.to_excel("metrics.xlsx", index=False)


This metric computes BLEU-1 to BLEU-4 scores to evaluate counter-narratives generated by LLMs against human-authored references.

In [None]:
import pandas as pd
import random
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from indicnlp.tokenize import indic_tokenize

def tokenize_tamil(text):
    """Tokenize Tamil text with the Indic NLP trivial tokenizer after trimming outer whitespace."""
    trimmed = text.strip()
    tokens = indic_tokenize.trivial_tokenize(trimmed, lang='ta')
    return tokens

def compute_bleu(path, ref_column='Counter Narrative from dataset', cand_column='Counter Narrative from LLM', tokenizer=None):
    """
    Computes corpus-level BLEU-1 to BLEU-4 for generated counter narratives.

    Parameters:
    - path (str): Path to the Excel file.
    - ref_column (str): Column name for reference counter narratives.
    - cand_column (str): Column name for generated counter narratives.
    - tokenizer (callable): Maps a string to a list of tokens. Defaults to
      tokenize_tamil from this notebook; pass str.split for plain whitespace
      tokenization.

    Returns:
    - dict mapping "BLEU-1" ... "BLEU-4" to the score rounded to 4 decimals.
    """
    tokenize = tokenizer if tokenizer is not None else tokenize_tamil

    df = pd.read_excel(path)

    # Drop incomplete rows as pairs. Dropping NaNs from each column separately
    # would shift references against candidates or leave lists of unequal length.
    pairs = df[[ref_column, cand_column]].dropna().astype(str)

    # corpus_bleu takes a list of reference lists per candidate; each candidate
    # has exactly one reference here.
    references = [[tokenize(ref)] for ref in pairs[ref_column]]
    candidates = [tokenize(cand) for cand in pairs[cand_column]]

    smoothing = SmoothingFunction().method1
    results = {}

    # Calculate BLEU-1 to BLEU-4 with uniform weights over the n-gram orders 1..n
    for n in range(1, 5):
        weights = tuple(1.0 / n for _ in range(n))
        score = corpus_bleu(references, candidates, weights=weights, smoothing_function=smoothing)
        results[f"BLEU-{n}"] = round(score, 4)

    return results

In [None]:
# Compute BLEU-1..4 using the default reference and candidate column names
bleu = compute_bleu(path='<dataset>.xlsx')
print(bleu)

This metric computes cosine similarity scores between human-authored and LLM-generated counter-narratives using Sentence-BERT embeddings. It reports mean and maximum similarity, providing a semantic measure of closeness between references and generated texts.

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_similarity_scores(
    file_path: str,
    reference_column: str = 'Counter Narrative from dataset',
    candidate_column: str = 'Counter Narrative from LLM',
    model_name: str = 'distiluse-base-multilingual-cased-v2'
) -> dict:
    """
    Computes cosine similarity between reference and candidate counter narratives
    using Sentence-BERT embeddings.

    Parameters:
    - file_path (str): Path to the Excel file.
    - reference_column (str): Column name for reference counter narratives.
    - candidate_column (str): Column name for generated counter narratives.
    - model_name (str): sBERT model name to load.

    Returns:
    - dict with aggregated max and mean similarity scores.

    Raises:
    - ValueError: if no row has both a reference and a candidate.
    """
    df = pd.read_excel(file_path)

    # Keep only rows where both texts are present so the two lists stay aligned
    pairs = df[[reference_column, candidate_column]].dropna()
    if pairs.empty:
        raise ValueError(
            f"No rows with both '{reference_column}' and '{candidate_column}' in {file_path}"
        )

    # Extract CNs; non-string cells (e.g. numbers) are coerced to text
    reference_texts = pairs[reference_column].astype(str).tolist()
    candidate_texts = pairs[candidate_column].astype(str).tolist()

    # Load Sentence-BERT model
    model = SentenceTransformer(model_name)

    # Encode to NumPy arrays rather than torch tensors, so the arithmetic below
    # does not depend on which device the model ran on.
    ref_embeddings = np.asarray(model.encode(reference_texts, convert_to_numpy=True), dtype=float)
    cand_embeddings = np.asarray(model.encode(candidate_texts, convert_to_numpy=True), dtype=float)

    # Row-wise cosine similarity (each candidate against its own reference only)
    dot_products = np.sum(ref_embeddings * cand_embeddings, axis=1)
    norms = np.linalg.norm(ref_embeddings, axis=1) * np.linalg.norm(cand_embeddings, axis=1)
    # A zero vector gets similarity 0 instead of a division by zero
    similarities = dot_products / np.where(norms == 0, 1.0, norms)

    # Compute statistics
    mean_similarity = np.mean(similarities)
    max_similarity = np.max(similarities)

    print(f"Mean Cosine Similarity: {mean_similarity:.4f}")
    print(f"Max Cosine Similarity: {max_similarity:.4f}")

    return {
        "mean_similarity": float(mean_similarity),
        "max_similarity": float(max_similarity)
    }


In [None]:
# Compute mean/max cosine similarity with the default column names and model.
# Note: this rebinds the notebook-level name `scores`.
scores = compute_similarity_scores("<dataset>.xlsx")
print(scores)

This cell calculates the pairwise percentage agreement between the three human annotators and the LLM on linguistic quality ratings and visualizes the result as a heatmap. The plot highlights how consistent the evaluators are with one another.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_excel("metrics.xlsx")

# Only keep the relevant columns
# NOTE(review): these column names are not the ones written by the GPT scoring
# cell ("Linguistic Quality"); presumably metrics.xlsx is extended with the
# human ratings before this cell is run. Confirm.
cols = ["LQ_Human1", "LQ_Human2", "LQ_Human3", "Linguistic_Quality_LLM"]
df = df[cols]


def _normalize_rating(value):
    """Return a canonical string for a rating so equal ratings compare equal.

    Numeric ratings are rendered through float, so 1, 1.0 and "1" all become
    "1.0"; anything else is trimmed and lower-cased.
    """
    text = str(value).strip().lower()
    try:
        return str(float(text))
    except ValueError:
        return text


# Series.map per column replaces DataFrame.applymap, which is deprecated in
# recent pandas releases.
df = df.apply(lambda column: column.map(_normalize_rating))

# Compute percentage agreement matrix: share of rows where two raters match
agreement = pd.DataFrame(index=cols, columns=cols, dtype=float)
for c1 in cols:
    for c2 in cols:
        agreement.loc[c1, c2] = (df[c1] == df[c2]).mean() * 100

# Plot heatmap; the colour scale is fixed to the 75-100 % range
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(agreement, annot=True, fmt=".1f", cmap="YlGnBu", vmin=75, vmax=100, ax=ax)
ax.set_title("Linguistic Quality Agreement (%)", fontsize=14)
plt.show()

This cell plots the linguistic quality scores from the three human annotators and the LLM for each prompt, allowing a visual comparison of rating trends and alignment. The line graph highlights variations and consistency in the evaluations.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_excel("metrics-fact_counter_narratives-1-inaia-kaaval.xlsx")

cols = ["LQ_Human1", "LQ_Human2", "LQ_Human3", "Linguistic_Quality_LLM"]
# Copy the selection so that adding the Prompt column below does not assign into
# a slice of the loaded frame (avoids SettingWithCopyWarning).
df = df[cols].copy()
# Number the rows from 1 to use as the x axis
df['Prompt'] = range(1, len(df) + 1)

# (column, line colour, legend label) for each rater
rater_styles = [
    ('LQ_Human1', 'green', 'Human1'),
    ('LQ_Human2', 'blue', 'Human2'),
    ('LQ_Human3', 'purple', 'Human3'),
    ('Linguistic_Quality_LLM', 'red', 'LLM'),
]

fig, ax = plt.subplots(figsize=(10, 5))
for column, color, label in rater_styles:
    ax.plot(df['Prompt'], df[column], marker='o', linestyle='-', color=color, label=label)

# Formatting
ax.set_xticks(df['Prompt'])
ax.set_ylim(-0.1, 1.1)
ax.set_xlabel('Prompt Number')
ax.set_ylabel('Score')
ax.set_title('Linguistic Quality Scores')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.5)

plt.show()
