In [16]:
import pandas as pd
from tqdm import tqdm
from utils import extract_score, extract_content_from_response, postprocess_llm_response
from generate import predict

tqdm.pandas()

In [17]:
# prompt templates

common_input_prompt = lambda hs, cs: f"""
Hate Speech: "{hs}"\nCounterspeech: "{cs}"
""".strip()

instruction_prompt = f"""
You will be given one counterspeech (also called as counter-argument or counter-narrative) written for a hateful statement (hate speech). Your
task is to rate the counterspeech on one metric. Please make sure you read and understand these instructions carefully.
""".strip()

evaluation_criteria_relevance = f"""
Contextual Relevance (1-5) - This dimension evaluates whether the counterspeech addresses the central topic, theme subject of the given hate
speech.
""".strip()

evaluation_criteria_aggressiveness = f"""
Aggressiveness (1-5) - This metric assesses the degree of confrontational or inflammatory content in the counterspeech. It considers factors such
as abusive language, intensity of disagreement, tone, engagement style, and the presence of personal attacks, with higher scores indicating
greater aggression.
""".strip()

evaluation_criteria_coherence = f"""
Argument Coherence (1-5) - This metric assesses how logically and smoothly the ideas or arguments within the counterspeech connect and
flow. A coherent counterspeech will present its arguments in an organized manner, making it easy for the reader to follow and understand the
counter-narrative being presented.
""".strip()

evaluation_criteria_suitableness = f"""
Suitableness (1-3) - This metric measures the likelihood of an annotator choosing a given counterspeech for direct use (without editing) in a
real scenario against online hate speech. This assessment considers the counterspeech's suitability, appropriateness, and potential impact on a
reader in a real-world context.
""".strip()

# ----------------------------------------------------------------------- #

relevance_score_prompt_zs = f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_relevance}

Ensure that the response is STRICTLY in JSON format as {{"Relevance": ""}}
""".strip()

coherence_score_prompt_zs = f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_coherence}

Ensure that the response is STRICTLY in JSON format as {{"Argument Coherence": ""}}.
""".strip()

suitableness_score_prompt_zs = f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_suitableness}

Ensure that the response is STRICTLY in JSON format as {{"Suitableness": ""}}.
""".strip()

aggressiveness_score_prompt_zs = f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_aggressiveness}

Ensure that the response is STRICTLY in JSON format as {{"Aggressiveness": ""}}.
""".strip()

# ----------------------------------------------------------------------- #

relevance_score_prompt_cot = lambda evaluation_steps : f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_relevance}

Evaluation Steps:
{evaluation_steps}

Ensure that the response is STRICTLY in JSON format as {{"Relevance": ""}}
""".strip()

coherence_score_prompt_cot = lambda evaluation_steps : f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_coherence}

Evaluation Steps:
{evaluation_steps}

Ensure that the response is STRICTLY in JSON format as {{"Argument Coherence": ""}}.
""".strip()

suitableness_score_prompt_cot = lambda evaluation_steps : f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_suitableness}

Evaluation Steps:
{evaluation_steps}

Ensure that the response is STRICTLY in JSON format as {{"Suitableness": ""}}.
""".strip()

aggressiveness_score_prompt_cot = lambda evaluation_steps : f"""
{instruction_prompt}

Evaluation Criteria:
{evaluation_criteria_aggressiveness}

Evaluation Steps:
{evaluation_steps}

Ensure that the response is STRICTLY in JSON format as {{"Aggressiveness": ""}}.
""".strip()

# ----------------------------------------------------------------------- #

cot_gen_few_shot_tempate = lambda aspect, hs1, cs1, r1, hs2, cs2, r2, hs3, cs3, r3: f"""
Hatespeech: {hs1}
Counterspeech: {cs1}
Expert Rating ({aspect}): {r1}

Hatespeech: {hs2}
Counterspeech: {cs2}
Expert Rating ({aspect}): {r2}

Hatespeech: {hs3}
Counterspeech: {cs3}
Expert Rating ({aspect}): {r3}
""".strip()

candidate_cot_drafting_template = lambda aspect, evaluation_criteria, few_shot_examples : f"""
You will be given sets of in-context examples, each containing a hate speech, a corresponding
counterspeech, and an {aspect} rated by a human expert. Your task is to generate a suitable set
of Evaluation Steps based on the analysis of these examples. Please make sure you read and
understand these instructions carefully.

## Examples:
{few_shot_examples}

## Instruction:
Analyze the examples below to identify patterns and factors that influence the {aspect} rating.
Then, create a detailed set of steps outlining how to evaluate the {aspect} of counterspeeches.

## Evaluation Criteria for {aspect}:
{evaluation_criteria}

""".strip()

# aspect = "Aggressiveness"
# few_shot_examples = cot_gen_few_shot_tempate(aspect,"hs1", "cs1", "r1", "hs2", "cs2", "r2", "hs3", "cs3", "r3")
# prompt = candidate_cot_drafting_template(aspect, evaluation_criteria_aggressiveness, few_shot_examples)
# print(prompt)

# ----------------------------------------------------------------------- #

cot_error_few_shot_tempate = lambda aspect, hs1, cs1, r1, p1, hs2, cs2, r2, p2, hs3, cs3, r3, p3: f"""
Hatespeech: {hs1}
Counterspeech: {cs1}
Actual Rating ({aspect}): {r1}
Predicted Rating ({aspect}): {p1}

Hatespeech: {hs2}
Counterspeech: {cs2}
Actual Rating ({aspect}): {r2}
Predicted Rating ({aspect}): {p2}

Hatespeech: {hs3}
Counterspeech: {cs3}
Actual Rating ({aspect}): {r3}
Predicted Rating ({aspect}): {p3}

""".strip()

candidate_cot_refinement_template = lambda aspect, candidate_cot, few_shot_examples: f"""
Please refine and improve the chain-of-thought (CoT) evaluation steps used by a large language model in evaluating {aspect} of counterspeech
generation.

Large language models (LLMs) are powerful neural models that can evaluate the quality of counterspeech generation. However, LLMs may not
always agree with human judgments. Please refine the CoT used by LLMs to improve its correlation with human expert scores. To refine the
scoring criteria used by the LLM in evaluating the {aspect}, please follow the following instructions step-by-step:
1. Carefully read each example, understand each hate speech and its corresponding counterspeech, and get your initial assessment of its
quality on {aspect}.
2. Compare the test score obtained by the LLM according to the CoT and the ground-truth score from human experts. Please think why the
correlation is limited by using the current CoT, and how can you improve the CoT to increase the correlation between LLM’s score and
human expert score. If there is a small gap or no gap, this means the CoT work well in this case.
3. Read all of the test cases and rethink how you could refine the current CoT based on your observations and analysis. Then, refine the
CoT to make it concise, accurate, and consistent with human judgments. When refining the CoT, you can do the following: 1)
modification: adjust some parts of the CoT to increase its correlation with the scoring CoT that you think might used by human experts;
2) paraphrase: if the CoT is good enough, you can consider paraphrasing it to make more concise and easy to understand; 3) adding
aspects or details: if you find some new underlying scoring rules not covered by the current CoT, consider adding them as a new line of
injecting to current CoT, but make sure not to make the CoT too long and redundant; 4) calibrate: you can take other methods you think
being helpful to improve the correlation with human experts.
Please return only your refined criteria without any additional sentences.

Old criteria:
{candidate_cot}

Error Examples:
{few_shot_examples}
""".strip()

# aspect = "Aggressiveness"
# candidate_cot = "Rate 1 as most aggressive and 5 as least aggressive"
# few_shot_examples = cot_error_few_shot_tempate(aspect, "hs1", "cs1", "r1", "p1", "hs2", "cs2", "r2", "p2", "hs3", "cs3", "r3", "p3")
# prompt = candidate_cot_refinement_template(aspect, candidate_cot, few_shot_examples)
# print(prompt)


In [18]:
df_dev = pd.read_csv('/home/amey/depository/cs-eval/data/annotations/dev_set_500.csv')

dev_set_gold_relevance_score = []
dev_set_gold_coherence_score = []
dev_set_gold_aggressiveness_score = []
dev_set_gold_suitableness_score = []

for i, row in df_dev.iterrows():
    dev_set_gold_relevance_score.append(
        (row['hatespeech'], row['predicted_counterspeech'], row['relevance_score'])
    )
    dev_set_gold_coherence_score.append(
        (row['hatespeech'], row['predicted_counterspeech'], row['coherence_score'])
    )
    dev_set_gold_aggressiveness_score.append(
        (row['hatespeech'], row['predicted_counterspeech'], row['aggressiveness_score'])
    )
    dev_set_gold_suitableness_score.append(
        (row['hatespeech'], row['predicted_counterspeech'], row['suitableness_score'])
    )


In [19]:
df_dev.head(3)

Unnamed: 0,hatespeech,counterspeech,predicted_counterspeech,uuid,csType,suitableness_score,relevance_score,coherence_score,aggressiveness_score
0,Jews are a burden on our society.,"Every individual, regardless of religion or et...",Your statement is not only offensive but also ...,a925f4b5-e3d1-4ef7-8bf8-8900849d266e,Denouncing,3.0,4.0,5.0,1.0
1,Immigrants are a threat to our culture.,But isn't it important to consider that the va...,How can you say that about a group of people w...,983a02f9-5517-4691-bd7b-a46e189ecc4d,Questioning,3.0,5.0,4.0,2.0
2,Muslim invasion is the biggest problem in Euro...,Discrimination and generalizations like these ...,"Diversity enriches societies, and embracing di...",6bf0b809-539f-4b60-9306-7d7ae65fa668,Denouncing,2.0,5.0,4.0,2.0


In [20]:
# def generate_candidate_cot(criteria, system_description=""):
#     prompt = criteria
#     return f"candidate cot {len(prompt)}"

# def generate_candidate_cot_self_refinement(refinement_prompt, system_description=""):
#     prompt = refinement_prompt
#     return f"refined candidate cot {len(prompt)}"



In [21]:
import random
from typing import List, Tuple

# Toy values for the golden set D*
# dev_set_gold = [
#     ("Article A1", "Summary S1", 5),
#     ("Article A2", "Summary S2", 4),
#     ("Article A3", "Summary S3", 3),
#     ("Article A4", "Summary S4", 2),
#     ("Article A5", "Summary S5", 1)
# ]

dev_set_gold = dev_set_gold_relevance_score
aspect = "Relevance"
evaluation_criteria = evaluation_criteria_relevance

# Number of Monte Carlo trials
num_trials = 2

# Few-shot exemplar size
few_shot_size = 3

# Define topk
topk = 3


# Perform Monte Carlo trials
criteria_set = []
candidate_cot_set = []

for trial in range(num_trials):
    print(f"Trial {trial + 1}:")

    # Randomly sample few-shot examples from the golden set
    sampled_examples = random.sample(dev_set_gold, few_shot_size)

    # Infer criteria based on sampled examples
    hs1, cs1, r1 = sampled_examples[0][0], sampled_examples[0][1], sampled_examples[0][2]
    hs2, cs2, r2 = sampled_examples[1][0], sampled_examples[1][1], sampled_examples[1][2]
    hs3, cs3, r3 = sampled_examples[2][0], sampled_examples[2][1], sampled_examples[2][2]

    few_shot_examples = cot_gen_few_shot_tempate(aspect, hs1, cs1, r1, hs2, cs2, r2, hs3, cs3, r3)
    criteria = candidate_cot_drafting_template(aspect, evaluation_criteria, few_shot_examples)
    
    # Store the inferred criteria
    criteria_set.append(criteria)

    candidate_cot = predict(prompt=criteria, system_description="", use_temperature_samping=True)
    print(candidate_cot)
    candidate_cot = postprocess_llm_response(candidate_cot)
    candidate_cot_set.append(candidate_cot)
    
    # print(f"Sampled Examples: {sampled_examples}")
    # print(f"Inferred Criteria: {criteria}")
    # print(f"Candidate COT: {candidate_cot}")


Trial 1:
 Evaluation Steps for Relevance of Counterspeeches:

1. Identify the Central Topic or Theme: Begin by understanding the main focus of the hate speech. This could include specific religious, ethnic, or racial groups, or broader themes such as national security, cultural differences, or terrorism.

2. Analyze the Counterspeech Content: Examine the content of the counterspeech to determine if it directly addresses the central topic or theme of the hate speech. The counterspeech should provide a clear and direct response to the hate speech, challenging its assumptions or providing a different perspective.

3. Evaluate the Counterspeech's Relevance to the Central Topic: Assess the degree to which the counterspeech's content is related to the central topic or theme of the hate speech. The more closely the counterspeech addresses the central topic, the higher the relevance score should be.

4. Consider the Counterspeech's Approach: Evaluate the approach taken by the counterspeech in 

In [22]:
from metric import spearman_correlation, kendall_tau
import json


def postprocess_llm_response(response):
    if isinstance(response, dict):
        return list(response.keys())[0]
    else:
        try:
            x = json.loads(response)
            return list(x.keys())[0]
        except:
            return str(response)


def evaluate_candidate_cot(aspect, candidate_cot, test_size=10) -> float:
    # Returns average correlation scores for each candidate cot.
    
    print(f"Total candidates to evaluate: {len(candidate_cot_set)}")
    scoring_prompt = None
    actual_scores = []
    predicted_scores = []
    exemplars = []

    if aspect == "Relevance":
        scoring_prompt = relevance_score_prompt_cot(evaluation_steps=candidate_cot)
    elif aspect == "Aggressiveness":
        scoring_prompt = aggressiveness_score_prompt_cot(evaluation_steps=candidate_cot)
    elif aspect == "Coherence":
        scoring_prompt = coherence_score_prompt_cot(evaluation_steps=candidate_cot)
    elif aspect == "Suitableness":
        scoring_prompt = suitableness_score_prompt_cot(evaluation_steps=candidate_cot)

    print(f"Evaluating on {test_size} datapoints")
    for i, row in tqdm(df_dev[:test_size].iterrows(), desc="Getting Predictions"):
        input_prompt = common_input_prompt(row['hatespeech'], row['predicted_counterspeech'])
        system_description = scoring_prompt

        response = predict(
            prompt=input_prompt, 
            system_description=system_description
        )
        predicted_score = extract_score(response)
        predicted_scores.append(predicted_score)

        if aspect == "Relevance":
            actual_score = row["relevance_score"]
        elif aspect == "Aggressiveness":
            actual_score = row["aggressiveness_score"]
        elif aspect == "Coherence":
            actual_score = row["coherence_score"]
        elif aspect == "Suitableness":
            actual_score = row["suitableness_score"]
        
        actual_scores.append(actual_score)

        exemplars.append(
            (row['hatespeech'], row['predicted_counterspeech'], actual_score, predicted_score)
        )

    spearman_score = spearman_correlation(actual_scores, predicted_scores)
    kendalltau_score = kendall_tau(actual_scores, predicted_scores)
    mean_score = (spearman_score + kendalltau_score)/2
    return (mean_score, exemplars)

evaluated_candidates = [(candidate_cot, evaluate_candidate_cot(aspect, candidate_cot)[0], evaluate_candidate_cot(aspect, candidate_cot)[1]) for candidate_cot in candidate_cot_set]
evaluated_candidates.sort(key=lambda x: x[1], reverse=True)
top_candidates = evaluated_candidates[:topk]

print("Top-performing Criteria:")
for candidate_cot, score, exemplars in top_candidates:
    print(f"Candidate COT: {candidate_cot}, Score: {score}, Exempars: {exemplars}")

Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:12,  1.30s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:20,  2.08s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:14,  1.43s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:19,  1.93s/it]

Top-performing Criteria:
Candidate COT:  Evaluation Steps for Relevance of Counterspeeches:

1. Identify the Central Topic or Theme: Begin by understanding the main focus of the hate speech. This could include specific religious, ethnic, or racial groups, or broader themes such as national security, cultural differences, or terrorism.

2. Analyze the Counterspeech Content: Examine the content of the counterspeech to determine if it directly addresses the central topic or theme of the hate speech. The counterspeech should provide a clear and direct response to the hate speech, challenging its assumptions or providing a different perspective.

3. Evaluate the Counterspeech's Relevance to the Central Topic: Assess the degree to which the counterspeech's content is related to the central topic or theme of the hate speech. The more closely the counterspeech addresses the central topic, the higher the relevance score should be.

4. Consider the Counterspeech's Approach: Evaluate the approach




In [23]:
def iterative_refinement_of_cot_candidate(candidate_cot, aspect) -> list[str]:
    # Performs interative refinement of top candidate via self-assessement prompting, and returns topk candidates with highest correlation
    exemplars = candidate_cot[2]
    refinement_prompts = []
    candidate_cot_set = []

    # Number of Monte Carlo trials
    num_trials = 3

    # Few-shot exemplar size
    few_shot_size = 3


    for trial in range(num_trials):
        print(f"Trial {trial + 1}:")

        # Randomly sample few-shot examples from the golden set
        sampled_examples = random.sample(exemplars, few_shot_size)

        # Infer criteria based on sampled examples
        hs1, cs1, r1, p1 = sampled_examples[0][0], sampled_examples[0][1], sampled_examples[0][2], sampled_examples[0][3]
        hs2, cs2, r2, p2 = sampled_examples[1][0], sampled_examples[1][1], sampled_examples[1][2], sampled_examples[1][3]
        hs3, cs3, r3, p3 = sampled_examples[2][0], sampled_examples[2][1], sampled_examples[2][2], sampled_examples[2][3]

        few_shot_examples = cot_error_few_shot_tempate(aspect, hs1, cs1, r1, p1, hs2, cs2, r2, p2, hs3, cs3, r3, p3)
        cot_refinement_prompt = candidate_cot_refinement_template(aspect, candidate_cot[0], few_shot_examples)
        # print(cot_refinement_prompt)

        refinement_prompts.append(cot_refinement_prompt)

        candidate_cot_refined = predict(prompt=cot_refinement_prompt, system_description="")
        candidate_cot_refined = postprocess_llm_response(candidate_cot_refined)
        candidate_cot_set.append(candidate_cot_refined)
        
    evaluated_candidates = [(candidate_cot, evaluate_candidate_cot(aspect, candidate_cot)[0], evaluate_candidate_cot(aspect, candidate_cot)[1]) for candidate_cot in candidate_cot_set]
    evaluated_candidates.sort(key=lambda x: x[1], reverse=True)
    top_candidate = evaluated_candidates[0]
    return top_candidate


top_candidates_refined = []
for candidate_cot in top_candidates:
    print(candidate_cot)
    refined_cot_candidate = iterative_refinement_of_cot_candidate(candidate_cot, aspect)
    top_candidates_refined.append(refined_cot_candidate)

top_candidates_refined.sort(key=lambda x: x[1], reverse=True)
top_candidates = top_candidates_refined[:topk]
top_candidate_global = top_candidates_refined[0]
top_candidate_global

(" Evaluation Steps for Relevance of Counterspeeches:\n\n1. Identify the Central Topic or Theme: Begin by understanding the main focus of the hate speech. This could include specific religious, ethnic, or racial groups, or broader themes such as national security, cultural differences, or terrorism.\n\n2. Analyze the Counterspeech Content: Examine the content of the counterspeech to determine if it directly addresses the central topic or theme of the hate speech. The counterspeech should provide a clear and direct response to the hate speech, challenging its assumptions or providing a different perspective.\n\n3. Evaluate the Counterspeech's Relevance to the Central Topic: Assess the degree to which the counterspeech's content is related to the central topic or theme of the hate speech. The more closely the counterspeech addresses the central topic, the higher the relevance score should be.\n\n4. Consider the Counterspeech's Approach: Evaluate the approach taken by the counterspeech in

Trial 2:
Trial 3:
Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:13,  1.38s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:14,  1.45s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:12,  1.29s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:13,  1.38s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:12,  1.26s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:12,  1.27s/it]


(" Evaluation Steps for Relevance of Counterspeeches:\n\n1. Identify the Central Topic or Theme: Clearly identify the central topic or theme of the hate speech to understand what the counterspeech should address.\n\n2. Analyze the Hate Speech: Break down the hate speech into key elements, such as the targeted group, the negative sentiment, and any specific claims or arguments.\n\n3. Compare the Counterspeech to the Hate Speech: Examine the counterspeech to determine if it directly addresses the central topic or theme of the hate speech and counters the negative sentiment or claims made in the hate speech.\n\n4. Evaluate the Scope of the Counterspeech: Assess whether the counterspeech focuses on the specific hate speech example or if it addresses broader issues related to the hate speech.\n\n5. Check for Relevance to the Targeted Group: Determine if the counterspeech is relevant to the group targeted by the hate speech, addressing their specific experiences or concerns.\n\n6. Assess the

Getting Predictions: 10it [00:12,  1.25s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:16,  1.66s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:19,  1.95s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:18,  1.83s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:25,  2.59s/it]


Total candidates to evaluate: 2
Evaluating on 10 datapoints


Getting Predictions: 10it [00:20,  2.03s/it]


(" Evaluation Steps for Relevance of Counterspeeches (Refined):\n\n1. Identify the Central Topic or Theme: Comprehend the main focus of the hate speech, which could include specific groups (e.g., racial, ethnic, religious) or broader themes (e.g., cultural differences, national security, terrorism).\n\n2. Address the Central Topic: Evaluate if the counterspeech directly responds to the central topic or theme of the hate speech, offering a clear and direct challenge to its assumptions or providing a different perspective.\n\n3. Maintain Topic Relevance: Assess the degree to which the counterspeech's content is related to the central topic or theme of the hate speech. A high relevance score indicates a strong connection between the counterspeech and the central topic.\n\n4. Consider the Approach: Evaluate the approach taken by the counterspeech in addressing the central topic or theme. Effective counterspeeches often provide facts, refute stereotypes, promote empathy, and encourage under