**HS-CN metrics calculated for all datasets**

---





This code reads a dataset from an Excel file and counts the occurrences of each category in the 'Category' column.
It then calculates and prints the imbalance degree by dividing the maximum count by the minimum, indicating class imbalance.

In [None]:
from collections import Counter
import numpy as np
import pandas as pd

# Read the labelled dataset and tally how many rows carry each 'Category' value.
df = pd.read_excel("dataset.xlsx")
category_counts = Counter(df['Category'])
print(category_counts)

# Imbalance degree: size of the most frequent class divided by the size of
# the least frequent one (1.0 means perfectly balanced).
class_sizes = list(category_counts.values())
imbalance_degree = max(class_sizes) / min(class_sizes)
print(imbalance_degree)

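Calculate HTER (Human-targeted Translation Edit Rate) over all rows ("All") and over only the rows whose status is "Modified", based on the corrected HS-CN dataset. Only the dataset that was generated by the LLM and then manually corrected by a human is used.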

In [None]:

import pandas as pd
from nltk.metrics import edit_distance

def _hter_tokens(text):
    """Split *text* on whitespace, treating a missing cell as empty text."""
    # Empty spreadsheet cells arrive as None or float NaN (NaN is the only
    # float that differs from itself). Passing them through str() would
    # produce the literal token "None"/"nan" and distort the edit count.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return str(text).split()


def compute_hter(hypothesis, reference):
    """Return the word-level edit rate between a hypothesis and its reference.

    Both inputs are converted to strings and split on whitespace; the
    token-level edit distance is then divided by the number of reference
    tokens.

    Args:
        hypothesis: The generated text (any value; None/NaN count as empty).
        reference: The corrected text (any value; None/NaN count as empty).

    Returns:
        float: ``edits / len(reference tokens)``, or ``0.0`` when the
        reference has no tokens (the ratio is undefined in that case).
    """
    hyp_tokens = _hter_tokens(hypothesis)
    ref_tokens = _hter_tokens(reference)
    if not ref_tokens:
        return 0.0
    return edit_distance(hyp_tokens, ref_tokens) / len(ref_tokens)

# Load the LLM-generated counter narratives together with their
# human-corrected versions.
df = pd.read_excel("dataset-llm-generated.xlsx")

# HTER - All
# Score every row exactly once: generated text is the hypothesis, the
# human-corrected text is the reference.
df["HTER"] = df.apply(
    lambda row: compute_hter(row["Counter narrative"], row["Final Counter narrative"]), axis=1
)
hter_all = df["HTER"].mean()

# HTER - modified
# Slice the already-scored frame instead of recomputing the metric; this also
# avoids running a row-wise apply on an empty frame when no row is "Modified".
modified_df = df[df["Status"] == "Modified"].copy()
hter_modified = modified_df["HTER"].mean()


# Display results
print(f"HTER - All: {hter_all:.4f}")
print(f"HTER - Modified: {hter_modified:.4f}")



Calculate the following quantitative metrics on the whole dataset:


*   Repetition rate
*   Novelty
*   Vocabulary size
*   New Vocabulary
*   Reused Vocabulary









In [None]:
import pandas as pd
import os
from collections import Counter
import string

# Utility to tokenize
# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on each tokenize() call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def tokenize(text):
    """Lower-case *text*, strip ASCII punctuation and split on whitespace.

    Only the characters in ``string.punctuation`` are removed; any other
    symbol is left in place.

    Args:
        text: The string to tokenize.

    Returns:
        list: The whitespace-separated tokens (empty for blank input).
    """
    return text.lower().translate(_PUNCTUATION_TABLE).split()

# Load versions
def load_versions(file_paths):
    """Read each Excel file and return its tokenized text columns.

    Args:
        file_paths: Iterable of paths accepted by ``pd.read_excel``.

    Returns:
        list: One dict per file, in input order, with the keys ``"hs"``
        (tokenized 'Hate speech' cells) and ``"cn"`` (tokenized
        'Counter Narrative from dataset' cells). Missing cells are dropped
        per column, so the two lists may differ in length.
    """
    def tokenized_column(frame, column):
        # Drop empty cells, coerce the rest to str, then tokenize each one.
        cells = frame[column].dropna().astype(str).tolist()
        return [tokenize(cell) for cell in cells]

    loaded = []
    for path in file_paths:
        frame = pd.read_excel(path)
        loaded.append({
            "hs": tokenized_column(frame, 'Hate speech'),
            "cn": tokenized_column(frame, 'Counter Narrative from dataset'),
        })
    return loaded

# Repetition Rate
def repetition_rate(samples):
    """Return the percentage of tokens that belong to a repeated word.

    A token counts as repeated when its word occurs more than once within the
    same sample; every occurrence of such a word is counted. Empty samples
    are ignored.

    Args:
        samples: Iterable of token lists.

    Returns:
        float: ``100 * repeated / total`` over all samples, or ``0.0`` when
        there are no tokens at all.
    """
    non_empty = [tokens for tokens in samples if tokens]
    total_tokens = sum(len(tokens) for tokens in non_empty)
    if total_tokens == 0:
        return 0.0

    repeated_tokens = sum(
        occurrences
        for tokens in non_empty
        for occurrences in Counter(tokens).values()
        if occurrences > 1
    )
    return (repeated_tokens / total_tokens) * 100

# Novelty: fraction of the bigrams in `current` that do not occur in `reference`
def novelty(current, reference):
    """Return the share of distinct bigrams in *current* unseen in *reference*.

    Args:
        current: Iterable of token lists to evaluate.
        reference: Iterable of token lists to compare against.

    Returns:
        The number of bigrams found only in ``current`` divided by the number
        of distinct bigrams in ``current``; ``0`` when ``current`` yields no
        bigram.
    """
    def bigram_set(sentences):
        # Adjacent token pairs, collected without duplicates.
        pairs = set()
        for sent in sentences:
            pairs.update(zip(sent, sent[1:]))
        return pairs

    seen_now = bigram_set(current)
    seen_before = bigram_set(reference)
    if not seen_now:
        return 0
    return len(seen_now.difference(seen_before)) / len(seen_now)

def jaccard_similarity(current, reference):
    """Return the Jaccard similarity between the bigram sets of two corpora.

    Args:
        current: Iterable of token lists.
        reference: Iterable of token lists.

    Returns:
        ``|A & B| / |A | B|`` where A and B are the sets of adjacent token
        pairs of ``current`` and ``reference``; ``0`` when neither side
        yields a bigram.
    """
    def bigram_set(sentences):
        # Adjacent token pairs, collected without duplicates.
        pairs = set()
        for sent in sentences:
            pairs.update(zip(sent, sent[1:]))
        return pairs

    left = bigram_set(current)
    right = bigram_set(reference)

    combined = left.union(right)
    if not combined:
        return 0
    shared = left.intersection(right)
    return len(shared) / len(combined)


# Vocab size and overlap
def vocab_stats(current, reference):
    """Summarise the vocabulary of *current* relative to *reference*.

    Args:
        current: Iterable of token lists whose vocabulary is measured.
        reference: Iterable of token lists that define the known vocabulary.

    Returns:
        dict: ``vocab_size`` (distinct tokens in ``current``), ``new_vocab``
        (those absent from ``reference``) and ``reused_vocab`` (those also
        present in ``reference``).
    """
    def distinct_tokens(sentences):
        words = set()
        for sent in sentences:
            words.update(sent)
        return words

    ours = distinct_tokens(current)
    theirs = distinct_tokens(reference)
    return {
        "vocab_size": len(ours),
        "new_vocab": len(ours.difference(theirs)),
        "reused_vocab": len(ours.intersection(theirs)),
    }


# Dataset versions in chronological order (V1 first).
file_paths = [
    "dataset_v1",
    "dataset_v2",
    "dataset_v3"
]

versions = load_versions(file_paths)

# Walk over V2, V3, ... and compare each one against V1, against everything
# that came before it, and against its immediate predecessor.
for i in range(1, len(versions)):
    curr = versions[i]
    prev = versions[i - 1]
    # Pool the samples of all earlier versions. A flat comprehension is used
    # instead of sum(lists, []), which re-copies the growing list at each step.
    cumulative = {
        key: [sample for earlier in versions[:i] for sample in earlier[key]]
        for key in ("hs", "cn")
    }

    print(f"\n--- Comparing Version V{i+1} ---")

    # Repetition Rate
    print(f"Repetition Rate (HS): {repetition_rate(curr['hs']):.4f}")
    print(f"Repetition Rate (CN): {repetition_rate(curr['cn']):.4f}")

    # Novelty, reported as 1 - Jaccard similarity of the bigram sets
    print(f"Novelty HS vs V1: {1 - jaccard_similarity(curr['hs'], versions[0]['hs']):.4f}")
    print(f"Novelty CN vs V1: {1 - jaccard_similarity(curr['cn'], versions[0]['cn']):.4f}")
    print(f"Novelty HS vs cumulative: {1 - jaccard_similarity(curr['hs'], cumulative['hs']):.4f}")
    print(f"Novelty CN vs cumulative: {1 - jaccard_similarity(curr['cn'], cumulative['cn']):.4f}")
    print(f"Novelty HS vs Vi-1: {1 - jaccard_similarity(curr['hs'], prev['hs']):.4f}")
    print(f"Novelty CN vs Vi-1: {1 - jaccard_similarity(curr['cn'], prev['cn']):.4f}")

    # Vocab (counter narratives only, relative to the previous version)
    vocab = vocab_stats(curr['cn'], prev['cn'])
    print(f"CN Vocab Size: {vocab['vocab_size']}")
    print(f"New Vocab (vs previous): {vocab['new_vocab']}")
    print(f"Reused Vocab (vs previous): {vocab['reused_vocab']}")




**Data Analytics:** <br/>
Word Cloud on Hate speech and Counter Narratives

In [None]:
# Packages imported by the analytics cells below (spaCy for Tamil tokenization, seaborn for the histograms).
%pip install pandas matplotlib wordcloud spacy seaborn

The code below processes the hate speech / counter-narrative dataset to extract meaningful words from the hate speech and counter-narrative texts, using basic preprocessing and spaCy's Tamil tokenizer.

It then generates separate word clouds for each category based on word frequency, visualizing the most prominent terms.

In [None]:
import pandas as pd
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from spacy.lang.ta import Tamil

# Word cloud for the seed dataset
# The 'UTF-8-SIG' codec skips a leading byte-order mark if the file has one,
# so it does not end up attached to the first column name.
df = pd.read_csv('dataset.csv', encoding='UTF-8-SIG')


def preprocess(text):
    """Return *text* coerced to ``str`` and lower-cased.

    Any value is accepted; non-string values (including missing ones) are
    converted with ``str()`` first.
    """
    return str(text).lower()

def is_valid_tamil_word(word):
    """Return True when *word* is longer than 4 code points and purely Tamil.

    Every character must fall in the Unicode Tamil block (U+0B80-U+0BFF).
    The length is measured in code points, so vowel signs and the pulli each
    count as one character.

    Args:
        word: The token text to test.

    Returns:
        bool: Whether the token should be kept.
    """
    # A word made only of Tamil-block characters can never be ASCII
    # alphanumeric, so no separate English/digit check is required.
    return len(word) > 4 and all('\u0B80' <= char <= '\u0BFF' for char in word)

# Lower-case both text columns (preprocess also coerces every cell to str)
df['Hate speech'] = df['Hate speech'].map(preprocess)
df['Counter Narrative from dataset'] = df['Counter Narrative from dataset'].map(preprocess)

# Merge each column into one long, space-separated string
hate_speech_text = " ".join(df['Hate speech'].dropna())
counter_speech_text = " ".join(df['Counter Narrative from dataset'].dropna())

# spaCy Tamil language object, used here to split the text into tokens
tamil_nlp = Tamil()

# Hate speech: tokenize, keep tokens accepted by is_valid_tamil_word, count them
hate_tamil_doc = tamil_nlp(hate_speech_text)
hate_tamil_tokens = list(filter(is_valid_tamil_word, (token.text for token in hate_tamil_doc)))
hate_tamil_tokens_counter = Counter(hate_tamil_tokens)

# Counter narratives: same steps
counter_tamil_doc = tamil_nlp(counter_speech_text)
counter_tamil_tokens = list(filter(is_valid_tamil_word, (token.text for token in counter_tamil_doc)))
counter_tamil_tokens_counter = Counter(counter_tamil_tokens)


def generate_wordcloud(text, title):
    """Draw and show a word cloud built from word frequencies.

    Args:
        text: Despite the name, a mapping from word to frequency; it is
            passed straight to ``WordCloud.generate_from_frequencies``.
        title: Heading displayed above the figure.
    """
    cloud_options = {
        "width": 800,
        "height": 400,
        "background_color": 'white',
        # Font file used to render the words; the path is relative to the working directory.
        "font_path": 'NotoSansTamil_CondensedRegular.ttf',
    }
    cloud = WordCloud(**cloud_options).generate_from_frequencies(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title, fontsize=16)
    plt.tight_layout(pad=0)
    plt.show()

# Render one word cloud per token counter, hate speech first
for frequencies, heading in (
    (hate_tamil_tokens_counter, "Word Cloud - Hate Speech"),
    (counter_tamil_tokens_counter, "Word Cloud - Counter Narratives"),
):
    generate_wordcloud(frequencies, heading)


The below code analyzes the text length distribution of Tamil hate speech and counter-narrative samples by plotting histograms using Seaborn.

It also visualizes how many characters are typically used in each category, helping to compare their verbosity and structure.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the seed data
df = pd.read_csv("tamil-hs-cn-seed-data.csv")  # Replace with your file path

# Keep rows where both texts are present, then record each text's length
# (len() of the string, i.e. the number of Unicode code points)
df = df.dropna(subset=['Hate speech', 'Counter Narrative from dataset'])
df['Hate_speech_length'] = df['Hate speech'].map(lambda text: len(str(text)))
df['Counter_Narrative_length'] = df['Counter Narrative from dataset'].map(lambda text: len(str(text)))

# Seaborn theme
sns.set(style="whitegrid", font_scale=1.2)

# One figure, two overlaid histograms (each with its own 30 bins)
plt.figure(figsize=(12, 6))

for length_column, bar_colour, legend_label in (
    ('Hate_speech_length', '#fb7f64', 'Hate Speech'),
    ('Counter_Narrative_length', '#4fcafc', 'Counter Narrative'),
):
    sns.histplot(df[length_column], bins=30, kde=False, color=bar_colour, label=legend_label, alpha=0.7)

# Title and axis labels
plt.title('Distribution of Text Length in Hate Speech and Counter Narratives', fontsize=16, pad=20)
plt.xlabel('Number of characters per text', fontsize=12)
plt.ylabel('Number of text observations', fontsize=12)

# Ticks: fixed x positions every 100 characters from 0 to 800; dashed horizontal grid
plt.xticks(fontsize=11)
plt.xticks(range(0, 900, 100))
plt.yticks(fontsize=11)
plt.grid(axis='y', linestyle='--', linewidth=0.7, alpha=0.7)

# Legend
plt.legend(title='Category', title_fontsize=12, fontsize=11)

# Final layout and display
plt.tight_layout()
plt.show()
