In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoConfig

In [None]:
# Data setup: column names first, then load the CSV and split it 80/20.
TEXT_COL = "text_no_stopwords"
labels_list = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']
NUM_LABELS = len(labels_list)

df = pd.read_csv("hate_speech_hindi_final.csv")
# Store every label column as an integer indicator.
df = df.astype(dict.fromkeys(labels_list, int))

# Stratification key: argmax across the label columns of each row.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df[labels_list].to_numpy().argmax(axis=1),
)

In [None]:
# Tokenizer and run-wide hyperparameters.
MODEL_NAME = "jplu/tf-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 5

# Missing text cells are read back from CSV as NaN, and a bare .astype(str) would turn them
# into the literal word "nan". Replace them with an empty string before tokenizing.
# NOTE(review): whether this CSV actually contains empty text rows is not visible here.
train_enc = tokenizer(
    train_df[TEXT_COL].fillna("").astype(str).tolist(),
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
)
val_enc = tokenizer(
    val_df[TEXT_COL].fillna("").astype(str).tolist(),
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
)

# Multi-hot label matrices as float32, one column per entry of labels_list.
train_y = train_df[labels_list].values.astype("float32")
val_y   = val_df[labels_list].values.astype("float32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/512 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

In [None]:
def make_tf_dataset(encodings, labels, shuffle=None):
    """Build a batched tf.data.Dataset of (inputs, labels) pairs.

    Args:
        encodings: Mapping that provides "input_ids" and "attention_mask".
        labels: Label array sliced along its first axis together with the inputs.
        shuffle: True to shuffle with a buffer of len(labels), False to keep the
            original order. None (the default) keeps the legacy behaviour of
            shuffling only when len(labels) equals len(train_y).

    Returns:
        The dataset batched into groups of BATCH_SIZE.
    """
    inputs = {
        "input_ids": np.array(encodings["input_ids"]),
        "attention_mask": np.array(encodings["attention_mask"]),
    }
    dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
    if shuffle is None:
        # Legacy fallback for two-argument callers. It depends on the global train_y and
        # mis-fires if another split happens to have the same length, so pass shuffle explicitly.
        shuffle = len(labels) == len(train_y)
    if shuffle:
        dataset = dataset.shuffle(len(labels))
    return dataset.batch(BATCH_SIZE)


# Evaluation code compares predictions with val_y row by row, so validation must stay unshuffled.
train_set = make_tf_dataset(train_enc, train_y, shuffle=True)
val_set = make_tf_dataset(val_enc, val_y, shuffle=False)

In [None]:
# Model Test Function
# Configuration shared by every test run: one output per label, multi-label problem type.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
)

def run_model_test(lr, name, seed=42):
    """Fine-tune a freshly loaded model at one learning rate and score it on the validation set.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        name: Label used in the printed banner and in the returned record.
        seed: Seed given to tf.keras.utils.set_random_seed before the model is built,
            so each run starts from the same RNG state. Pass None to skip seeding.

    Returns:
        dict with the keys "Name", "LR", "Exact Match Acc." and "Macro F1".
    """
    # Release Keras state left over from a previous run before loading another copy of the model.
    tf.keras.backend.clear_session()
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)

    def binary_accuracy(labels, logits):
        """Per-label accuracy at probability 0.5; the model outputs raw logits."""
        # A Keras threshold is compared with y_pred as-is, so logits must go through a sigmoid
        # first. Thresholding raw logits at 0.5 would correspond to probability ~0.62.
        return tf.keras.metrics.binary_accuracy(labels, tf.sigmoid(logits), threshold=0.5)

    print(f"\n--- Starting Model: {name} (LR={lr:.1e}) ---")
    model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[binary_accuracy])

    model.fit(train_set, validation_data=val_set, epochs=EPOCHS, verbose=1)

    # predict() may return an output object carrying `.logits` or the logits themselves.
    pred_outputs = model.predict(val_set)
    logits = pred_outputs.logits if hasattr(pred_outputs, "logits") else pred_outputs
    probs = tf.sigmoid(logits).numpy()
    y_pred = (probs >= 0.5).astype(int)
    y_true = val_y

    # On multi-label indicator input, accuracy_score counts a row only if every label matches.
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)

    print(f"\nResults for {name}:")
    print(f"  Exact Match Accuracy: {acc:.4f}")
    print(f"  Macro F1 Score: {macro_f1:.4f}")
    print("\n  Classification Report:")
    print(classification_report(y_true, y_pred, target_names=labels_list, zero_division=0))

    return {"Name": name, "LR": lr, "Exact Match Acc.": acc, "Macro F1": macro_f1}

In [None]:
# Learning-rate sweep: train one model per rate and tabulate the validation scores.
LEARNING_RATES = [1e-5, 5e-5]
# run_model_test prints "(LR=...)" after the name itself, so the names must not repeat it.
MODEL_NAMES = ["Model A", "Model B"]
all_results = []

for lr, name in zip(LEARNING_RATES, MODEL_NAMES):
    result = run_model_test(lr, name)
    all_results.append(result)

comparison_df = pd.DataFrame(all_results)

print("\n\nFinal Learning Rate Comparison:")
# Keep LR numeric and give each column its own float format. Pre-formatting LR as a string
# does not help: the table renderer parses it back into a number and applies ".4f", which
# prints 1e-5 as 0.0000.
print(comparison_df.to_markdown(index=False, floatfmt=(".4f", ".1e", ".4f", ".4f")))


--- Starting Model: Model A (LR=1.0e-5) (LR=1.0e-05) ---


tf_model.h5:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

Some layers of TFXLMRobertaForSequenceClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Results for Model A (LR=1.0e-5):
  Exact Match Accuracy: 0.7189
  Macro F1 Score: 0.7634

  Classification Report:
              precision    recall  f1-score   support

  defamation       0.88      0.71      0.79      2000
        hate       0.73      0.72      0.72      1996
    non-hate       0.77      0.69      0.73      2000
    violence       0.80      0.75      0.77      2004
      vulgar       0.89      0.74      0.81      1975

   micro avg       0.81      0.72      0.76      9975
   macro avg       0.81      0.72      0.76      9975
weighted avg       0.81      0.72      0.76      9975
 samples avg       0.82      0.80      0.80      9975


--- Starting Model: Model B (LR=5.0e-5) (LR=5.0e-05) ---


All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

Some layers of TFXLMRobertaForSequenceClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Results for Model B (LR=5.0e-5):
  Exact Match Accuracy: 0.0000
  Macro F1 Score: 0.0000

  Classification Report:
              precision    recall  f1-score   support

  defamation       0.00      0.00      0.00      2000
        hate       0.00      0.00      0.00      1996
    non-hate       0.00      0.00      0.00      2000
    violence       0.00      0.00      0.00      2004
      vulgar       0.00      0.00      0.00      1975

   micro avg       0.00      0.00      0.00      9975
   macro avg       0.00      0.00      0.00      9975
weighted avg       0.00      0.00      0.00      9975
 samples avg       0.00      0.00      0.00      9975



Final Learning Rate Comparison:
| Name                |     LR |   Exact Match Acc. |   Macro F1 |
|:--------------------|-------:|-------------------:|-----------:|
| Model A (LR=1.0e-5) | 0.0000 |             0.7189 |     0.7634 |
| Model B (LR=5.0e-5) | 0.0001 |             0.0000 |   

# **Different Thresholds**




In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoConfig

In [None]:
# Column names used by the rest of this section
TEXT_COL = "text_no_stopwords"
labels_list = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']
NUM_LABELS = len(labels_list)

# Read the dataset and cast each multi-label column to int
df = pd.read_csv("hate_speech_hindi_final.csv")
df = df.astype({label: int for label in labels_list})

# 80/20 train/validation split, stratified on the argmax across the label columns
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
    stratify=np.argmax(df[labels_list].to_numpy(), axis=1),
)


In [None]:
# Tokenizer & Encoding
MODEL_NAME = "jplu/tf-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 5
BEST_LR = 1e-5

# Tokenize text.
# Missing text cells are read back from CSV as NaN, and a bare .astype(str) would turn them
# into the literal word "nan". Replace them with an empty string before tokenizing.
# NOTE(review): whether this CSV actually contains empty text rows is not visible here.
train_enc = tokenizer(
    train_df[TEXT_COL].fillna("").astype(str).tolist(),
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN
)

val_enc = tokenizer(
    val_df[TEXT_COL].fillna("").astype(str).tolist(),
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN
)

# Labels as float32 multi-hot matrices, one column per entry of labels_list
train_y = train_df[labels_list].values.astype("float32")
val_y   = val_df[labels_list].values.astype("float32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/512 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

In [None]:
# Make tf.data Datasets
def make_tf_dataset(encodings, labels, shuffle=False):
    """Wrap tokenizer encodings and labels in a batched tf.data.Dataset.

    Only "input_ids" and "attention_mask" are taken from `encodings`. When
    `shuffle` is true the examples are shuffled with a buffer of len(labels)
    before being grouped into batches of BATCH_SIZE.
    """
    features = {
        key: np.array(encodings[key])
        for key in ("input_ids", "attention_mask")
    }
    examples = tf.data.Dataset.from_tensor_slices((features, labels))
    ordered = examples.shuffle(len(labels)) if shuffle else examples
    return ordered.batch(BATCH_SIZE)


# Training data is shuffled; validation keeps its original row order.
train_set = make_tf_dataset(train_enc, train_y, True)
val_set = make_tf_dataset(val_enc, val_y)

In [None]:
# 4. Build & Train Final Model
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

best_model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=config
)


def binary_accuracy(labels, logits):
    """Per-label accuracy at probability 0.5 for a model that outputs raw logits."""
    # A Keras threshold is compared with y_pred as-is, so logits must go through a sigmoid
    # first. Thresholding raw logits at 0.5 would correspond to probability ~0.62.
    return tf.keras.metrics.binary_accuracy(labels, tf.sigmoid(logits), threshold=0.5)


optimizer = tf.keras.optimizers.Adam(learning_rate=BEST_LR)
# from_logits=True: the loss applies the sigmoid itself, so the model output stays as logits.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

best_model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=[binary_accuracy]
)

print("\n===== Training Final Model =====")
history = best_model.fit(
    train_set,
    validation_data=val_set,
    epochs=EPOCHS,
    verbose=1
)

tf_model.h5:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

Some layers of TFXLMRobertaForSequenceClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Training Final Model =====
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Validation probabilities
print("\n===== Getting Validation Predictions =====")
y_true = val_y
val_outputs = best_model.predict(val_set)

# predict() may return an object exposing `.logits` or the logits directly; handle both.
logits = getattr(val_outputs, "logits", val_outputs)
# Independent per-label probabilities via an element-wise sigmoid.
probs = tf.math.sigmoid(logits).numpy()


===== Getting Validation Predictions =====


In [None]:
#Threshold Evaluation Function


def evaluate_threshold(threshold, name=None):
    """Score the validation probabilities at one decision threshold.

    Reads the module-level `probs`, `y_true` and `labels_list`. Labels whose
    probability is >= `threshold` are predicted positive. Prints exact-match
    accuracy, macro F1 and the per-label classification report.

    Args:
        threshold: Probability cut-off applied to every label.
        name: Display name; defaults to "Threshold = <threshold>".

    Returns:
        dict with the keys "Name", "Threshold", "Exact Match Acc." and "Macro F1".
    """
    label = f"Threshold = {threshold:.2f}" if name is None else name

    # Binarize the probabilities at the requested cut-off
    predicted = (probs >= threshold).astype(int)

    exact_match = accuracy_score(y_true, predicted)
    f1_macro = f1_score(y_true, predicted, average="macro", zero_division=0)

    summary_lines = [
        f"\n===== Results for {label} =====",
        f"  Threshold: {threshold:.2f}",
        f"  Exact Match Accuracy: {exact_match:.4f}",
        f"  Macro F1 Score:       {f1_macro:.4f}",
        "\n  Classification Report:",
    ]
    print("\n".join(summary_lines))
    print(classification_report(y_true, predicted, target_names=labels_list, zero_division=0))

    record = {"Name": label, "Threshold": threshold}
    record["Exact Match Acc."] = exact_match
    record["Macro F1"] = f1_macro
    return record


# Compare Threshold
# Evaluate the already-computed validation probabilities at each cut-off and tabulate the scores.
THRESHOLDS = [0.3, 0.4]
all_thresh_results = []

for th in THRESHOLDS:
    res = evaluate_threshold(threshold=th, name=f"Model (thr={th:.1f})")
    all_thresh_results.append(res)

comparison_df = pd.DataFrame(all_thresh_results)

print("\n\n===== Final Threshold Comparison =====")
# Keep Threshold numeric and give each column its own float format. A pre-formatted string
# such as "0.30" is parsed back into a number by the table renderer and re-rendered with ".4f".
print(comparison_df.to_markdown(index=False, floatfmt=(".4f", ".2f", ".4f", ".4f")))


===== Results for Model (thr=0.3) =====
  Threshold: 0.30
  Exact Match Accuracy: 0.6957
  Macro F1 Score:       0.7638

  Classification Report:
              precision    recall  f1-score   support

  defamation       0.74      0.79      0.76      2000
        hate       0.73      0.73      0.73      1996
    non-hate       0.68      0.79      0.73      2000
    violence       0.77      0.80      0.79      2004
      vulgar       0.77      0.86      0.81      1975

   micro avg       0.74      0.79      0.76      9975
   macro avg       0.74      0.79      0.76      9975
weighted avg       0.74      0.79      0.76      9975
 samples avg       0.81      0.85      0.82      9975


===== Results for Model (thr=0.4) =====
  Threshold: 0.40
  Exact Match Accuracy: 0.7285
  Macro F1 Score:       0.7661

  Classification Report:
              precision    recall  f1-score   support

  defamation       0.81      0.74      0.78      2000
        hate       0.77      0.67      0.71      1996
