In [17]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import classification_report
from scipy.stats import ttest_rel
from statsmodels.stats.contingency_tables import mcnemar
import logging
import logging
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from torch.optim import AdamW

In [5]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
# Suppress transformers warnings
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [2]:
# Install required packages (if not already installed)
!pip install datasets transformers seqeval pandas numpy scipy statsmodels torch

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia

In [7]:
# Load the Tamil NER dataset
dataset = load_dataset("xtreme", "PAN-X.ta")
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

README.md:   0%|          | 0.00/131k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/919k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/63.3k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# Define label list
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

# Define compute_metrics function (assumed from your original code)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    pred_labels = [
        [label_list[p] for p, l in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    report = classification_report(true_labels, pred_labels, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"],
    }

# Preprocessing functions
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != previous_word_id:
            new_labels.append(labels[word_id])
        else:
            new_labels.append(labels[word_id] if label_list[labels[word_id]].startswith("I-") else -100)
        previous_word_id = word_id
    return new_labels

def preprocess_data(batch, tokenizer):
    tokenized_inputs = tokenizer(
        batch["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128
    )
    all_word_ids = [tokenized_inputs.word_ids(i) for i in range(len(batch["tokens"]))]
    all_aligned_labels = [
        align_labels_with_tokens(labels, word_ids) for labels, word_ids in zip(batch["ner_tags"], all_word_ids)
    ]
    tokenized_inputs["labels"] = all_aligned_labels
    return tokenized_inputs

In [9]:
# Load tokenizers and models
tamilbert_tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/tamil-bert")
mbert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [10]:
# Replace with your Kaggle model paths or Hugging Face Hub paths
tamilbert_model_path = "/kaggle/input/tamilbert-xtreme-5epochs/tamilbert_xtreme_5epochs"  # Update with actual path
mbert_model_path = "/kaggle/input/d/dhanushmohan/mbert-xtreme-5epochs/kaggle/working/mbert_xtreme_5epochs"  # Update with actual path

tamilbert_model = AutoModelForTokenClassification.from_pretrained(
    tamilbert_model_path, num_labels=len(label_list)
)
mbert_model = AutoModelForTokenClassification.from_pretrained(
    mbert_model_path, num_labels=len(label_list)
)

In [11]:
# Preprocess dataset
train_dataset_tamilbert = train_data.map(lambda batch: preprocess_data(batch, tamilbert_tokenizer), batched=True)
validation_dataset_tamilbert = validation_data.map(lambda batch: preprocess_data(batch, tamilbert_tokenizer), batched=True)
test_dataset_tamilbert = test_data.map(lambda batch: preprocess_data(batch, tamilbert_tokenizer), batched=True)

train_dataset_mbert = train_data.map(lambda batch: preprocess_data(batch, mbert_tokenizer), batched=True)
validation_dataset_mbert = validation_data.map(lambda batch: preprocess_data(batch, mbert_tokenizer), batched=True)
test_dataset_mbert = test_data.map(lambda batch: preprocess_data(batch, mbert_tokenizer), batched=True)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
# Define data collators
tamilbert_data_collator = DataCollatorForTokenClassification(tamilbert_tokenizer)
mbert_data_collator = DataCollatorForTokenClassification(mbert_tokenizer)

In [18]:
# Set up trainers
tamilbert_trainer = Trainer(
    model=tamilbert_model,
    args=TrainingArguments(
        output_dir="./tamilbert_results",
        eval_strategy="epoch",
        save_strategy="no",
        per_device_eval_batch_size=8,
        report_to="none",
    ),
    eval_dataset=test_dataset_tamilbert,
    tokenizer=tamilbert_tokenizer,
    data_collator=tamilbert_data_collator,
    compute_metrics=compute_metrics,
)

mbert_trainer = Trainer(
    model=mbert_model,
    args=TrainingArguments(
        output_dir="./mbert_results",
        eval_strategy="epoch",
        save_strategy="no",
        per_device_eval_batch_size=8,
        report_to="none",
    ),
    eval_dataset=test_dataset_mbert,
    tokenizer=mbert_tokenizer,
    data_collator=mbert_data_collator,
    compute_metrics=compute_metrics,
)

  tamilbert_trainer = Trainer(
  mbert_trainer = Trainer(


In [19]:
# Evaluate models
tamilbert_metrics = tamilbert_trainer.evaluate()
mbert_metrics = mbert_trainer.evaluate()

print("\n=== TamilBERT Evaluation Metrics ===")
tamilbert_metrics_df = pd.DataFrame([tamilbert_metrics])
print(tamilbert_metrics_df)

print("\n=== mBERT Evaluation Metrics ===")
mbert_metrics_df = pd.DataFrame([mbert_metrics])
print(mbert_metrics_df)


=== TamilBERT Evaluation Metrics ===
   eval_loss  eval_model_preparation_time  eval_precision  eval_recall  \
0   0.409584                       0.0031        0.843128     0.873489   

    eval_f1  eval_runtime  eval_samples_per_second  eval_steps_per_second  
0  0.857992        4.7613                  210.028                 26.253  

=== mBERT Evaluation Metrics ===
   eval_loss  eval_model_preparation_time  eval_precision  eval_recall  \
0   0.248267                        0.003        0.826595     0.871878   

    eval_f1  eval_runtime  eval_samples_per_second  eval_steps_per_second  
0  0.848506        4.4107                  226.723                  28.34  


In [44]:
# Get token-level predictions
def get_token_predictions(trainer, dataset):
    predictions, labels, _ = trainer.predict(dataset)
    predictions = np.argmax(predictions, axis=-1)
    true_labels = [[l for l in label if l != -100] for label in labels]
    pred_labels = [
        [p for p, l in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    return true_labels, pred_labels

tamilbert_true, tamilbert_pred = get_token_predictions(tamilbert_trainer, test_dataset_tamilbert)
mbert_true, mbert_pred = get_token_predictions(mbert_trainer, test_dataset_mbert)

In [46]:
# McNemar's Test
def perform_mcnemar_test(true_labels, pred_labels_1, pred_labels_2):
    # Flatten predictions and true labels
    flat_true = [label for sent in true_labels for label in sent]
    flat_pred_1 = [label for sent in pred_labels_1 for label in sent]
    flat_pred_2 = [label for sent in pred_labels_2 for label in sent]
    
    # Ensure same length
    min_len = min(len(flat_true), len(flat_pred_1), len(flat_pred_2))
    flat_true = flat_true[:min_len]
    flat_pred_1 = flat_pred_1[:min_len]
    flat_pred_2 = flat_pred_2[:min_len]
    
    # Create contingency table
    both_correct = 0
    model1_correct_model2_incorrect = 0
    model1_incorrect_model2_correct = 0
    both_incorrect = 0
    
    for true, pred1, pred2 in zip(flat_true, flat_pred_1, flat_pred_2):
        if pred1 == true and pred2 == true:
            both_correct += 1
        elif pred1 == true and pred2 != true:
            model1_correct_model2_incorrect += 1
        elif pred1 != true and pred2 == true:
            model1_incorrect_model2_correct += 1
        else:
            both_incorrect += 1
    
    # Contingency table
    table = [[both_correct, model1_correct_model2_incorrect],
             [model1_incorrect_model2_correct, both_incorrect]]
    
    # Perform McNemar's test
    result = mcnemar(table, exact=True)
    print("\n=== McNemar's Test Results ===")
    print(f"Contingency Table:\n{table}")
    print(f"Statistic: {result.statistic}, p-value: {result.pvalue}")
    print(f"Significant difference (p < 0.05): {result.pvalue < 0.05}")



# Run statistical tests
perform_mcnemar_test(tamilbert_true, tamilbert_pred, mbert_pred)



=== McNemar's Test Results ===
Contingency Table:
[[2247, 5450], [97, 311]]
Statistic: 97.0, p-value: 0.0
Significant difference (p < 0.05): True


In [47]:
def get_misclassification_examples(true_labels, pred_labels_1, pred_labels_2, input_tokens, num_examples=10):
    tamilbert_correct = []
    mbert_correct = []

    for true_seq, pred_seq1, pred_seq2, tokens in zip(true_labels, pred_labels_1, pred_labels_2, input_tokens):
        for true, p1, p2, token in zip(true_seq, pred_seq1, pred_seq2, tokens):
            if len(token) <= 1:
                continue

            # TamilBERT correct, mBERT wrong
            if p1 == true and p2 != true and len(tamilbert_correct) < num_examples:
                tamilbert_correct.append({
                    "token": token,
                    "true_label": true,
                    "tamilbert_pred": p1,
                    "mbert_pred": p2
                })

            # mBERT correct, TamilBERT wrong
            elif p2 == true and p1 != true and len(mbert_correct) < num_examples:
                mbert_correct.append({
                    "token": token,
                    "true_label": true,
                    "tamilbert_pred": p1,
                    "mbert_pred": p2
                })

            if len(tamilbert_correct) >= num_examples and len(mbert_correct) >= num_examples:
                return tamilbert_correct, mbert_correct

    return tamilbert_correct, mbert_correct


In [50]:
# Assuming you have id2label dict like:
id2label = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-LOC', 4: 'I-LOC', 5: 'B-ORG', 6: 'I-ORG'}

t2_correct_m_wrong, m_correct_t2_wrong = get_comparative_misclassification_examples(
    tamilbert_true,
    tamilbert_pred,
    mbert_pred,
    input_tokens_tamilbert,  # tokens aligned to true_labels
    id2label,
    num_examples_each=10
)

print("\n=== TamilBERT correct, mBERT incorrect ===")
for ex in t2_correct_m_wrong:
    print(f"Token: {ex['token']}, True: {ex['true_label']}, TamilBERT: {ex['tamilbert_pred']}, mBERT: {ex['mbert_pred']}")

print("\n=== mBERT correct, TamilBERT incorrect ===")
for ex in m_correct_t2_wrong:
    print(f"Token: {ex['token']}, True: {ex['true_label']}, TamilBERT: {ex['tamilbert_pred']}, mBERT: {ex['mbert_pred']}")


input_tokens_tamilbert = extract_input_tokens(test_dataset_tamilbert, tamilbert_tokenizer)

t2_correct_m_wrong, m_correct_t2_wrong = get_misclassification_examples(
    tamilbert_true, tamilbert_pred, mbert_pred, input_tokens_tamilbert, num_examples=10
)

print("\n=== TamilBERT correct, mBERT incorrect ===")
for ex in t2_correct_m_wrong:
    print(f"Token: {ex['token']}")
    print(f"True Label: {ex['true_label']}")
    print(f"TamilBERT Prediction: {ex['tamilbert_pred']}")
    print(f"mBERT Prediction: {ex['mbert_pred']}")
    print("------")

print("\n=== mBERT correct, TamilBERT incorrect ===")
for ex in m_correct_t2_wrong:
    print(f"Token: {ex['token']}")
    print(f"True Label: {ex['true_label']}")
    print(f"TamilBERT Prediction: {ex['tamilbert_pred']}")
    print(f"mBERT Prediction: {ex['mbert_pred']}")
    print("------")



=== TamilBERT correct, mBERT incorrect ===
Token: ', True: O, TamilBERT: O, mBERT: I-LOC
Token: ', True: O, TamilBERT: O, mBERT: I-LOC
Token: அடங்கியுள்ள, True: O, TamilBERT: O, mBERT: I-LOC
Token: பள்ளி, True: O, TamilBERT: O, mBERT: B-LOC
Token: ஜெர்மனி, True: I-ORG, TamilBERT: I-ORG, mBERT: B-ORG
Token: கைப்பற்றியது, True: O, TamilBERT: O, mBERT: I-ORG
Token: ., True: O, TamilBERT: O, mBERT: I-ORG
Token: சிறிசேன, True: B-PER, TamilBERT: B-PER, mBERT: O
Token: அரசுக்கும், True: O, TamilBERT: O, mBERT: I-PER
Token: தனது, True: O, TamilBERT: O, mBERT: I-PER

=== mBERT correct, TamilBERT incorrect ===
Token: சோழ, True: B-ORG, TamilBERT: B-LOC, mBERT: B-ORG
Token: சபை, True: I-LOC, TamilBERT: O, mBERT: I-LOC
Token: டி, True: B-LOC, TamilBERT: B-ORG, mBERT: B-LOC
Token: கோட், True: I-LOC, TamilBERT: I-ORG, mBERT: I-LOC
Token: ##டினை, True: I-LOC, TamilBERT: I-ORG, mBERT: I-LOC
Token: கல, True: B-LOC, TamilBERT: O, mBERT: B-LOC
Token: பேராசிரியர், True: O, TamilBERT: B-PER, mBERT: O
Token