In [None]:
import os
from transformers import AutoModel
import torch

# Restrict this process to the first physical GPU. These variables are set
# before any CUDA call is made in this notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use only GPU 0
# Synchronous kernel launches: CUDA errors are reported at the failing call,
# which helps debugging but slows execution.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Device-side assertions are left off ("0"). This is a debugging aid, not a
# performance switch.
# NOTE(review): PyTorch describes TORCH_USE_CUDA_DSA as a build-time option;
# confirm that setting it as an environment variable has any effect.
os.environ["TORCH_USE_CUDA_DSA"] = "0"

# Fall back to the CPU when no GPU is visible so this cell does not crash on
# CPU-only machines (the print below already anticipates that case).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Base encoder without a task head. A later cell rebinds `model` to a
# token-classification model, so this instance only serves as a load check.
model = AutoModel.from_pretrained("xlm-roberta-base")
model = model.to(device)
print(
    "GPU Name:",
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU",
)

In [None]:
from datasets import Dataset, DatasetDict, load_dataset

# NER tag inventory in id order: index 0 is "O" (outside any entity), followed
# by begin/inside pairs for person, organisation and location.
label2id = {
    tag: index
    for index, tag in enumerate(
        ("O", "B-per", "I-per", "B-org", "I-org", "B-loc", "I-loc")
    )
}
# Reverse lookup used when decoding predicted ids back to tag strings.
id2label = dict((index, tag) for tag, index in label2id.items())
num_labels = len(id2label)


# Generic parsing function for a dataset file
def parse_token_tag_file(filepath, label2id):
    """Parse a ``TOKENS:`` / ``TAGS:`` annotated text file into NER samples.

    Blank lines are ignored. A sample is a line starting with ``TOKENS:``
    immediately followed (ignoring blank lines) by a line starting with
    ``TAGS:``. Both are split on whitespace.

    Args:
        filepath: Path of the UTF-8 text file to read.
        label2id: Mapping from tag string to integer id. Tags missing from
            the mapping are encoded as 0.

    Returns:
        A list of ``{"tokens": list[str], "ner_tags": list[int]}`` dicts.
        Pairs whose token and tag counts differ are skipped.
    """
    tokens_prefix = "TOKENS:"
    tags_prefix = "TAGS:"

    with open(filepath, encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]

    samples = []
    i = 0
    # A pair needs two lines, so the final line can never start a sample; this
    # bound also avoids indexing past the end on a trailing unpaired line.
    while i < len(lines) - 1:
        token_line, tag_line = lines[i], lines[i + 1]

        if not (
            token_line.startswith(tokens_prefix) and tag_line.startswith(tags_prefix)
        ):
            # Advance by one line only, so a single stray line does not shift
            # every following pair out of alignment.
            i += 1
            continue
        i += 2

        # Strip only the leading marker; token text that happens to contain
        # the marker string is preserved.
        tokens = token_line[len(tokens_prefix):].split()
        tags = tag_line[len(tags_prefix):].split()

        if len(tokens) != len(tags):
            continue

        samples.append(
            {"tokens": tokens, "ner_tags": [label2id.get(tag, 0) for tag in tags]}
        )

    return samples


# --- Hindi (source language) ---
hindi_train = parse_token_tag_file(
    filepath="Datasets/updated_adapter_data/Source_language( Task adapter)/hindi/naamapadam-train_mapped.txt",
    label2id=label2id,
)
# NOTE(review): the "validation" split is read from the *test* file; confirm
# this is intentional.
hindi_val = parse_token_tag_file(
    filepath="Datasets/updated_adapter_data/Source_language( Task adapter)/hindi/naamapadam-test_mapped.txt",
    label2id=label2id,
)
hindi_dataset = DatasetDict(
    dict(
        train=Dataset.from_list(hindi_train),
        validation=Dataset.from_list(hindi_val),
    )
)

# --- Bhojpuri (target language) ---
bhojpuri_train = parse_token_tag_file(
    filepath="Datasets/bhojpuri/naamapadam-train_mapped.txt",
    label2id=label2id,
)
bhojpuri_test = parse_token_tag_file(
    filepath="Datasets/bhojpuri/naamapadam-test_mapped.txt",
    label2id=label2id,
)
bhojpuri_dataset = DatasetDict(
    dict(
        train=Dataset.from_list(bhojpuri_train),
        test=Dataset.from_list(bhojpuri_test),
    )
)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint shared by the tokenizer and the classifier so their vocabularies
# match.
model_name = "xlm-roberta-base"  # or "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token-classification head sized to the NER tag set; the label maps are
# stored in the model config. This rebinds `model` from the earlier cell.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

In [None]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align word-level tags to sub-words.

    The module-level ``tokenizer`` encodes ``example["tokens"]``, padded and
    truncated to 128 positions. Only the first sub-word of each word receives
    the word's tag from ``example["ner_tags"]``; every other position
    (positions with no word id, and continuation sub-words) is labelled -100.

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        # A position starts a word when it maps to a word and that word
        # differs from the one seen at the previous position.
        starts_word = current_word is not None and current_word != last_word
        aligned.append(example["ner_tags"][current_word] if starts_word else -100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding

In [None]:
# Tokenize one example at a time (the alignment function is not batched).
hindi_tokenized = hindi_dataset.map(
    function=tokenize_and_align_labels,
    batched=False,
)
bhojpuri_tokenized = bhojpuri_dataset.map(
    function=tokenize_and_align_labels,
    batched=False,
)

In [None]:
# Raw text/tag columns that are dropped so only the tokenizer outputs and
# `labels` remain.
columns_to_remove = ["tokens", "ner_tags"]


def _drop_present_columns(tokenized, candidates):
    """Remove those `candidates` that exist in the "train" split of `tokenized`."""
    present = tokenized["train"].column_names
    return tokenized.remove_columns([name for name in candidates if name in present])


hi_dataset = _drop_present_columns(hindi_tokenized, columns_to_remove)
bj_dataset = _drop_present_columns(bhojpuri_tokenized, columns_to_remove)

# Should only show input_ids, attention_mask, labels
print(hi_dataset["train"], bj_dataset["train"])

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./hindi_ner_model",
    # Optimisation schedule.
    num_train_epochs=20,
    learning_rate=5e-5,
    # Batch sizes per device.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and save a checkpoint at the end of every epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Raw columns are removed manually in an earlier cell.
    remove_unused_columns=False,
)

In [None]:
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # Full Hindi training split (no subsetting is applied here).
    train_dataset=hi_dataset["train"],
    # Per-epoch evaluation runs on the Bhojpuri test split.
    eval_dataset=bj_dataset["test"],
)

In [None]:
# Run fine-tuning with the trainer configured above (Hindi train split,
# evaluated on the Bhojpuri test split).
trainer.train()

In [None]:
# Module-level store read by the confusion-matrix cell. `compute_metrics`
# rebinds both names on every call, so they hold the tag lists of the most
# recent evaluation only.
all_true_labels = []
all_pred_labels = []

from sklearn.metrics import precision_recall_fscore_support, accuracy_score


def compute_metrics(p):
    """Compute entity-level token metrics, ignoring tokens whose gold tag is "O".

    Positions labelled -100 (special/padding positions and continuation
    sub-words) are dropped, as are positions whose gold tag is "O". Precision,
    recall and F1 are macro-averaged over the entity tags that occur in the
    gold labels. Restricting the label set keeps a predicted "O" (which has
    zero support by construction) from being averaged in as an extra class.
    Because gold-"O" tokens are excluded, entity predictions made on them are
    not counted as false positives.

    Side effect: rebinds the module-level ``all_true_labels`` and
    ``all_pred_labels`` to the tag lists of this call.

    Args:
        p: Evaluation output exposing ``predictions`` (logits, or a tuple
            whose first element is the logits) and ``label_ids``.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` (accuracy
        rounded to 4 decimals). All are 0.0 when no entity token is present.
    """
    import numpy as np

    global all_true_labels, all_pred_labels

    predictions = p.predictions
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the logits.
        predictions = predictions[0]

    preds = np.argmax(predictions, axis=-1)
    labels = np.asarray(p.label_ids)

    # Keep only positions that carry a real label.
    scored = labels != -100

    true_labels = []
    pred_labels = []
    for label_idx, pred_idx in zip(labels[scored], preds[scored]):
        true_tag = id2label[int(label_idx)]
        if true_tag == "O":  # Ignore 'O' tags
            continue
        true_labels.append(true_tag)
        pred_labels.append(id2label[int(pred_idx)])

    # Store for confusion matrix
    all_true_labels = true_labels
    all_pred_labels = pred_labels

    if not true_labels:
        # Nothing to score; avoid calling the metric functions on empty input.
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "accuracy": 0.0}

    # Average over entity tags with gold support only.
    entity_labels = sorted(set(true_labels))
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        pred_labels,
        labels=entity_labels,
        average="macro",
        zero_division=0,
    )
    acc = accuracy_score(true_labels, pred_labels)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": round(acc, 4),
    }

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluation-only arguments; columns were already pruned manually.
eval_args = TrainingArguments(
    remove_unused_columns=False,
    output_dir="./eval_output",
)

# Separate trainer so `compute_metrics` is applied to the fine-tuned model on
# the Bhojpuri test split.
eval_trainer = Trainer(
    args=eval_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=bj_dataset["test"],
)
eval_trainer.evaluate()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(true_labels, pred_labels, labels):
    """Draw an annotated heatmap of the confusion matrix for the given tags.

    Args:
        true_labels: Gold tag strings.
        pred_labels: Predicted tag strings, aligned with ``true_labels``.
        labels: Tag order used for both the rows (true) and columns (predicted).
    """
    matrix = confusion_matrix(true_labels, pred_labels, labels=labels)

    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title("NER Confusion Matrix (excluding 'O')")
    plt.show()


# Run after evaluation: the label lists are filled in by `compute_metrics`.
unique_labels = sorted(set(all_true_labels) | set(all_pred_labels))
plot_confusion_matrix(
    true_labels=all_true_labels,
    pred_labels=all_pred_labels,
    labels=unique_labels,
)