In [1]:
import os
from transformers import AutoModel
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use only GPU 0
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "0"  # Enable CUDA DSA for better performance

device = torch.device("cuda:0")
model = AutoModel.from_pretrained("xlm-roberta-base")
model = model.to(device)
print(
    "GPU Name:",
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU",
)

  from .autonotebook import tqdm as notebook_tqdm


GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
import os
import re
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define your label mapping
label2id = {
    "O": 0,
    "B-org": 1,
    "I-org": 2,
    "B-per": 3,
    "I-per": 4,
    "B-geo": 5,
    "I-geo": 6,
    # Add more if needed
}

# === Step 1: Load raw dataset ===
dataset = load_dataset(
    "text",
    data_files={
        "train": r"E:\Research\Datasets\updated_adapter_data\Source_language( Task adapter)\hindi\naamapadam-train_mapped.txt",
        "test": r"E:\Research\Datasets\updated_adapter_data\Source_language( Task adapter)\hindi\naamapadam-test_mapped.txt",
        "validation": r"E:\Research\Datasets\updated_adapter_data\Source_language( Task adapter)\hindi\naamapadam-validation_mapped.txt",
    },
)


# === Step 2: Parse text lines ===
def parse_token_tag_pairs(split):
    lines = [line["text"].strip() for line in dataset[split] if line["text"].strip()]
    samples = []

    for i in range(0, len(lines), 2):
        if not lines[i].startswith("TOKENS:") or not lines[i + 1].startswith("TAGS:"):
            continue  # Skip invalid

        tokens = lines[i].replace("TOKENS:", "").strip().split()
        tags = lines[i + 1].replace("TAGS:", "").strip().split()

        if len(tokens) != len(tags):
            continue

        samples.append(
            {"tokens": tokens, "ner_tags": [label2id.get(tag, 0) for tag in tags]}
        )

    return samples


# === Step 3: Create DatasetDict ===
hf_dataset = DatasetDict(
    {
        "train": Dataset.from_list(parse_token_tag_pairs("train")),
        "validation": Dataset.from_list(parse_token_tag_pairs("validation")),
        "test": Dataset.from_list(parse_token_tag_pairs("test")),
    }
)

# === Step 4: Inspect ===
print("Example sample:\n", hf_dataset["train"][0])
print("Train size:", len(hf_dataset["train"]))

Example sample:
 {'tokens': ['सेक्टर', '55/56', 'के', 'एसएचओ', 'अरविंद', 'कुमार', 'ने', 'बताया', 'कि', 'इस', 'मामले', 'में', 'आईपीसी', 'की', 'धारा', '376', '-', 'डी', '(', 'गैंगरेप', ')', 'के', 'तहत', 'मामला', 'दर्ज', 'कर', 'लिया', 'गया', 'है', '।'], 'ner_tags': [0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
Train size: 927000


In [4]:
print(hf_dataset)
first_50 = hf_dataset["train"].select(range(50))
print(first_50)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 927000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13460
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 867
    })
})
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 50
})


In [5]:
# === Step 4: Tokenization function ===
model_checkpoint = "ai4bharat/indic-bert"  # 🔁 Change if needed
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  # 🔁 Change if needed


def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)  # Special tokens
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])  # Use label of first subword
        else:
            labels.append(-100)  # Ignore subword tokens
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply the function to all splits
tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=False)

# Optional check
print(tokenized_dataset["train"][0])


Map: 100%|██████████| 927000/927000 [06:29<00:00, 2379.43 examples/s]
Map: 100%|██████████| 13460/13460 [00:05<00:00, 2454.92 examples/s]
Map: 100%|██████████| 867/867 [00:00<00:00, 2437.09 examples/s]

{'tokens': ['सेक्टर', '55/56', 'के', 'एसएचओ', 'अरविंद', 'कुमार', 'ने', 'बताया', 'कि', 'इस', 'मामले', 'में', 'आईपीसी', 'की', 'धारा', '376', '-', 'डी', '(', 'गैंगरेप', ')', 'के', 'तहत', 'मामला', 'दर्ज', 'कर', 'लिया', 'गया', 'है', '।'], 'ner_tags': [0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [2, 30265, 8430, 4415, 429, 13043, 1883, 90621, 13734, 109832, 22901, 236, 78701, 1134, 1883, 70, 2092, 29073, 2092, 941, 89842, 1883, 14855, 69922, 13, 4086, 20, 2344, 8213, 1551, 28, 1883, 1694, 2092, 29073, 1902, 1325, 68, 55450, 2344, 1134, 4384, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 




In [6]:
train_dataset = tokenized_dataset["train"].select(range(50))
print(train_dataset.features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [7]:
print("Training examples after filtering:", len(tokenized_dataset["train"]))
print("Validation examples:", len(tokenized_dataset["validation"]))


Training examples after filtering: 927000
Validation examples: 13460


In [8]:

print(tokenized_dataset["train"].features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [9]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),  # Ensure this matches your label count
    id2label={v: k for k, v in label2id.items()},
    label2id=label2id,
)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Remove non-numeric columns from tokenized_dataset
columns_to_remove = ["tokens", "ner_tags"]  # keep only model input columns
tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in columns_to_remove if col in tokenized_dataset["train"].column_names]
)

print(
    tokenized_dataset["train"]
)  # Should only show input_ids, attention_mask, labels

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 927000
})


In [11]:
from adapters import AutoAdapterModel
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "ai4bharat/indic-bert",
)
model = AutoAdapterModel.from_pretrained(
    "ai4bharat/indic-bert",
    config=config,
)

In [None]:
from adapters import AdapterConfig
task_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=16)
config = AdapterConfig.load("pfeiffer", non_linearity="relu", reduction_factor=2)
model.load_adapter("hi/wiki@ukp", config=config)
model.add_adapter("hindi_adapter", config=task_adapter_config)
model.add_tagging_head("hindi_adapter", num_labels=len(label2id))

In [15]:
model.train_adapter("hindi_adapter")

There are adapters available but none are activated for the forward pass.


In [17]:
model.set_active_adapters("hindi_adapter")

In [None]:
from adapters.composition import Stack

model.active_adapters = Stack("hi", "hindi_adapter")

In [18]:
from adapters import AdapterTrainer
from transformers import TrainingArguments
from datasets import concatenate_datasets

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    output_dir="./results",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

train_dataset = concatenate_datasets([dataset["train"], dataset["validation"]])

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(1000)),
    eval_dataset=tokenized_dataset["validation"],
)




In [19]:
trainer.train()

Step,Training Loss
100,0.9294
200,0.4929
300,0.4535
400,0.4206
500,0.4024
600,0.3914


TrainOutput(global_step=640, training_loss=0.5072933301329613, metrics={'train_runtime': 279.6702, 'train_samples_per_second': 71.513, 'train_steps_per_second': 2.288, 'total_flos': 120694563840000.0, 'train_loss': 0.5072933301329613, 'epoch': 20.0})

In [20]:
model.save_adapter(
    "./results/hindi_adapter",
    "hindi_adapter",
)

In [21]:
print(dataset["train"][0])
print(hf_dataset["train"][0])

{'text': 'TOKENS: सेक्टर 55/56 के एसएचओ अरविंद कुमार ने बताया कि इस मामले में आईपीसी की धारा 376 - डी ( गैंगरेप ) के तहत मामला दर्ज कर लिया गया है ।'}
{'tokens': ['सेक्टर', '55/56', 'के', 'एसएचओ', 'अरविंद', 'कुमार', 'ने', 'बताया', 'कि', 'इस', 'मामले', 'में', 'आईपीसी', 'की', 'धारा', '376', '-', 'डी', '(', 'गैंगरेप', ')', 'के', 'तहत', 'मामला', 'दर्ज', 'कर', 'लिया', 'गया', 'है', '।'], 'ner_tags': [0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [22]:
model.load_adapter("bhojpuri_lang_adapter")

'bhojpuri_lang_adapter'

In [23]:
from adapters.composition import Stack
model.active_adapters = Stack("bhojpuri_lang_adapter", "hindi_adapter")

In [37]:
# Define your label mapping
label2id = {
    "O": 0,
    "B-org": 1,
    "I-org": 2,
    "B-per": 3,
    "I-per": 4,
    "B-geo": 5,
    "I-geo": 6,
    # Add more if needed
}
dataset = load_dataset(
    "text",
    data_files={
        "train": r"E:\Research\Datasets\updated_adapter_data\Target_language( language adapter)\bhojpuri\naamapadam-train_mapped.txt",
        "test": r"E:\Research\Datasets\updated_adapter_data\Target_language( language adapter)\bhojpuri\naamapadam-test_mapped.txt",
    },
)
# === Step 2: Parse text lines ===
def parse_token_tag_pairs(split):
    lines = [line["text"].strip() for line in dataset[split] if line["text"].strip()]
    samples = []

    for i in range(0, len(lines), 2):
        if not lines[i].startswith("TOKENS:") or not lines[i + 1].startswith("TAGS:"):
            continue  # Skip invalid

        tokens = lines[i].replace("TOKENS:", "").strip().split()
        tags = lines[i + 1].replace("TAGS:", "").strip().split()

        if len(tokens) != len(tags):
            continue

        samples.append(
            {"tokens": tokens, "ner_tags": [label2id.get(tag, 0) for tag in tags]}
        )

    return samples


# === Step 3: Create DatasetDict ===
dataset_bj = DatasetDict(
    {
        "train": Dataset.from_list(parse_token_tag_pairs("train")),
        "test": Dataset.from_list(parse_token_tag_pairs("test")),
    }
)

# === Step 4: Inspect ===
print("Example sample:\n", dataset_bj["train"][0])
print("Train size:", len(dataset_bj["train"]))

Example sample:
 {'tokens': ['दोसर', 'ई', 'कि', 'ई', 'पूरा', 'कहानी', 'कल्पना', 'ह', ',', 'कवनो', 'कारण', 'से', 'ई', 'सही', 'साबित', 'हो', 'जाव', 'त', 'एकर', 'जिम्मेदारी', 'हमरा', 'पर', 'मत', 'डालल', 'जाव़', 'कहानी', '२', 'जवार', 'भर', 'में', 'केहू', 'के', 'मजाल', 'ना', 'रहे', 'कि', 'भोला', 'पहलवान', 'का', 'सोझा', 'खड़ा', 'होखे', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0]}
Train size: 11544


In [38]:
print(
    dataset_bj["train"].features
)  # Should only show input_ids, attention_mask, labels

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [39]:
dataset_bj = dataset_bj.map(tokenize_and_align_labels, batched=False)

Map: 100%|██████████| 11544/11544 [00:04<00:00, 2764.92 examples/s]
Map: 100%|██████████| 4948/4948 [00:01<00:00, 2884.95 examples/s]


In [40]:
print(dataset_bj)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11544
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4948
    })
})


In [41]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),  # Ensure this matches your label count
    id2label={v: k for k, v in label2id.items()},
    label2id=label2id,
)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Remove non-numeric columns from tokenized_dataset
columns_to_remove = ["tokens", "ner_tags"]  # keep only model input columns
dataset_bj = dataset_bj.remove_columns(
    [col for col in columns_to_remove if col in dataset_bj["train"].column_names]
)

print(tokenized_dataset["train"])  # Should only show input_ids, attention_mask, labels

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 927000
})


In [44]:
print(
    dataset_bj["train"]
)  # Should only show input_ids, attention_mask, labels

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 11544
})


In [45]:
import numpy as np
from transformers import EvalPrediction
from datasets import concatenate_datasets

def compute_accuracy(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=-1)
    mask = p.label_ids != -100
    correct = (preds == p.label_ids) & mask
    acc = correct.sum() / mask.sum()
    return {"acc": acc}

dataset_bj = concatenate_datasets([dataset_bj["train"], dataset_bj["test"]])
eval_trainer = AdapterTrainer(
    model=model,
    args=TrainingArguments(
        output_dir="./eval_output",
        remove_unused_columns=False,
    ),
    eval_dataset=dataset_bj,
    compute_metrics=compute_accuracy,
)
eval_trainer.evaluate()

{'eval_loss': 2.024326801300049,
 'eval_model_preparation_time': 0.0,
 'eval_acc': 0.08835633549110958,
 'eval_runtime': 111.9441,
 'eval_samples_per_second': 147.324,
 'eval_steps_per_second': 18.42}