In [2]:
import torch

# Report the installed PyTorch build and whether a CUDA device is usable.
print("PyTorch Version:", torch.__version__)

cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)

# Only query the device name when CUDA is actually available.
gpu_label = torch.cuda.get_device_name(0) if cuda_ready else "No GPU"
print("GPU Name:", gpu_label)

PyTorch Version: 2.5.1+cu121
CUDA Available: True
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [7]:
import os
import re
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoTokenizer

In [9]:
# Tag-to-id mapping for the NER labels; ids follow the order listed here.
label2id = {
    tag: idx
    for idx, tag in enumerate(
        [
            "O",
            "B-org",
            "I-org",
            "B-per",
            "I-per",
            "B-geo",
            "I-geo",
            # Add more if needed
        ]
    )
}

# === Step 1: Load raw dataset ===
# One plain-text file per split and per language, read with the "text" builder.
hindi_files = {
    "train": r"E:\Research\Datasets\updated_adapter_data\Source_language( Task adapter)\hindi\naamapadam-train_mapped.txt",
    "test": r"E:\Research\Datasets\updated_adapter_data\Source_language( Task adapter)\hindi\naamapadam-test_mapped.txt",
    "validation": r"E:\Research\Datasets\updated_adapter_data\Source_language( Task adapter)\hindi\naamapadam-validation_mapped.txt",
}
marathi_files = {
    "train": r"E:\Research\Datasets\Adapter_NER_data\Source_language( Task adapter)\marathi\naamapadam-train_mapped.txt",
    "test": r"E:\Research\Datasets\Adapter_NER_data\Source_language( Task adapter)\marathi\naamapadam-test_mapped.txt",
    "validation": r"E:\Research\Datasets\Adapter_NER_data\Source_language( Task adapter)\marathi\naamapadam-validation_mapped.txt",
}
hindi_dataset = load_dataset("text", data_files=hindi_files)
marathi_dataset = load_dataset("text", data_files=marathi_files)

# Merge the two languages split by split (Hindi rows first, then Marathi).
# NOTE: the "test" files are loaded above but are not merged here.
dataset = {
    split_name: concatenate_datasets(
        [hindi_dataset[split_name], marathi_dataset[split_name]]
    )
    for split_name in ("train", "validation")
}

# === Step 2: Parse text lines ===
def parse_token_tag_pairs(split, source=None, tag_to_id=None):
    """Turn ``TOKENS:`` / ``TAGS:`` text rows into token-classification samples.

    A sample is produced for every ``TOKENS:`` line that is immediately
    followed (ignoring blank lines) by a ``TAGS:`` line with the same number
    of whitespace-separated items. Anything else is skipped one line at a
    time, so a single stray or dangling line cannot shift the pairing of the
    lines that follow it.

    Args:
        split: Key of the split to read (e.g. ``"train"``).
        source: Mapping from split name to an iterable of ``{"text": str}``
            rows. Defaults to the notebook-level ``dataset``.
        tag_to_id: Mapping from tag string to integer id. Defaults to the
            notebook-level ``label2id``.

    Returns:
        A list of ``{"tokens": list[str], "ner_tags": list[int]}`` dicts.
        Tags missing from ``tag_to_id`` fall back to id ``0``.
    """
    # Resolve the notebook globals lazily so explicit arguments never touch them.
    if source is None:
        source = dataset
    if tag_to_id is None:
        tag_to_id = label2id

    token_prefix, tag_prefix = "TOKENS:", "TAGS:"

    stripped_rows = (row["text"].strip() for row in source[split])
    lines = [text for text in stripped_rows if text]

    samples = []
    pos = 0
    last = len(lines) - 1  # a pair needs lines[pos] and lines[pos + 1]
    while pos < last:
        token_line, tag_line = lines[pos], lines[pos + 1]

        if not (token_line.startswith(token_prefix) and tag_line.startswith(tag_prefix)):
            # Not a well-formed pair: advance by one line to resynchronise
            # instead of discarding a potentially valid neighbour.
            pos += 1
            continue
        pos += 2

        # Strip only the leading marker; the same text inside the sentence
        # is a real token and must be kept.
        tokens = token_line[len(token_prefix):].split()
        tags = tag_line[len(tag_prefix):].split()

        if len(tokens) != len(tags):
            continue

        samples.append(
            {"tokens": tokens, "ner_tags": [tag_to_id.get(tag, 0) for tag in tags]}
        )

    return samples


# === Step 3: Create DatasetDict ===
# Parse each merged split and wrap the resulting samples in a Dataset.
hf_dataset = DatasetDict(
    {
        split_name: Dataset.from_list(parse_token_tag_pairs(split_name))
        for split_name in ("train", "validation")
    }
)

# === Step 4: Inspect ===
train_split = hf_dataset["train"]
print("Example sample:\n", train_split[0])
print("Train size:", len(train_split))

Example sample:
 {'tokens': ['सेक्टर', '55/56', 'के', 'एसएचओ', 'अरविंद', 'कुमार', 'ने', 'बताया', 'कि', 'इस', 'मामले', 'में', 'आईपीसी', 'की', 'धारा', '376', '-', 'डी', '(', 'गैंगरेप', ')', 'के', 'तहत', 'मामला', 'दर्ज', 'कर', 'लिया', 'गया', 'है', '।'], 'ner_tags': [0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
Train size: 1382248


In [10]:
# Show the split summary, then carve out a 50-row preview of the training split.
print(hf_dataset)
preview_indices = range(50)
first_50 = hf_dataset["train"].select(preview_indices)
print(first_50)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1382248
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 15760
    })
})
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 50
})


In [11]:
# === Step 5: Tokenization function ===
# Checkpoint name shared by the tokenizer here and by the model loaded later
# via `model_checkpoint`.
model_checkpoint = "ai4bharat/indic-bert"  # 🔁 Change if needed
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  # 🔁 Change if needed


def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and project its word tags onto sub-tokens.

    The notebook-level ``tokenizer`` encodes ``example["tokens"]`` padded and
    truncated to 128 positions. Only the first sub-token of each word gets the
    word's tag from ``example["ner_tags"]``; positions with no source word and
    continuation sub-tokens are labelled ``-100``.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"ner_tags"``
            (one integer tag per word).

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    word_tags = example["ner_tags"]
    aligned = []
    last_word = None

    for current_word in encoding.word_ids():
        # A position is labelled only when it opens a new source word.
        opens_word = current_word is not None and current_word != last_word
        aligned.append(word_tags[current_word] if opens_word else -100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding


# Run the aligner one example at a time over every split of hf_dataset.
tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=False)

# Optional check: show the first encoded training example.
first_encoded = tokenized_dataset["train"][0]
print(first_encoded)

Map: 100%|██████████| 1382248/1382248 [15:36<00:00, 1476.13 examples/s]
Map: 100%|██████████| 15760/15760 [00:06<00:00, 2301.75 examples/s]


{'tokens': ['सेक्टर', '55/56', 'के', 'एसएचओ', 'अरविंद', 'कुमार', 'ने', 'बताया', 'कि', 'इस', 'मामले', 'में', 'आईपीसी', 'की', 'धारा', '376', '-', 'डी', '(', 'गैंगरेप', ')', 'के', 'तहत', 'मामला', 'दर्ज', 'कर', 'लिया', 'गया', 'है', '।'], 'ner_tags': [0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [2, 30265, 8430, 4415, 429, 13043, 1883, 90621, 13734, 109832, 22901, 236, 78701, 1134, 1883, 70, 2092, 29073, 2092, 941, 89842, 1883, 14855, 69922, 13, 4086, 20, 2344, 8213, 1551, 28, 1883, 1694, 2092, 29073, 1902, 1325, 68, 55450, 2344, 1134, 4384, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
# Keep a 50-example slice of the tokenized training split and show its schema.
# NOTE: the trainer below is built from tokenized_dataset["train"], not from this slice.
subset_indices = range(50)
train_dataset = tokenized_dataset["train"].select(subset_indices)
print(train_dataset.features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [13]:
# Report how many examples each tokenized split holds.
for caption, split_key in (
    ("Training examples after filtering:", "train"),
    ("Validation examples:", "validation"),
):
    print(caption, len(tokenized_dataset[split_key]))

Training examples after filtering: 1382248
Validation examples: 15760


In [14]:
# Show the column schema of the full tokenized training split.
print(tokenized_dataset["train"].features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [15]:
from transformers import AutoModelForTokenClassification

# Token-classification model built from the same checkpoint as the tokenizer.
# NOTE: `model` is reassigned further down when the AutoAdapterModel is loaded.
id2label = {idx: tag for tag, idx in label2id.items()}
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),  # one output per entry in label2id
    id2label=id2label,
    label2id=label2id,
)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Drop the raw word/tag columns so only model inputs and labels remain.
# Filtering against the existing columns keeps this cell safe to re-run.
columns_to_remove = ["tokens", "ner_tags"]
present_columns = tokenized_dataset["train"].column_names
removable = [name for name in columns_to_remove if name in present_columns]
tokenized_dataset = tokenized_dataset.remove_columns(removable)

print(tokenized_dataset["train"])  # Should only show input_ids, attention_mask, labels

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1382248
})


In [17]:
from adapters import AutoAdapterModel
from transformers import AutoConfig

# Load the adapter-capable base model from the same checkpoint as the
# tokenizer. Reusing `model_checkpoint` (instead of repeating the literal
# name) keeps tokenizer and model in sync if the checkpoint is changed.
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoAdapterModel.from_pretrained(
    model_checkpoint,
    config=config,
)

In [18]:
from adapters import AdapterConfig

# "pfeiffer" adapter configuration with a reduction factor of 16.
task_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=16)

# Unused alternative kept for reference (loading a pretrained language adapter):
#   config = AdapterConfig.load("pfeiffer", non_linearity="relu", reduction_factor=2)
#   model.load_adapter("hi/wiki@ukp", config=config)

# Register the task adapter and a tagging head under the same name.
model.add_adapter("hindi_marathi_adapter", config=task_adapter_config)
model.add_tagging_head(
    "hindi_marathi_adapter",
    num_labels=len(label2id),
    # Attach the label names to the head, mirroring the id2label mapping
    # given to the token-classification model earlier in the notebook.
    id2label={idx: tag for tag, idx in label2id.items()},
)

In [19]:
# Put the model into adapter-training mode for the task adapter.
# NOTE(review): presumably this freezes the base weights — confirm in the adapters docs.
model.train_adapter("hindi_marathi_adapter")

There are adapters available but none are activated for the forward pass.


In [20]:
# Explicitly activate the task adapter for the forward pass (the previous
# cell printed a "none are activated" warning).
model.set_active_adapters("hindi_marathi_adapter")

In [23]:
from adapters import AdapterTrainer
from transformers import TrainingArguments


# Training configuration, grouped as: output location, optimisation, batching, logging.
training_args = TrainingArguments(
    output_dir="./hindi_marathi",
    overwrite_output_dir=True,
    learning_rate=1e-4,
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    # Important: keep every dataset column so the labels reach the model.
    remove_unused_columns=False,
)

# Trainer over the full tokenized train/validation splits.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["validation"]
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [None]:
# Start training with the AdapterTrainer configured in the previous cell.
trainer.train()

In [None]:
# Save the task adapter to disk.
# NOTE: this directory differs from the trainer's output_dir ("./hindi_marathi").
adapter_save_dir = "./results/hindi_marathi"
adapter_name = "hindi_marathi_adapter"
model.save_adapter(adapter_save_dir, adapter_name)