In [1]:
import torch

print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print(
    "GPU Name:",
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU",
)

PyTorch Version: 2.5.1+cu121
CUDA Available: True
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    AutoConfig,
    AutoModelForMaskedLM,
)
from adapters import AutoAdapterModel, AdapterConfig, AdapterTrainer

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# === 1. Load the raw Bhojpuri text file ===
dataset_path = "Datasets\Bhojpuri_unlabeeeld data.txt"

with open(dataset_path, "r", encoding="utf-8") as f:
    lines = f.readlines()
# Remove blank lines and strip whitespaces
text_data = [line.strip() for line in lines if line.strip()]

# Convert to HuggingFace Dataset
dataset = Dataset.from_dict({"text": text_data})

In [3]:
# === 2. Load tokenizer and model ===
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [5]:
# === 4. Tokenize dataset ===
# For CLM, ensure tokenizer has a pad_token and that labels are shifted input_ids
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))  # Resize again if adding pad_token


def tokenize_function(examples):
    return tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=128
    )


tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

Map: 100%|██████████| 15590/15590 [00:01<00:00, 14160.48 examples/s]


In [6]:
print(f"Tokenizer vocabulary size: {tokenizer.vocab_size}")
print(f"Model embedding layer vocabulary size: {model.config.vocab_size}")

Tokenizer vocabulary size: 200000
Model embedding layer vocabulary size: 200000


In [7]:
# Remove non-numeric columns from tokenized_dataset
columns_to_remove = ["tokens", "ner_tags"]  # keep only model input columns
tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in columns_to_remove if col in tokenized_dataset.column_names]
)

print(tokenized_dataset)  # Should only show input_ids, attention_mask, labels

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 15590
})


In [8]:
max_token_id = 0
for example in tokenized_dataset:
    for input_id in example["input_ids"]:
        if input_id > max_token_id:
            max_token_id = input_id
print(f"Maximum token ID in dataset: {max_token_id}")

Maximum token ID in dataset: 199646


In [9]:
print(tokenized_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 15590
})


In [10]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [4]:
from adapters import AutoAdapterModel
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "ai4bharat/indic-bert",
)
model = AutoAdapterModel.from_pretrained(
    "ai4bharat/indic-bert",
    config=config,
)

In [None]:
  # <-- Add this line

In [7]:
# === 3. Add a language adapter ===
from adapters import AutoAdapterModel
from adapters import AdapterConfig

lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
model.add_masked_lm_head("indic_bhojpuri_lang_adapter", overwrite_ok=True)
model.train_adapter("indic_bhojpuri_lang_adapter")

ValueError: No adapter with name 'indic_bhojpuri_lang_adapter' found. Please make sure that all specified adapters are correctly loaded.

In [6]:
model.set_active_adapters("indic_bhojpuri_lang_adapter")

ValueError: No adapter with name 'indic_bhojpuri_lang_adapter' found. Please make sure that all specified adapters are correctly loaded.

In [23]:
# === 6. Training arguments ===
training_args = TrainingArguments(
    output_dir="./indic_bhojpuri_lang_adapter",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    learning_rate=5e-5,
    remove_unused_columns=False,
    report_to="none",
    no_cuda=not torch.cuda.is_available(),  # Use CPU if CUDA is not available
)

# === 7. Trainer setup ===
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset.select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [24]:
trainer.train()

Step,Training Loss
50,10.0326
100,7.857
150,6.9813


TrainOutput(global_step=189, training_loss=8.020880805121529, metrics={'train_runtime': 642.086, 'train_samples_per_second': 4.672, 'train_steps_per_second': 0.294, 'total_flos': 19969818624000.0, 'train_loss': 8.020880805121529, 'epoch': 3.0})

In [25]:
model.save_adapter("./indic_bhojpuri_lang_adapter", "indic_bhojpuri_lang_adapter")