In [None]:
import torch
from transformers import AutoModel

print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print(
    "GPU Name:",
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU",
)               
device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch Version: 2.5.1+cu121
CUDA Available: True
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    AutoConfig,
    AutoModelForMaskedLM,
)
from adapters import AutoAdapterModel, AdapterConfig, AdapterTrainer




In [None]:
dataset_path = "Bohjpuri_unlabed.txt"

with open(dataset_path, "r", encoding="utf-8") as f:
    lines = f.readlines()
text_data = [line.strip() for line in lines if line.strip()]
dataset = Dataset.from_dict({"text": text_data})

In [None]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
from adapters import AutoAdapterModel
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "xlm-roberta-base",
)
model = AutoAdapterModel.from_pretrained(
    "xlm-roberta-base",
    config=config,
)

Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from adapters import AutoAdapterModel
from adapters import AdapterConfig

lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
model.add_adapter("bhojpuri_lang_adapter", config=lang_adapter_config)
model.train_adapter("bhojpuri_lang_adapter")

There are adapters available but none are activated for the forward pass.


In [None]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer)) 
def tokenize_function(examples):
    return tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=128
    )
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

Map: 100%|██████████| 15590/15590 [00:00<00:00, 20962.03 examples/s]


In [10]:
max_token_id = 0
for example in tokenized_dataset:
    for input_id in example["input_ids"]:
        if input_id > max_token_id:
            max_token_id = input_id
print(f"Maximum token ID in dataset: {max_token_id}")

Maximum token ID in dataset: 243771


In [12]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# === 6. Training arguments ===
training_args = TrainingArguments(
    output_dir="./bhojpuri_lang_adapter",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    learning_rate=1e-5,
    remove_unused_columns=False,
    report_to="none",

)

# === 7. Trainer setup ===
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset.select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

In [None]:
model.save_adapter("./bhojpuri_lang_adapter", "bhojpuri_lang_adapter")