In [None]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
import torch
from google.colab import drive
from huggingface_hub import HfFolder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification  # Changed to Fast tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
# Mount Google Drive (Colab only); the training CSV is later read from
# a path under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hugging Face access token.
# Read it from the HF_TOKEN environment variable and, when that is unset or
# empty, ask for it interactively, so the secret is never written into the
# notebook source or its saved outputs.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
if not HF_TOKEN:
    # Fail here rather than much later, after training, when the upload runs.
    raise ValueError("A Hugging Face token is required (set HF_TOKEN or enter it at the prompt).")

# Hand the token to huggingface_hub's HfFolder helper; HF_TOKEN itself is
# reused further down when the model and tokenizer are pushed to the Hub.
HfFolder.save_token(HF_TOKEN)

In [None]:
from huggingface_hub import login

# Authenticate this session with the token already held in HF_TOKEN instead of
# repeating a second hard-coded secret in the notebook.
login(token=HF_TOKEN)

In [None]:
def calculate_max_length(file_path, tokenizer):
    """Return a padding/truncation length that covers 99% of the sentences.

    The CSV at ``file_path`` is read, rows are grouped by the ``'Sentence #'``
    column, and the ``'Word'`` values of each group are tokenized as one
    pre-split sentence. A sentence's length is the number of ``input_ids``
    the tokenizer returns for it. Summary statistics are printed.

    Args:
        file_path: Path of a CSV file with ``'Sentence #'`` and ``'Word'``
            columns.
        tokenizer: Callable invoked as ``tokenizer(words,
            is_split_into_words=True)`` and returning a mapping that contains
            ``'input_ids'``. If it exposes a positive integer
            ``model_max_length`` attribute, the result is capped at that
            value so the returned length is never longer than the tokenizer
            says the model supports.

    Returns:
        int: The 99th percentile of the token lengths (truncated to an
        integer), capped at ``tokenizer.model_max_length`` when available.

    Raises:
        ValueError: If the file contains no sentences.
    """
    print("Analyzing sequence lengths...")
    df = pd.read_csv(file_path)
    sentences = df.groupby('Sentence #')['Word'].apply(list).values

    # A percentile of nothing is undefined; fail with a clear message instead
    # of an opaque numpy error or a NaN-to-int conversion failure.
    if len(sentences) == 0:
        raise ValueError(f"No sentences found in {file_path!r}")

    lengths = [
        len(tokenizer(sentence, is_split_into_words=True)['input_ids'])
        for sentence in tqdm(sentences, desc="Calculating sequence lengths")
    ]

    max_len = int(np.percentile(lengths, 99))
    print("Sequence length statistics:")
    print(f"Mean length: {np.mean(lengths):.2f}")
    print(f"Median length: {np.median(lengths):.2f}")
    print(f"99th percentile length: {max_len}")
    print(f"Max length: {max(lengths)}")

    # Sentences can tokenize to more positions than the model accepts, so the
    # percentile alone is not guaranteed to be a usable max_length.
    model_limit = getattr(tokenizer, 'model_max_length', None)
    if isinstance(model_limit, int) and 0 < model_limit < max_len:
        print(f"Capping max_length at the tokenizer's model_max_length: {model_limit}")
        max_len = model_limit

    return max_len


In [None]:
class NERDataset(Dataset):
    """Token-classification dataset aligning word-level tags to tokenizer output.

    Each item is one pre-split sentence encoded to exactly ``max_len``
    positions (padded or truncated). Every position the tokenizer maps back
    to a source word receives that word's tag id, so all sub-word pieces of a
    word share its label. Positions with no source word (``word_ids()``
    returns ``None`` for them) are labelled ``-100``, the default
    ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.

    Args:
        texts: Indexable collection; ``texts[i]`` is the list of words of
            sentence ``i``.
        tags: Indexable collection parallel to ``texts``; ``tags[i][j]`` is
            the tag string of word ``j`` in sentence ``i``.
        tokenizer: Tokenizer whose return value provides ``word_ids()`` and
            ``items()`` (the notebook uses a "fast" Hugging Face tokenizer).
        tag_to_id: Mapping from tag string to integer class id.
        max_len: Fixed sequence length passed to the tokenizer as
            ``max_length``.
    """

    def __init__(self, texts, tags, tokenizer, tag_to_id, max_len):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.tag_to_id = tag_to_id
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sentence ``idx`` and return its tensors plus aligned labels.

        Returns:
            dict: The tokenizer's fields with the leading batch axis removed,
            plus ``'labels'``, a 1-D ``torch.long`` tensor with one entry per
            encoded position.

        Raises:
            KeyError: If a tag of the sentence is missing from ``tag_to_id``.
        """
        words = self.texts[idx]
        tags = self.tags[idx]

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # One label per encoded position: the owning word's tag id, or -100
        # where the position does not belong to any word.
        label_ids = [
            -100 if word_idx is None else self.tag_to_id[tags[word_idx]]
            for word_idx in encoding.word_ids()
        ]

        # Remove only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence axis when
        # max_len == 1, leaving 0-d inputs next to 1-d labels.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label_ids, dtype=torch.long)

        return item

In [None]:
# Label vocabulary (same as training): a tag's position in the list below is
# its integer class id.
tag_to_id = {
    tag: idx
    for idx, tag in enumerate([
        'B-geo', 'O', 'B-gpe', 'B-per', 'I-per', 'B-tim',
        'B-org', 'I-org', 'B-art', 'I-art', 'I-tim',
        'B-eve', 'I-eve', 'I-geo', 'I-gpe', 'B-nat', 'I-nat',
    ])
}
# Reverse lookup: integer class id -> tag string.
id_to_tag = dict(zip(tag_to_id.values(), tag_to_id.keys()))

In [None]:
# Initialize tokenizer first to calculate max_length
print("Initializing tokenizer...")
# The fast tokenizer is required: NERDataset aligns labels through word_ids().
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large')

data_path = '/content/drive/MyDrive/NER_Dataset/b-ner-train.csv'
# Calculate max_length from dataset
max_len = calculate_max_length(data_path, tokenizer)
print(f"Using max_length = {max_len}")

# Load data: one list of words and one parallel list of tags per sentence
print("Loading data...")
df = pd.read_csv(data_path)
sentences = df.groupby('Sentence #')['Word'].apply(list).values
tags = df.groupby('Sentence #')['Tag'].apply(list).values

# Split data (80% train / 20% validation, fixed seed for a repeatable split)
train_texts, val_texts, train_tags, val_tags = train_test_split(
    sentences, tags, test_size=0.2, random_state=42
)

# Seed torch as well: the new classification head is randomly initialised and
# the training DataLoader shuffles, so without this two runs are not comparable.
torch.manual_seed(42)

print("Initializing model...")
# id2label / label2id are stored in the model config, so the saved and
# uploaded model reports real tag names instead of generic LABEL_<n> entries.
model = XLMRobertaForTokenClassification.from_pretrained(
    'xlm-roberta-large',
    num_labels=len(tag_to_id),
    id2label=id_to_tag,
    label2id=tag_to_id
)

# Create datasets with calculated max_length
train_dataset = NERDataset(train_texts, train_tags, tokenizer, tag_to_id, max_len)
val_dataset = NERDataset(val_texts, val_tags, tokenizer, tag_to_id, max_len)

# Create dataloaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Training settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
# The scheduler is stepped once per batch, so its horizon is the total number
# of optimizer steps across all epochs.
num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


Initializing tokenizer...
Analyzing sequence lengths...


Calculating sequence lengths:   0%|          | 0/17715 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors


Sequence length statistics:
Mean length: 26.82
Median length: 23.00
99th percentile length: 91
Max length: 599
Using max_length = 91
Loading data...
Initializing model...


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda




In [None]:

# Training loop: for each epoch, one optimisation pass over train_loader
# followed by one evaluation pass over val_loader.
# NOTE(review): the weights left in `model` afterwards are those of the final
# epoch, not of the epoch with the lowest validation loss.
print("Starting training...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    train_pbar = tqdm(train_loader,
                     desc=f'Epoch {epoch + 1}/{num_epochs} [Train]',
                     leave=True)

    for batch in train_pbar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        # Read the scalar once; it is needed for both the running total and
        # the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()  # per-batch schedule, matching num_training_steps

        train_pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

    # Validation
    model.eval()
    val_loss = 0
    predictions = []   # one 1-D array of predicted class ids per sentence
    true_labels = []   # one 1-D array of gold label ids per sentence

    val_pbar = tqdm(val_loader,
                   desc=f'Epoch {epoch + 1}/{num_epochs} [Validate]',
                   leave=True)

    with torch.no_grad():
        for batch in val_pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            batch_val_loss = outputs.loss.item()
            val_loss += batch_val_loss

            # extend() on a 2-D array appends its rows, i.e. one entry per sentence.
            predictions.extend(outputs.logits.argmax(dim=2).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            val_pbar.set_postfix({'loss': f'{batch_val_loss:.4f}'})

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)

    # Token-level accuracy over the positions that carry a real label
    # (label -100 marks positions without a source word). Every encoded
    # position of a word counts, since the dataset labels each of them.
    val_accuracy = float('nan')
    if true_labels:
        gold = np.stack(true_labels)
        predicted = np.stack(predictions)
        scored = gold != -100
        if scored.any():
            val_accuracy = float((predicted[scored] == gold[scored]).mean())

    print(f'\nEpoch {epoch + 1} Summary:')
    print(f'Average training loss: {avg_train_loss:.4f}')
    print(f'Average validation loss: {avg_val_loss:.4f}')
    print(f'Validation token accuracy: {val_accuracy:.4f}\n')


Starting training...


Epoch 1/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 1/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 1 Summary:
Average training loss: 0.2461
Average validation loss: 0.1592



Epoch 2/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 2/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 2 Summary:
Average training loss: 0.1176
Average validation loss: 0.1320



Epoch 3/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 3/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 3 Summary:
Average training loss: 0.0849
Average validation loss: 0.1199



Epoch 4/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 4/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 4 Summary:
Average training loss: 0.0633
Average validation loss: 0.1234



Epoch 5/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 5/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 5 Summary:
Average training loss: 0.0492
Average validation loss: 0.1269



Epoch 6/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 6/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 6 Summary:
Average training loss: 0.0362
Average validation loss: 0.1443



Epoch 7/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 7/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 7 Summary:
Average training loss: 0.0291
Average validation loss: 0.1415



Epoch 8/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 8/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 8 Summary:
Average training loss: 0.0224
Average validation loss: 0.1511



Epoch 9/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 9/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 9 Summary:
Average training loss: 0.0170
Average validation loss: 0.1602



Epoch 10/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Epoch 10/10 [Validate]:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 10 Summary:
Average training loss: 0.0134
Average validation loss: 0.1665



In [None]:
# Write the model and the tokenizer files into the same local directory.
output_dir = "bengali-ner-model"
print("Saving model locally...")
model.save_pretrained(output_dir)
# Kept as the cell's last expression so the notebook still shows its return value.
tokenizer.save_pretrained(output_dir)


Saving model locally...


('bengali-ner-model/tokenizer_config.json',
 'bengali-ner-model/special_tokens_map.json',
 'bengali-ner-model/sentencepiece.bpe.model',
 'bengali-ner-model/added_tokens.json',
 'bengali-ner-model/tokenizer.json')

In [None]:
# Publish to the Hugging Face Hub: the model first, then the tokenizer, both
# into the same repository and authenticated with HF_TOKEN.
print("Uploading model to Hugging Face Hub...")
for _artifact in (model, tokenizer):
    _artifact.push_to_hub("Debk/Ben_NER_xlm-roberta-large", token=HF_TOKEN)
print("Model uploaded successfully!")

Uploading model to Hugging Face Hub...


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Model uploaded successfully!
