In [1]:
# Installations
!pip install transformers datasets accelerate evaluate scikit-learn huggingface_hub
!pip install seqeval



In [2]:
import os
import json
import numpy as np
import torch
from torch import nn
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, 
    TrainingArguments, Trainer, DataCollatorForTokenClassification,
    EarlyStoppingCallback, TrainerCallback
)
from datasets import load_dataset, DatasetDict
import evaluate
from sklearn.metrics import precision_recall_fscore_support, classification_report
from huggingface_hub import login, create_repo, HfApi
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Configuration
# Hugging Face Hub identifiers used throughout the notebook.
MODEL_NAME = "IRIIS-RESEARCH/RoBERTa_Nepali_125M"  # pretrained checkpoint for both tokenizer and model
DATASET_NAME = "DipeshChaudhary/binary-nepali-ged-dataset"  # dataset passed to load_dataset
BINARY_MODEL_HUB_ID = "DipeshChaudhary/nepali-gec-binary-detector"  # repo the trained detector is pushed to


In [5]:
# Read the Hugging Face access token from the environment rather than
# hard-coding it in the notebook. HF_TOKEN must be defined because later cells
# pass it to `login` and to `TrainingArguments(hub_token=...)`.
# When the variable is unset the value is None; per the huggingface_hub docs
# `login(token=None)` then prompts for a token interactively.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [None]:
# Login to Hugging Face
# Authenticates this session with HF_TOKEN and additionally asks for the token
# to be stored in the git credential helper (add_to_git_credential=True).
login(token=HF_TOKEN, add_to_git_credential=True)

Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [7]:
# Hardware setup
NUM_CPUS = 30      # reported below; informational in this notebook
BATCH_SIZE = 1024  # per-device batch size reported below
# Switch off the tokenizers library's internal parallelism.
# NOTE(review): presumably set to avoid fork-related warnings once DataLoader
# worker processes are started — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The emoji below were previously stored as mis-decoded UTF-8 bytes (mojibake);
# they are written as the intended characters here.
print(f"🚀 Using {NUM_CPUS} CPUs")
print(f"📦 Batch size: {BATCH_SIZE}")
print(f"🤗 Binary Model Hub: {BINARY_MODEL_HUB_ID}")

# Load dataset (no separate vocabulary is loaded in this cell)
print("📥 Loading dataset and vocabulary...")
dataset = load_dataset(DATASET_NAME)


🚀 Using 30 CPUs
📦 Batch size: 1024
🤗 Binary Model Hub: DipeshChaudhary/nepali-gec-binary-detector
📥 Loading dataset and vocabulary...


Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/72 [00:00<?, ?it/s]

In [8]:
# Disable the Weights & Biases integration through its environment switch.
# NOTE(review): the captured TrainingArguments output in this notebook reports
# this variable as deprecated and recommends `report_to none` instead.
os.environ["WANDB_DISABLED"] = "true"

## compute metrics for binary classification


In [9]:
def compute_binary_metrics(p):
    """Compute token- and sentence-level metrics for binary error detection.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores with the class dimension on axis 2; ``labels`` holds
            the gold class ids, one row per sentence, with ``-100`` marking
            positions that must be ignored.

    Returns:
        dict of floats with keys ``accuracy``, ``precision``, ``recall`` and
        ``f1`` (token level, computed over non-ignored positions with
        ``average='binary'``) and ``sentence_accuracy`` (fraction of sentences
        whose non-ignored tokens are all predicted correctly). Token-level
        metrics are 0.0 when there is no token to score, and
        ``sentence_accuracy`` is 0.0 when there are no sentences, instead of
        producing NaN or raising ZeroDivisionError.
    """
    scores, gold = p
    pred_ids = np.argmax(scores, axis=2)
    gold = np.asarray(gold)

    # Boolean mask of positions that carry a real label (not padding/special).
    keep = gold != -100

    # Boolean indexing flattens in row-major order, i.e. sentence by sentence,
    # which matches a nested "for sentence / for token" traversal.
    flat_pred = pred_ids[keep]
    flat_gold = gold[keep]

    if flat_gold.size:
        accuracy = float((flat_pred == flat_gold).mean())
        precision, recall, f1, _ = precision_recall_fscore_support(
            flat_gold, flat_pred, average='binary', zero_division=0
        )
    else:
        # Nothing to score: avoid the NaN mean and the metric call on empty input.
        accuracy = precision = recall = f1 = 0.0

    # A sentence is correct when every kept position matches; ignored positions
    # are treated as matching, so a sentence with no kept position counts as
    # correct (same as all() over an empty sequence).
    n_sentences = gold.shape[0]
    if n_sentences:
        sentence_ok = ((pred_ids == gold) | ~keep).all(axis=1)
        sentence_accuracy = int(sentence_ok.sum()) / n_sentences
    else:
        sentence_accuracy = 0.0

    return {
        'accuracy': accuracy,
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
        'sentence_accuracy': sentence_accuracy,
    }


### ===========================================
### STAGE 1: BINARY ERROR DETECTOR
### ============================================


In [11]:
# Display the loaded DatasetDict (splits, column names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats', 'binary_labels'],
        num_rows: 13008711
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats', 'binary_labels'],
        num_rows: 2439231
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats', 'binary_labels'],
        num_rows: 813050
    })
})

In [12]:
# Inspect the tag sequence of a single training example.
# Index the row first and the column second: `dataset['train']['labels']`
# can materialise the whole column (13M rows here) before `[2]` is applied,
# whereas row-first indexing only reads the one example.
dataset['train'][2]['labels']

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 4,
 0,
 0,
 5,
 0,
 5,
 0,
 0,
 2,
 0,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [13]:
# Load model for binary classification
# (emoji in the messages were previously stored as mis-decoded UTF-8 bytes)
print("📥 Loading model for binary classification...")
binary_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Two-class token classifier on top of the pretrained checkpoint:
# id 0 = KEEP (token is fine), id 1 = ERROR (token is flagged).
binary_model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label={0: "KEEP", 1: "ERROR"},
    label2id={"KEEP": 0, "ERROR": 1}
)
print(f"✅ Binary model loaded with {binary_model.num_parameters():,} parameters")

📥 Loading model for binary classification...


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at IRIIS-RESEARCH/RoBERTa_Nepali_125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Binary model loaded with 124,049,666 parameters


In [14]:
# Prepare dataset for binary training
# (emoji in the messages were previously stored as mis-decoded UTF-8 bytes)
print("🔄 Preparing binary training dataset...")
# Drop the multi-class tags and auxiliary columns, then expose the binary
# KEEP/ERROR targets under the `labels` name.
binary_training_dataset = dataset.remove_columns(['labels', 'tag_stats', 'is_correct']).rename_column('binary_labels', 'labels')

print("✅ Dataset prepared for binary training:")
print(f"   Features: {binary_training_dataset['train'].features}")


🔄 Preparing binary training dataset...
✅ Dataset prepared for binary training:
   Features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64'))}


In [15]:
# Data collator
# Builds padded batches for token classification using the model's tokenizer.
# NOTE(review): with padding=True batches are presumably padded to their
# longest sequence, in which case max_length=128 may have no effect — confirm
# against the DataCollatorForTokenClassification documentation.
binary_data_collator = DataCollatorForTokenClassification(
    tokenizer=binary_tokenizer,
    padding=True,
    max_length=128,
    pad_to_multiple_of=8
)


In [16]:

# Training arguments for binary model
binary_training_args = TrainingArguments(
    output_dir="./binary-model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    # Reuse the notebook-level BATCH_SIZE so the value printed during hardware
    # setup is the one actually used for training and evaluation.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=2e-6,
    warmup_steps=500,
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Evaluation & Saving
    # Evaluation and checkpointing run on the same 1000-step cadence, so every
    # checkpoint has metrics for load_best_model_at_end to compare.
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=1000,
    save_total_limit=3,

    # Hub Uploading
    push_to_hub=True,
    hub_model_id=BINARY_MODEL_HUB_ID,
    hub_strategy="every_save",
    hub_token=HF_TOKEN,

    # Optimization
    dataloader_num_workers=28,
    dataloader_pin_memory=True,
    fp16=True,
    tf32=True,

    # Metrics
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    # The string "none" is what switches off all logging integrations; passing
    # Python None does not (it falls back to the library default).
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
# Wire model, arguments, data splits, collator and metric function together.
# Training uses the "train" split and periodic evaluation the "validation" split.
# NOTE(review): newer transformers releases appear to rename `tokenizer=` to
# `processing_class=` — confirm against the installed version.
binary_trainer = Trainer(
    model=binary_model,
    args=binary_training_args,
    train_dataset=binary_training_dataset["train"],
    eval_dataset=binary_training_dataset["validation"],
    data_collator=binary_data_collator,
    tokenizer=binary_tokenizer,
    compute_metrics=compute_binary_metrics,
    # Stop early when the tracked metric has not improved for 5 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

In [18]:
# Start fine-tuning. The result object is kept so its metrics can be printed
# once the model has been saved.
# (emoji in the message was previously stored as mis-decoded UTF-8 bytes)
print("🚀 Training binary error detector...")
print(f"Training on {len(binary_training_dataset['train']):,} examples")
print(f"Validating on {len(binary_training_dataset['validation']):,} examples")

binary_train_result = binary_trainer.train()

🚀 Training binary error detector...
Training on 13,008,711 examples
Validating on 2,439,231 examples


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Sentence Accuracy
1000,0.1917,0.114422,0.966759,0.899116,0.434436,0.585816,0.734694
2000,0.1083,0.095801,0.970838,0.898578,0.519733,0.658559,0.762544
3000,0.0927,0.079063,0.974486,0.883899,0.608409,0.720725,0.777227
4000,0.0807,0.068623,0.977963,0.890352,0.676004,0.768512,0.801488
5000,0.0726,0.062333,0.980018,0.890991,0.718658,0.7956,0.81787
6000,0.0671,0.058848,0.981338,0.900091,0.73692,0.810373,0.828599
7000,0.0637,0.055934,0.98229,0.901672,0.75506,0.821879,0.836636
8000,0.0608,0.054222,0.983023,0.912832,0.758702,0.828661,0.843412
9000,0.0588,0.052013,0.983631,0.913476,0.770478,0.835906,0.848854
10000,0.0569,0.050747,0.984123,0.919077,0.774808,0.840799,0.853199


In [19]:

# Persist the trained model locally, then upload it to the Hub repository
# configured in the training arguments.
binary_trainer.save_model()
binary_trainer.push_to_hub()

# (emoji in the message was previously stored as mis-decoded UTF-8 bytes)
print("✅ Binary model training completed!")
# These are training-run statistics (runtime, throughput, train loss, epoch),
# not evaluation scores.
print(f"Final binary metrics: {binary_train_result.metrics}")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


✅ Binary model training completed!
Final binary metrics: {'train_runtime': 21179.3512, 'train_samples_per_second': 1842.65, 'train_steps_per_second': 1.799, 'total_flos': 2.5493504952119455e+18, 'train_loss': 0.05747698721617435, 'epoch': 3.0}


In [20]:


# Evaluate on test set
# (emoji in the messages were previously stored as mis-decoded UTF-8 bytes)
print("🧪 Evaluating binary model on test set...")
# metric_key_prefix="test" makes every returned key start with "test_".
binary_test_results = binary_trainer.evaluate(binary_training_dataset["test"], metric_key_prefix="test")

print("\n📊 BINARY MODEL TEST RESULTS:")
for key, value in binary_test_results.items():
    # Match the prefix explicitly rather than as a substring, and skip the loss.
    if key.startswith("test_") and "loss" not in key:
        print(f"  {key}: {value:.4f}")

🧪 Evaluating binary model on test set...


early stopping required metric_for_best_model, but did not find eval_f1 so early stopping is disabled



📊 BINARY MODEL TEST RESULTS:
  test_accuracy: 0.9874
  test_precision: 0.9344
  test_recall: 0.8247
  test_f1: 0.8761
  test_sentence_accuracy: 0.8831
  test_runtime: 104.0855
  test_samples_per_second: 7811.3630
  test_steps_per_second: 7.6280
