In [1]:
# Installations
%uv pip install transformers datasets accelerate evaluate scikit-learn wandb huggingface_hub
%uv pip install seqeval

[2mUsing Python 3.12.6 environment at: /usr/local[0m
[2mAudited [1m7 packages[0m [2min 95ms[0m[0m
Note: you may need to restart the kernel to use updated packages.
[2mUsing Python 3.12.6 environment at: /usr/local[0m
[2mAudited [1m1 package[0m [2min 91ms[0m[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import numpy as np
import torch
from torch import nn
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, 
    TrainingArguments, Trainer, DataCollatorForTokenClassification,
    EarlyStoppingCallback, TrainerCallback
)
from datasets import load_dataset, DatasetDict
import evaluate
from sklearn.metrics import precision_recall_fscore_support, classification_report
from huggingface_hub import login, create_repo, HfApi
import wandb
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Configuration
# Hugging Face Hub identifiers: the model checkpoint id and the dataset repository
# that later cells load the data and the tag vocabulary from.
MODEL_NAME = "IRIIS-RESEARCH/RoBERTa_Nepali_125M"
DATASET_NAME = "DipeshChaudhary/nepali-gector-style-token-level-tag-for-ged"

# Hub repositories for the two models. Later cells print these ids and pass them to
# create_repo(); they must be defined here or a fresh kernel fails with NameError.
BINARY_MODEL_HUB_ID = "DipeshChaudhary/nepali-gec-binary-detector"
ERROR_TYPE_MODEL_HUB_ID = "DipeshChaudhary/nepali-gec-error-type-classifier"


In [None]:
# Take the Hugging Face access token from the HF_TOKEN environment variable so the
# secret is never written into the notebook. The login cell reads HF_TOKEN, so it
# must be defined here or a fresh kernel fails with NameError.
# When the variable is unset this is None; per the huggingface_hub documentation,
# login(token=None) then asks for the token interactively.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [5]:
# Login to Hugging Face
# Authenticates with HF_TOKEN and asks huggingface_hub to also hand the token to the
# git credential helper.
login(
    add_to_git_credential=True,
    token=HF_TOKEN,
)

Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [6]:
# Hardware setup
# CPU count and batch size used by this run, plus the TOKENIZERS_PARALLELISM
# environment variable set to "false" for this process.
NUM_CPUS, BATCH_SIZE = 26, 256
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Echo the run configuration, one setting per line.
print(
    f"üöÄ Using {NUM_CPUS} CPUs",
    f"üì¶ Batch size: {BATCH_SIZE}",
    f"ü§ó Binary Model Hub: {BINARY_MODEL_HUB_ID}",
    f"ü§ó Error Type Model Hub: {ERROR_TYPE_MODEL_HUB_ID}",
    sep="\n",
)

# Load dataset and vocabulary
# Only the dataset is loaded here; the vocabulary file is fetched in a later cell.
print("üì• Loading dataset and vocabulary...")
dataset = load_dataset(DATASET_NAME)


üöÄ Using 26 CPUs
üì¶ Batch size: 256
ü§ó Binary Model Hub: DipeshChaudhary/nepali-gec-binary-detector
ü§ó Error Type Model Hub: DipeshChaudhary/nepali-gec-error-type-classifier
üì• Loading dataset and vocabulary...


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/45 [00:00<?, ?files/s]

train/data-00000-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00001-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00002-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00003-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00004-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00005-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00006-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00007-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00008-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00009-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00010-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00011-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00012-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00013-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00014-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00015-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00016-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00017-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00018-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00019-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00020-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00021-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00022-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00023-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00024-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00025-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00026-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00027-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00028-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00029-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00030-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00031-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00032-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00033-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00034-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00035-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00036-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00037-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00038-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00039-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00040-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00041-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00042-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00043-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00044-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

validation/data-00000-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00001-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00002-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00003-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00004-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00005-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00006-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00007-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00008-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

test/data-00000-of-00003.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

test/data-00001-of-00003.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

test/data-00002-of-00003.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

In [7]:
# Download vocabulary
from huggingface_hub import hf_hub_download

# The tag vocabulary is stored as gec_vocabulary.json inside the dataset repository;
# the value returned here is later passed to open() as a file path.
vocab_path = hf_hub_download(
    DATASET_NAME,
    "gec_vocabulary.json",
    repo_type="dataset",
)


gec_vocabulary.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

In [8]:
class GECVocabulary:
    """Two-way lookup between tag names and their integer ids.

    Both mappings come from a JSON file that holds a ``tag_to_id`` object and an
    ``id_to_tag`` object; the string keys of ``id_to_tag`` are converted to ``int``.
    """

    def __init__(self, vocab_path):
        """Read ``tag_to_id`` and ``id_to_tag`` from the JSON file at ``vocab_path``."""
        with open(vocab_path, mode='r', encoding='utf-8') as handle:
            payload = json.load(handle)
        self.tag_to_id = payload["tag_to_id"]
        # JSON object keys are always strings, so rebuild the mapping with int keys.
        self.id_to_tag = {}
        for raw_id, tag in payload["id_to_tag"].items():
            self.id_to_tag[int(raw_id)] = tag

    def get_tag_name(self, tag_id):
        """Return the tag stored for ``tag_id``, or ``"$UNKNOWN"`` if the id is absent."""
        if tag_id in self.id_to_tag:
            return self.id_to_tag[tag_id]
        return "$UNKNOWN"

    def get_id(self, tag_name):
        """Return the id stored for ``tag_name``, or ``9`` if the tag is absent."""
        try:
            return self.tag_to_id[tag_name]
        except KeyError:
            return 9

    def __len__(self):
        """Return the number of entries in ``tag_to_id``."""
        return len(self.tag_to_id)

# Build the tag vocabulary from the downloaded JSON file and report its size.
vocabulary = GECVocabulary(vocab_path)
print(f"‚úÖ Loaded vocabulary: {len(vocabulary)} tags")

# Verification Check 1: Dataset Structure
# Row count of each split and the feature schema of the train split.
print(
    "\nüîç VERIFICATION 1: Dataset Structure",
    f"Train: {len(dataset['train']):,} examples",
    f"Validation: {len(dataset['validation']):,} examples",
    f"Test: {len(dataset['test']):,} examples",
    f"Features: {dataset['train'].features}",
    sep="\n",
)

# Verification Check 2: Sample Data Inspection
# Looks at the first train example only.
print("\nüîç VERIFICATION 2: Sample Data Inspection")
sample = dataset['train'][0]
print(
    f"Input IDs length: {len(sample['input_ids'])}",
    f"Labels length: {len(sample['labels'])}",
    f"Tag Stats: {sample['tag_stats']}",
    f"Is Correct: {sample['is_correct']}",
    sep="\n",
)

# Create Hub repositories
# Best effort: both model repositories are requested as private, an already existing
# repository is accepted, and any failure is only reported so the notebook carries on.
try:
    for hub_repo_id in (BINARY_MODEL_HUB_ID, ERROR_TYPE_MODEL_HUB_ID):
        create_repo(repo_id=hub_repo_id, repo_type="model", private=True, exist_ok=True)
except Exception as e:
    print(f"‚ö†Ô∏è Could not create repos: {e}")
else:
    print("‚úÖ Hub repositories created/verified")

# Custom callback for Hub pushing with verification
class HubPushCallback(TrainerCallback):
    """Callback that prints a one-line notice, with the global step, per handled event.

    NOTE(review): ``on_push_to_hub`` does not appear among the event hooks listed in
    the ``TrainerCallback`` documentation — confirm that something actually calls it.
    """

    @staticmethod
    def _announce(message):
        """Write ``message`` to stdout."""
        print(message)

    def on_save(self, args, state, control, **kwargs):
        """Print the global step at which the save event fired."""
        self._announce(f"üíæ Model saved at step {state.global_step}")

    def on_evaluate(self, args, state, control, **kwargs):
        """Print the global step at which the evaluate event fired."""
        self._announce(f"üìä Evaluation completed at step {state.global_step}")

    def on_push_to_hub(self, args, state, control, **kwargs):
        """Print the global step for a push-to-Hub notification."""
        self._announce(f"üöÄ Model pushed to Hub at step {state.global_step}")


‚úÖ Loaded vocabulary: 10 tags

üîç VERIFICATION 1: Dataset Structure
Train: 13,008,711 examples
Validation: 2,439,231 examples
Test: 813,050 examples
Features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64')), 'is_correct': Value('bool'), 'tag_stats': Value('string')}

üîç VERIFICATION 2: Sample Data Inspection
Input IDs length: 128
Labels length: 128
Tag Stats: {"replace": 1, "keep": 9}
Is Correct: False
‚úÖ Hub repositories created/verified


In [12]:
print("üîÑ Converting to binary labels with correct batched processing...")

def convert_to_binary_labels_batched(batch):
    """Collapse per-token tag ids to binary labels for a whole batch.

    ``batch['labels']`` is iterated as a sequence of per-example label sequences.
    Each label is mapped as follows: ``-100`` stays ``-100``, ``0`` stays ``0`` and
    any other value becomes ``1``.

    Returns:
        ``{'binary_labels': [...]}`` holding one list of mapped labels per example.
    """
    def to_binary(label):
        # -100 is passed through unchanged; otherwise only label 0 maps to 0.
        if label == -100:
            return -100
        return 0 if label == 0 else 1

    return {
        'binary_labels': [
            [to_binary(label) for label in example_labels]
            for example_labels in batch['labels']
        ]
    }

# Convert with correct batched processing
# The mapped function receives batches and contributes a `binary_labels` column.
binary_map_options = {
    "batched": True,
    "batch_size": 10000,
    "num_proc": 25,
    "load_from_cache_file": False,  # do not reuse a previously cached result
    "desc": "Converting to binary labels",
}
binary_dataset = dataset.map(convert_to_binary_labels_batched, **binary_map_options)


üîÑ Converting to binary labels with correct batched processing...


Converting to binary labels (num_proc=25):   0%|          | 0/13008711 [00:00<?, ? examples/s]

Converting to binary labels (num_proc=25):   0%|          | 0/2439231 [00:00<?, ? examples/s]

Converting to binary labels (num_proc=25):   0%|          | 0/813050 [00:00<?, ? examples/s]

In [13]:
# Display the mapped dataset: its splits, column names and row counts.
binary_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats', 'binary_labels'],
        num_rows: 13008711
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats', 'binary_labels'],
        num_rows: 2439231
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats', 'binary_labels'],
        num_rows: 813050
    })
})

In [15]:
# Original tag ids of the third train example. Index the row first and the column
# second: column-first access goes through the whole `labels` column of the split
# before picking one row, whereas row-first access reads a single example.
binary_dataset['train'][2]['labels']

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 4,
 0,
 0,
 5,
 0,
 5,
 0,
 0,
 2,
 0,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [16]:
# Binary labels of the third train example. Index the row first and the column
# second: column-first access goes through the whole `binary_labels` column of the
# split before picking one row, whereas row-first access reads a single example.
binary_dataset['train'][2]['binary_labels']

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [17]:
# VERIFICATION: Check if it worked correctly
# Compare the binary labels of the first train example with its original labels.
print("\nüîç VERIFYING BINARY CONVERSION...")
first_example = binary_dataset['train'][0]
print(f"Type of binary_labels: {type(first_example['binary_labels'])}")
print(f"Length of binary_labels: {len(first_example['binary_labels'])}")
print(f"First 10 binary labels: {first_example['binary_labels'][:10]}")
print(f"First 10 original labels: {first_example['labels'][:10]}")

# Quick check of distribution
print("\nüîç BINARY LABEL DISTRIBUTION (First 1000 examples)")
# One slice read replaces 1000 single-row lookups. A slice is clamped to the split
# length, so a split with fewer than 1000 rows no longer raises IndexError.
sampled_label_rows = binary_dataset['train'][:1000]['binary_labels']
# Count every label value once; -100 entries are counted but never reported.
label_counts = Counter(label for row in sampled_label_rows for label in row)
keep_count = label_counts[0]
error_count = label_counts[1]

total = keep_count + error_count
# Avoid ZeroDivisionError when the sample contains no 0/1 labels at all.
denominator = max(total, 1)
print(f"KEEP (0): {keep_count:,} ({keep_count/denominator*100:.2f}%)")
print(f"ERROR (1): {error_count:,} ({error_count/denominator*100:.2f}%)")


üîç VERIFYING BINARY CONVERSION...
Type of binary_labels: <class 'list'>
Length of binary_labels: 128
First 10 binary labels: [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
First 10 original labels: [0, 0, 0, 0, 0, 0, 0, 0, 2, 0]

üîç BINARY LABEL DISTRIBUTION (First 1000 examples)
KEEP (0): 14,159 (94.63%)
ERROR (1): 803 (5.37%)


In [18]:
# Upload the dataset with the added `binary_labels` column to its own Hub repository.
binary_dataset_repo_id = "DipeshChaudhary/binary-nepali-ged-dataset"
binary_dataset.push_to_hub(binary_dataset_repo_id)

Uploading the dataset shards:   0%|          | 0/72 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   6%|6         |  543kB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.93MB / 8.75MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.82MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.85MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   6%|6         |  548kB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.91MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.91MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.71MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.91MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   6%|6         |  548kB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.91MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.83MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.76MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.75MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.75MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.91MB / 8.75MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.84MB / 8.72MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.80MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.80MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.72MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.82MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.88MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.87MB / 8.72MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.91MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.72MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####4     | 3.91MB / 8.74MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.95MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.80MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####4     | 3.85MB / 8.73MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  44%|####3     | 3.84MB / 8.73MB            

Uploading the dataset shards:   0%|          | 0/14 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.84MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.84MB / 8.45MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  46%|####6     | 3.88MB / 8.43MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  46%|####5     | 3.84MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.84MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.84MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  46%|####5     | 3.84MB / 8.43MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.80MB / 8.43MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.84MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.81MB / 8.43MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  45%|####5     | 3.84MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  46%|####6     | 3.92MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  46%|####6     | 3.90MB / 8.44MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   6%|6         |  548kB / 8.44MB            

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  49%|####8     | 3.84MB / 7.89MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  49%|####8     | 3.85MB / 7.89MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  49%|####8     | 3.84MB / 7.88MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  49%|####9     | 3.89MB / 7.89MB            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  49%|####8     | 3.84MB / 7.89MB            

CommitInfo(commit_url='https://huggingface.co/datasets/DipeshChaudhary/binary-nepali-ged-dataset/commit/0477cd3dea6c4e92fe96834b36563e156d86b75d', commit_message='Upload dataset', commit_description='', oid='0477cd3dea6c4e92fe96834b36563e156d86b75d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DipeshChaudhary/binary-nepali-ged-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DipeshChaudhary/binary-nepali-ged-dataset'), pr_revision=None, pr_num=None)

In [46]:
# Binary GED setup: load the tokenizer and a 2-label token-classification
# model, both from the MODEL_NAME checkpoint.
print("üì• Loading model for binary classification...")
binary_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Label ids: 0 -> "KEEP", 1 -> "ERROR"; both directions of the mapping are
# handed to from_pretrained. The recorded output reports that the classifier
# weights are newly initialized, so the model must be fine-tuned before use.
binary_model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=dict(enumerate(("KEEP", "ERROR"))),
    label2id={"KEEP": 0, "ERROR": 1},
)
print(f"‚úÖ Binary model loaded with {binary_model.num_parameters():,} parameters")

üì• Loading model for binary classification...


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at IRIIS-RESEARCH/RoBERTa_Nepali_125M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Binary model loaded with 124,049,666 parameters


In [None]:
# Upload binary_dataset to the Hugging Face Hub. The recorded output earlier in
# this notebook already shows an "Upload dataset" commit to this same repo, so
# running this cell presumably pushes the data again — TODO confirm it is needed.
binary_dataset.push_to_hub(repo_id="DipeshChaudhary/binary-nepali-ged-dataset")

### ===========================================
### ERROR TYPE CLASSIFIER PREPARATION
### ============================================


## Trying other things (start)