# NEPALI GEC DATASET PROCESSOR
#
# This script converts a raw Nepali GEC dataset into a token-level
# (subword) dataset with 10 robust GEC tags.
#
# This uses a strict multi-pass priority system to
# correctly tag all operations, including complex non-difflib-aligned
# edits like SWAP, MERGE, and SPLIT.

In [1]:
%uv pip install transformers datasets accelerate evaluate scikit-learn wandb huggingface_hub

[2mUsing Python 3.12.6 environment at: /usr/local[0m
[2mAudited [1m7 packages[0m [2min 37ms[0m[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import gc
import json
import os
import shutil
import time
import difflib
from collections import defaultdict, Counter
from typing import List, Dict, Tuple
from multiprocessing import cpu_count

import datasets
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoTokenizer
from huggingface_hub import HfApi, create_repo, login

In [4]:
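# HF_TOKEN = ""  # Uncomment and set to a Hugging Face access token with write access; run_pipeline() needs it for the upload step. Never commit a real token.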

In [5]:
# --- Pipeline configuration ---

# Tokenizer checkpoint and the fixed token length passed to it as max_length.
MODEL_NAME = "IRIIS-RESEARCH/RoBERTa_Nepali_125M"
MAX_SEQUENCE_LENGTH = 128

# Hugging Face Hub identifiers: source corpus and destination dataset repo.
HF_USERNAME = "DipeshChaudhary"
RAW_DATASET_NAME = "sumitaryal/nepali_grammatical_error_correction"
FINAL_DATASET_NAME = "nepali-gector-style-token-level-tag-for-ged"
REPO_ID = "/".join((HF_USERNAME, FINAL_DATASET_NAME))

# Local staging directory (named after the final dataset) and the file name
# used for the saved tag vocabulary.
LOCAL_DATA_PATH = f"./{FINAL_DATASET_NAME}"
VOCAB_FILENAME = "gec_vocabulary.json"

# Leave two cores free, but never drop below a single worker.
NUM_WORKERS = max(cpu_count() - 2, 1)

In [6]:
# Print a one-shot summary of the active configuration.
banner_lines = [
    f"--- NEPALI GEC DATASET PROCESSOR ---",
    f"Model: {MODEL_NAME}",
    f"Max Seq Length: {MAX_SEQUENCE_LENGTH}",
    f"Workers: {NUM_WORKERS}",
    f"Output Repo: {REPO_ID}",
    f"Local Path: {LOCAL_DATA_PATH}",
    "-" * 70 + "\n",
]
print("\n".join(banner_lines))

--- NEPALI GEC DATASET PROCESSOR ---
Model: IRIIS-RESEARCH/RoBERTa_Nepali_125M
Max Seq Length: 128
Workers: 106
Output Repo: DipeshChaudhary/nepali-gector-style-token-level-tag-for-ged
Local Path: ./nepali-gector-style-token-level-tag-for-ged
----------------------------------------------------------------------



In [7]:
class EnhancedNepaliGECVocabulary:
    """Two-way lookup between GEC tag names (e.g. "$KEEP") and integer IDs.

    Ten tags are defined with IDs 0 through 9. Each ID is also exposed as an
    instance attribute (``KEEP_ID``, ``DELETE_ID``, ...) so callers can refer
    to tags symbolically.
    """

    def __init__(self):
        # IDs are consecutive integers starting at 0, in exactly this order.
        (
            self.KEEP_ID,
            self.DELETE_ID,
            self.REPLACE_ID,
            self.APPEND_ID,
            self.SWAP_NEXT_ID,
            self.SWAP_PREV_ID,
            self.MERGE_NEXT_ID,
            self.MERGE_PREV_ID,
            self.SPLIT_ID,
            self.UNKNOWN_ID,
        ) = range(10)

        # Name -> ID; insertion order follows the ID order.
        self.tag_to_id = {
            "$KEEP": self.KEEP_ID,
            "$DELETE": self.DELETE_ID,
            "$REPLACE": self.REPLACE_ID,
            "$APPEND": self.APPEND_ID,
            "$SWAP_NEXT": self.SWAP_NEXT_ID,
            "$SWAP_PREV": self.SWAP_PREV_ID,
            "$MERGE_NEXT": self.MERGE_NEXT_ID,
            "$MERGE_PREV": self.MERGE_PREV_ID,
            "$SPLIT": self.SPLIT_ID,
            "$UNKNOWN": self.UNKNOWN_ID,
        }
        # ID -> name, derived by inverting the mapping above.
        self.id_to_tag = {tag_id: tag for tag, tag_id in self.tag_to_id.items()}

    def get_tag_name(self, tag_id: int) -> str:
        """Return the tag name for ``tag_id``, or "$UNKNOWN" if it is unmapped."""
        try:
            return self.id_to_tag[tag_id]
        except KeyError:
            return "$UNKNOWN"

    def get_id(self, tag_name: str) -> int:
        """Return the ID for ``tag_name``, or ``UNKNOWN_ID`` if it is unmapped."""
        try:
            return self.tag_to_id[tag_name]
        except KeyError:
            return self.UNKNOWN_ID

    def vocab_size(self) -> int:
        """Return the number of defined tags."""
        return len(self.tag_to_id)

    def save(self, filepath: str):
        """Write both mappings to ``filepath`` as UTF-8 JSON and print a confirmation."""
        payload = {"tag_to_id": self.tag_to_id}
        # JSON object keys are always strings, so the IDs are stored as text.
        payload["id_to_tag"] = {int(tag_id): tag for tag_id, tag in self.id_to_tag.items()}
        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
        with open(filepath, 'w', encoding='utf-8') as handle:
            handle.write(serialized)
        print(f"✓ Vocabulary saved: {filepath}")

# --- ALIGNMENT & TAGGING FUNCTIONS ---


In [8]:
def calculate_levenshtein_opcodes(incorrect_words: List[str], correct_words: List[str]) -> List[Tuple[str, int, int, int, int]]:
    """Return difflib edit opcodes turning ``incorrect_words`` into ``correct_words``.

    Each opcode is ``(tag, i1, i2, j1, j2)`` where ``tag`` is one of
    'equal', 'replace', 'delete' or 'insert', ``i1:i2`` is a slice of
    ``incorrect_words`` and ``j1:j2`` is a slice of ``correct_words``.

    Note: despite the function name, the alignment comes from
    ``difflib.SequenceMatcher``, whose opcodes are not guaranteed to be a
    minimal (Levenshtein) edit script.
    """
    matcher = difflib.SequenceMatcher(
        isjunk=None,
        a=incorrect_words,
        b=correct_words,
        autojunk=False,  # never treat frequent words as junk
    )
    opcodes = matcher.get_opcodes()
    return opcodes

def generate_word_level_tags(incorrect_words: List[str], 
                             correct_words: List[str],
                             vocabulary: EnhancedNepaliGECVocabulary) -> Tuple[List[int], Dict]:
    """
    Generate one GEC tag ID per word of ``incorrect_words``.

    Tags are assigned in strict priority order; a later pass never
    overwrites a tag set by an earlier one:

    1. SWAP  (content-based): two words whose relative order is reversed
       in the correct sentence.
    2. MERGE (content-based): two adjacent words whose concatenation
       appears as a word of the correct sentence.
    3. Opcode-based tags from the difflib alignment:
       APPEND first, then SPLIT / DELETE / REPLACE / KEEP.

    Args:
        incorrect_words: Whitespace-split words of the erroneous sentence.
        correct_words: Whitespace-split words of the corrected sentence.
        vocabulary: Provides the ``*_ID`` constants and ``get_tag_name``.

    Returns:
        ``(tags, stats)`` where ``tags`` has ``len(incorrect_words)`` tag IDs
        and ``stats`` maps lower-cased tag names (without the ``$``) to their
        counts. ``stats['keep']`` is always present.

    Known limitation: an insertion before the first word is recorded as
    APPEND on word 0, because there is no preceding word to attach it to.
    """
    opcodes = calculate_levenshtein_opcodes(incorrect_words, correct_words)
    keep_id = vocabulary.KEEP_ID
    word_count = len(incorrect_words)
    tags = [keep_id] * word_count

    # --- PASS 1: GLOBAL SWAP Detection (Highest Priority) ---
    # Only words that occur exactly once in BOTH sentences take part.
    # Looking a repeated word up by content is ambiguous: the previous
    # `list.index` lookup always returned the first occurrence, so a
    # sentence with a repeated word (e.g. "a b a c" -> "a b a d") produced
    # SWAP tags even though no word had moved.
    inc_counts = Counter(incorrect_words)
    cor_counts = Counter(correct_words)
    target_position = {
        word: position
        for position, word in enumerate(correct_words)
        if cor_counts[word] == 1 and inc_counts[word] == 1
    }

    for first in range(word_count):
        if tags[first] != keep_id:
            continue
        first_target = target_position.get(incorrect_words[first])
        if first_target is None:
            continue  # word is missing from, or ambiguous in, the correct sentence
        for second in range(first + 1, word_count):
            if tags[second] != keep_id:
                continue
            second_target = target_position.get(incorrect_words[second])
            # Swap condition: `second` follows `first` in the incorrect
            # sentence but precedes it in the correct sentence.
            if second_target is not None and second_target < first_target:
                tags[first] = vocabulary.SWAP_NEXT_ID
                tags[second] = vocabulary.SWAP_PREV_ID
                break  # each word takes part in at most one swap

    # --- PASS 2: MERGE Detection (Second Highest Priority) ---
    # Content-based as well: opcodes are ignored here.
    correct_word_set = set(correct_words)
    i = 0
    while i < word_count - 1:
        # Both the current and the next word must still be untagged.
        if tags[i] == keep_id and tags[i + 1] == keep_id:
            merged_inc_word = incorrect_words[i] + incorrect_words[i + 1]
            if merged_inc_word in correct_word_set:
                tags[i] = vocabulary.MERGE_NEXT_ID
                tags[i + 1] = vocabulary.MERGE_PREV_ID
                i += 2  # skip both merged words
                continue
        i += 1

    # --- PASS 3: Opcode Loop (APPEND, SPLIT, DELETE, REPLACE, KEEP) ---
    # Insertions are handled first so that an APPEND placed on the word
    # before the insertion point is not overwritten by a later opcode.
    for op, i_start, i_end, c_start, c_end in opcodes:
        if op != 'insert':
            continue
        # Attach the insertion to the preceding word (word 0 for an
        # insertion at the very start of the sentence).
        target_idx = max(0, i_start - 1)
        if 0 <= target_idx < len(tags) and tags[target_idx] == keep_id:
            tags[target_idx] = vocabulary.APPEND_ID

    for op, i_start, i_end, c_start, c_end in opcodes:
        if op == 'insert' or op == 'equal':
            continue  # inserts are done; 'equal' words simply stay KEEP

        inc_len, cor_len = i_end - i_start, c_end - c_start
        for idx in range(i_start, i_end):
            # Never overwrite a tag from a higher-priority pass.
            if tags[idx] != keep_id:
                continue

            if op == 'delete':
                tags[idx] = vocabulary.DELETE_ID
            elif op == 'replace':
                # SPLIT: a single incorrect word equals the concatenation
                # of the correct words that replace it.
                is_split = (
                    inc_len == 1
                    and cor_len > 1
                    and incorrect_words[i_start] == "".join(correct_words[c_start:c_end])
                )
                tags[idx] = vocabulary.SPLIT_ID if is_split else vocabulary.REPLACE_ID

    # --- PASS 4: Final Stat Recalculation ---
    final_stats = Counter(
        vocabulary.get_tag_name(t).replace('$', '').lower()
        for t in tags
        if t != keep_id
    )
    final_stats['keep'] = tags.count(keep_id)

    return tags, dict(final_stats)

In [9]:
# Module-level handles read by preprocess_function(). They start as None;
# run_pipeline() assigns the real tokenizer and tag vocabulary (via `global`)
# before the dataset is mapped.
tokenizer = None
vocabulary = None

def preprocess_function(examples: Dict) -> Dict:
    """
    Turn a batch of (incorrect, correct) sentence pairs into token-level rows.

    For every pair, up to two output rows are produced:

    * the incorrect sentence, labelled with word-level GEC tags projected
      onto its subword tokens (only when it differs from the correct
      sentence and both are longer than one character);
    * the correct sentence, with every non-special token labelled KEEP.

    Label projection: tokens whose word id is None get -100, the first
    subword of a word gets the word's tag, and continuation subwords get
    KEEP. Word ids beyond the tagged word list also get -100.
    NOTE(review): this assumes the tokenizer's word ids line up with
    whitespace-split words; confirm for the tokenizer in use.

    Reads the module-level ``tokenizer`` and ``vocabulary``.

    Returns:
        Dict of parallel lists: input_ids, attention_mask, labels,
        is_correct, tag_stats (JSON string) and stratify_key (tag ID as str).
    """
    incorrect_texts = [str(raw).strip() for raw in examples["incorrect_sentence"]]
    correct_texts = [str(raw).strip() for raw in examples["correct_sentence"]]

    rows = {
        "input_ids": [],
        "attention_mask": [],
        "labels": [],
        "is_correct": [],
        "tag_stats": [],
        "stratify_key": [],
    }

    def _emit(encoded, token_labels, is_correct, stats_dict, stratify_id):
        # Append one finished example to every output column.
        rows["input_ids"].append(encoded['input_ids'])
        rows["attention_mask"].append(encoded['attention_mask'])
        rows["labels"].append(token_labels)
        rows["is_correct"].append(is_correct)
        rows["tag_stats"].append(json.dumps(stats_dict))
        rows["stratify_key"].append(str(stratify_id))

    for inc_text, cor_text in zip(incorrect_texts, correct_texts):

        # --- 1. Incorrect sentence (only when it really differs) ---
        has_error_pair = (
            bool(inc_text)
            and inc_text != cor_text
            and len(inc_text) > 1
            and len(cor_text) > 1
        )
        if has_error_pair:
            inc_words, cor_words = inc_text.split(), cor_text.split()

            # Nothing to align if either side has no words; skip the pair.
            if not (inc_words and cor_words):
                continue

            word_tags, stats = generate_word_level_tags(inc_words, cor_words, vocabulary)

            encoding = tokenizer(
                inc_text,
                padding="max_length",
                truncation=True,
                max_length=MAX_SEQUENCE_LENGTH,
                return_special_tokens_mask=True
            )

            token_labels = []
            last_word_idx = None
            for word_idx in encoding.word_ids():
                if word_idx is None or word_idx >= len(word_tags):
                    token_labels.append(-100)
                elif word_idx == last_word_idx:
                    # Continuation subword of the same word.
                    token_labels.append(vocabulary.KEEP_ID)
                else:
                    token_labels.append(word_tags[word_idx])
                last_word_idx = word_idx

            # Stratification key: the most frequent non-KEEP tag, else KEEP.
            dominant_tag = vocabulary.KEEP_ID
            error_counts = {}
            if stats:
                error_counts = {name: count for name, count in stats.items() if name != 'keep'}
            if error_counts:
                top_name = max(error_counts, key=error_counts.get)
                dominant_tag = vocabulary.get_id(f"${top_name.upper()}")

            _emit(encoding, token_labels, False, stats, dominant_tag)

        # --- 2. Correct sentence (every real token is KEEP) ---
        if cor_text and len(cor_text) > 1:
            encoding = tokenizer(
                cor_text,
                padding="max_length",
                truncation=True,
                max_length=MAX_SEQUENCE_LENGTH
            )
            word_ids = encoding.word_ids()
            token_labels = [
                -100 if word_id is None else vocabulary.KEEP_ID
                for word_id in word_ids
            ]
            keep_count = sum(1 for word_id in word_ids if word_id is not None)

            _emit(encoding, token_labels, True, {"keep": keep_count}, vocabulary.KEEP_ID)

    return rows

In [10]:
def _hub_size_category(num_examples: int) -> str:
    """Return the Hugging Face dataset-card `size_categories` bucket for a row count."""
    buckets = [
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
        (1_000_000_000, "100M<n<1B"),
        (10_000_000_000, "1B<n<10B"),
        (100_000_000_000, "10B<n<100B"),
        (1_000_000_000_000, "100B<n<1T"),
    ]
    for upper_bound, label in buckets:
        if num_examples < upper_bound:
            return label
    return "n>1T"


def run_pipeline():
    """Execute the full dataset processing pipeline.

    Steps: load tokenizer and tag vocabulary, sanity-check the tagging logic
    on hand-written sentences, load and concatenate the raw train/valid
    splits, map them to token-level rows, create stratified
    train/validation/test splits, save everything locally with a README
    dataset card, upload the folder to the Hugging Face Hub and reload it
    as a final check.

    The function prints progress and returns None. On a critical failure it
    prints an error message and returns early instead of raising.

    The Hub access token is taken from a module-level ``HF_TOKEN`` variable
    if one is defined, otherwise from the ``HF_TOKEN`` environment variable.
    """

    print("=" * 70)
    print("STEP 1: INITIALIZING TOKENIZER & VOCABULARY")
    print("=" * 70)
    
    global tokenizer, vocabulary
    try:
        # Set TOKENIZERS_PARALLELISM to false to avoid warnings
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        vocabulary = EnhancedNepaliGECVocabulary()
    except Exception as e:
        print(f"CRITICAL ERROR: Could not load tokenizer '{MODEL_NAME}'. {e}")
        print("Please ensure 'transformers' is installed and you have internet access.")
        return

    print(f"✓ Tokenizer loaded: {MODEL_NAME}")
    print(f"✓ Vocabulary loaded: {vocabulary.vocab_size()} tags")
    print(f"Tags: {list(vocabulary.tag_to_id.keys())}\n")


    print("=" * 70)
    print("STEP 2: VERIFYING TAGGING LOGIC (COMPREHENSIVE TEST)")
    print("=" * 70)
    
    # Comprehensive Test Cases (Nepali)
    test_cases_comprehensive = [
        ("Merge", "म खाना खाँदै छु", "म खाना खाँदैछु"), 
        ("Split", "तपाईंको नामके हो", "तपाईंको नाम के हो"),
        ("Swap_Adj", "खाना म खान्छु", "म खाना खान्छु"), 
        ("Swap_Complex", "सोध्न केही छ मलाई", "मलाई सोध्न केही छ"),
        ("Append (End)", "म कलेज जान्छु", "म कलेज जान्छु आज"),
        ("Append (Start)", "ऊ घर गयो", "आज ऊ घर गयो"),
        ("Delete", "यो किताब धेरै राम्रो छ", "यो किताब राम्रो छ"),
        ("Replace", "मैले काम गरे", "मैले काम गरें"),
        ("Neg_DR (No Merge)", "यो रातो टोपी हो", "यो नीलो टोपी हो"),
    ]

    # Tag that MUST appear in the word-level output of each validated case.
    # Cases not listed here are printed for inspection only.
    required_tag_by_case = {
        "Merge": vocabulary.MERGE_NEXT_ID,
        "Split": vocabulary.SPLIT_ID,
        "Swap_Adj": vocabulary.SWAP_NEXT_ID,
        "Swap_Complex": vocabulary.SWAP_NEXT_ID,
        "Append (End)": vocabulary.APPEND_ID,
        "Append (Start)": vocabulary.APPEND_ID,
        "Delete": vocabulary.DELETE_ID,
        "Replace": vocabulary.REPLACE_ID,
    }

    emojis = {
        vocabulary.KEEP_ID: "✅", vocabulary.DELETE_ID: "🗑️",
        vocabulary.REPLACE_ID: "🔄", vocabulary.APPEND_ID: "➕",
        vocabulary.SWAP_NEXT_ID: "↔️", vocabulary.SWAP_PREV_ID: "↩️",
        vocabulary.MERGE_NEXT_ID: "↘️", vocabulary.MERGE_PREV_ID: "↙️",
        vocabulary.SPLIT_ID: "✄", vocabulary.UNKNOWN_ID: "❓",
        -100: "⬛" # Special token ID
    }
    
    all_tests_passed = True
    for case_type, incorrect, correct in test_cases_comprehensive:
        print(f"\n--- Test Case: {case_type} ---")
        inc_words = incorrect.split()
        cor_words = correct.split()
        word_tags, stats = generate_word_level_tags(inc_words, cor_words, vocabulary)
        
        print(f"  Input:    '{incorrect}'")
        print(f"  Correct:  '{correct}'")
        print(f"  Tags (Word-Level): ", end="")
        for word, tag_id in zip(inc_words, word_tags):
            tag_name = vocabulary.get_tag_name(tag_id)
            print(f"[{emojis.get(tag_id, '❓')} {tag_name} '{word}'] ", end="")
        print(f"\n  Stats:    {stats}")
        
        # --- Token-Level Verification ---
        print(f"\n  Token-Level Mapping (Verification):")
        
        encoding = tokenizer(
            incorrect,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True
        )
        subword_tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
        word_ids = encoding.word_ids()
        
        # Same projection rule as preprocess_function(): the first subword of
        # a word carries the word's tag, continuation subwords get KEEP, and
        # tokens without a word id get -100.
        subword_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                subword_labels.append(-100)
            elif word_idx < len(word_tags):
                if word_idx != previous_word_idx:
                    subword_labels.append(word_tags[word_idx])
                else:
                    subword_labels.append(vocabulary.KEEP_ID)
            else:
                subword_labels.append(-100)
            previous_word_idx = word_idx
            
        print(f"    {'Token'.ljust(15)} {'Word ID'.ljust(10)} {'Final Label'.ljust(20)}")
        print(f"    {'-'*15} {'-'*10} {'-'*20}")
        
        for token, word_id, label_id in zip(subword_tokens, word_ids, subword_labels):
            tag_name = vocabulary.get_tag_name(label_id) if label_id != -100 else "N/A"
            emoji_tag = f"{emojis.get(label_id, '❓')} {tag_name}"
            print(f"    {token.ljust(15)} {str(word_id).ljust(10)} {emoji_tag.ljust(20)}")
        # --- END: Token-Level Verification ---

        # Validate the case and say which one failed, so a halt is diagnosable.
        required_tag = required_tag_by_case.get(case_type)
        if required_tag is not None and required_tag not in word_tags:
            all_tests_passed = False
            print(f"  ✗ Expected tag {vocabulary.get_tag_name(required_tag)} was not produced for case '{case_type}'.")

    if all_tests_passed:
        print("\n✓ All critical logic tests passed!")
    else:
        print("\n✗ CRITICAL ERROR: Tagging logic verification failed. Halting.")
        return
    
    print("\n" + "=" * 70)
    print("STEP 3: LOADING RAW DATASET")
    print("=" * 70)
    try:
        raw_dataset = load_dataset(RAW_DATASET_NAME)
        print("Raw dataset structure:")
        print(raw_dataset)
        
        # The raw 'train' and 'valid' splits are pooled; new splits are
        # created after processing (STEP 5).
        combined_raw = concatenate_datasets([raw_dataset['train'], raw_dataset['valid']])
        print(f"Total raw examples to process: {len(combined_raw):,}")
    except Exception as e:
        print(f"CRITICAL ERROR: Could not load raw dataset '{RAW_DATASET_NAME}'. {e}")
        return

    print("\n" + "=" * 70)
    print(f"STEP 4: PROCESSING DATASET (Using {NUM_WORKERS} workers)")
    print("This will take several minutes...")
    print("=" * 70)
    
    start_time = time.time()
    
    # preprocess_function reads the module-level 'tokenizer' and 'vocabulary'
    # set in STEP 1.
    # NOTE(review): this relies on worker processes seeing those globals
    # (e.g. fork-style process start) - confirm on the target platform.
    processed_dataset = combined_raw.map(
        preprocess_function,
        batched=True,
        batch_size=2000, 
        num_proc=NUM_WORKERS,
        remove_columns=combined_raw.column_names,
        load_from_cache_file=False
    )
    
    elapsed = time.time() - start_time
    print(f"\n✓ Dataset processing complete.")
    print(f"  Total examples generated: {len(processed_dataset):,}")
    print(f"  Time taken: {elapsed/60:.2f} minutes")
    print(f"  New features: {processed_dataset.features}")
    
    print("\n" + "=" * 70)
    print("STEP 4.5: CASTING STRATIFY KEY TO CLASSLABEL FOR STRATIFICATION")
    print("=" * 70)
    
    # The stratify keys are the string forms of the integer tag IDs (0 to 9);
    # they are cast to ClassLabel before the stratified split below.
    class_names = [str(i) for i in range(vocabulary.vocab_size())]
    processed_dataset = processed_dataset.cast_column(
        "stratify_key", 
        datasets.ClassLabel(names=class_names)
    )
    print("✓ 'stratify_key' successfully cast to ClassLabel type for stratification.")


    print("\n" + "=" * 70)
    print("STEP 5: CREATING STRATIFIED SPLITS")
    print("=" * 70)
    
    try:
        # First carve out 5% as the held-out ("calibration") test split ...
        temp_split = processed_dataset.train_test_split(
            test_size=0.05,
            stratify_by_column="stratify_key",
            seed=42
        )
        train_valid_set = temp_split['train']
        calib_set = temp_split['test']

        # ... then take 15% of the ORIGINAL total as validation.
        final_split = train_valid_set.train_test_split(
            test_size=0.1579, # 0.15 / 0.95
            stratify_by_column="stratify_key",
            seed=42
        )
    except Exception as e:
        print(f"CRITICAL ERROR: Stratified split failed. {e}")
        print("Falling back to standard split...")
        temp_split = processed_dataset.train_test_split(test_size=0.05, seed=42)
        train_valid_set = temp_split['train']
        calib_set = temp_split['test']
        final_split = train_valid_set.train_test_split(test_size=0.1579, seed=42)


    final_dataset = DatasetDict({
        'train': final_split['train'],
        'validation': final_split['test'],
        'test': calib_set
    })

    # Clean up the stratification column
    final_dataset = final_dataset.remove_columns(["stratify_key"])

    print("Final dataset splits:")
    print(final_dataset)
    print(f"  Train: {len(final_dataset['train']):,}")
    print(f"  Valid: {len(final_dataset['validation']):,}")
    print(f"  Calib: {len(final_dataset['test']):,}")


    print("\n" + "=" * 70)
    print("STEP 6: SAVING DATASET LOCALLY")
    print("=" * 70)
    
    if os.path.exists(LOCAL_DATA_PATH):
        print(f"Warning: Deleting old local path: {LOCAL_DATA_PATH}")
        shutil.rmtree(LOCAL_DATA_PATH)
        
    final_dataset.save_to_disk(LOCAL_DATA_PATH)
    vocabulary.save(os.path.join(LOCAL_DATA_PATH, VOCAB_FILENAME))
    print(f"✓ Final dataset and vocabulary saved to {LOCAL_DATA_PATH}")


    print("\n" + "=" * 70)
    print("STEP 7: GENERATING README.MD DATASET CARD")
    print("=" * 70)

    # Derive the card's size bucket from the real row count instead of
    # hard-coding it, so the metadata stays correct if the corpus changes.
    size_category = _hub_size_category(len(processed_dataset))

    # Generate the README.md content
    readme_content = f"""---
language:
- ne
license: mit
task_categories:
- token-classification
tags:
- grammatical-error-correction
- gec
- nepali
- gector
- sequence-tagging
size_categories:
- {size_category}
---

# Nepali GEC (gector style) Token Tagging Dataset

This is a processed version of the [sumitaryal/nepali_grammatical_error_correction](https://huggingface.co/datasets/sumitaryal/nepali_grammatical_error_correction) dataset,
designed for training GEC-ToR-style sequence tagging models.

This dataset has been processed with a robust, multi-pass, content-aware alignment algorithm
to generate high-fidelity correction tags, **including complex and adjacent SWAP operations**.

## Total Examples: {len(processed_dataset):,}

* **Training:** {len(final_dataset['train']):,}
* **Validation:** {len(final_dataset['validation']):,}
* **test:** {len(final_dataset['test']):,}

## Key Features

- **Token-Level Tags:** Word-level corrections are mapped to subword (token) labels.
- **Correct Sentences Included:** The dataset contains both incorrect and correct sentences (tagged with `$KEEP`) for model stability.
- **Stratified Splits:** Splits are stratified by the dominant error tag to ensure balanced evaluation.

## Tag Vocabulary (10 Tags)

This dataset uses an enhanced 10-tag system:

1.  **`$KEEP`**: Token is correct.  -> label -> 0
2.  **`$DELETE`**: Token should be deleted. -> label -> 1
3.  **`$REPLACE`**: Token should be replaced (e.g., by a Transformer's MLM head). -> label -> 2
4.  **`$APPEND`**: A new token should be inserted *after* this token. -> label -> 3
5.  **`$SWAP_NEXT`**: Token is part of a swap (first word). -> label -> 4
6.  **`$SWAP_PREV`**: Token is part of a swap (second word). -> label -> 5
7.  **`$MERGE_NEXT`**: This token should be merged with the next token. -> label -> 6
8.  **`$MERGE_PREV`**: This token should be merged with the previous token. -> label -> 7
9.  **`$SPLIT`**: This token should be split into multiple tokens. -> label -> 8
10. **`$UNKNOWN`**: Fallback tag (should not be present). -> label -> 9

## Usage

```python
from datasets import load_dataset
import json
from huggingface_hub import hf_hub_download

REPO_ID = "{REPO_ID}"

# Load dataset
dataset = load_dataset(REPO_ID)

# Load vocabulary
vocab_file = hf_hub_download(
    repo_id=REPO_ID,
    filename="{VOCAB_FILENAME}",
    repo_type="dataset"
)

with open(vocab_file, 'r', encoding='utf-8') as f:
    vocabulary = json.load(f)

print(f"Splits: {{list(dataset.keys())}}")
print(f"Tags: {{vocabulary['tag_to_id']}}")
```
"""
    
    readme_path = os.path.join(LOCAL_DATA_PATH, "README.md")
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(readme_content)
    print(f"✓ README.md saved to {readme_path}")


    print("\n" + "=" * 70)
    print("STEP 8: UPLOADING TO HUGGING FACE HUB")
    print("=" * 70)
    
    print(f"Authenticating as {HF_USERNAME}...")
    # Resolve the token without assuming a module-level HF_TOKEN exists: a
    # missing variable previously surfaced as a NameError reported as
    # "Authentication failed" only after all processing had finished.
    # If neither source provides a token, None is passed through to login().
    hf_token = globals().get("HF_TOKEN") or os.environ.get("HF_TOKEN")
    try:
        login(token=hf_token, add_to_git_credential=True)
        print("✓ Authentication successful.")
    except Exception as e:
        print(f"CRITICAL ERROR: Authentication failed. {e}")
        print("Please check your HF_TOKEN.")
        return

    print(f"Creating repository: {REPO_ID}")
    try:
        create_repo(repo_id=REPO_ID, repo_type="dataset", private=False, exist_ok=True)
        print(f"✓ Repository created or already exists.")
    except Exception as e:
        print(f"CRITICAL ERROR: Could not create repository. {e}")
        return

    print(f"Uploading all files from {LOCAL_DATA_PATH}...")
    api = HfApi()
    try:
        api.upload_folder(
            folder_path=LOCAL_DATA_PATH,
            repo_id=REPO_ID,
            repo_type="dataset",
            commit_message="Uploading GEC dataset with 10 tags (incl. Swap/Merge/Split)"
        )
        print("\n✅ UPLOAD COMPLETE!")
    except Exception as e:
        print(f"\nCRITICAL ERROR: Upload failed. {e}")
        print("Please check your permissions and internet connection.")
        return

    print("\n" + "=" * 70)
    print("STEP 9: VERIFYING UPLOAD")
    print("=" * 70)
    
    # Verification is best-effort: a failure here is reported as a warning
    # and does not abort, since the upload itself has already succeeded.
    try:
        print(f"Loading dataset from Hub: {REPO_ID}")
        time.sleep(5) # Give the hub a moment to update
        verify_dataset = load_dataset(REPO_ID)
        print("✓ Verification successful! Dataset structure on Hub:")
        print(verify_dataset)
    except Exception as e:
        print(f"⚠ Verification failed. This might be a temporary delay. {e}")
        print(f"Please check your dataset manually at: https://huggingface.co/datasets/{REPO_ID}")

    print("\n" + "=" * 70)
    print("ALL STEPS COMPLETE.")
    print("=" * 70)


if __name__ == "__main__":
    # Reduce `datasets` library logging to errors only before the pipeline
    # starts. (Tokenizer parallelism is configured separately, inside
    # run_pipeline(), through the TOKENIZERS_PARALLELISM environment variable.)
    datasets.utils.logging.set_verbosity_error()
    
    # Run the full pipeline
    run_pipeline()

STEP 1: INITIALIZING TOKENIZER & VOCABULARY


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/968 [00:00<?, ?B/s]

✓ Tokenizer loaded: IRIIS-RESEARCH/RoBERTa_Nepali_125M
✓ Vocabulary loaded: 10 tags
Tags: ['$KEEP', '$DELETE', '$REPLACE', '$APPEND', '$SWAP_NEXT', '$SWAP_PREV', '$MERGE_NEXT', '$MERGE_PREV', '$SPLIT', '$UNKNOWN']

STEP 2: VERIFYING TAGGING LOGIC (COMPREHENSIVE TEST)

--- Test Case: Merge ---
  Input:    'म खाना खाँदै छु'
  Correct:  'म खाना खाँदैछु'
  Tags (Word-Level): [✅ $KEEP 'म'] [✅ $KEEP 'खाना'] [↘️ $MERGE_NEXT 'खाँदै'] [↙️ $MERGE_PREV 'छु'] 
  Stats:    {'merge_next': 1, 'merge_prev': 1, 'keep': 2}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁म              0          ✅ $KEEP             
    ▁खाना           1          ✅ $KEEP             
    ▁खाँदै          2          ↘️ $MERGE_NEXT      
    ▁छु             3          ↙️ $MERGE_PREV      

--- Test Case: Split ---
  Input:    'तपाईंको नामके हो'
  Correct:  'तपाईंको नाम के हो'
  Tags (Word-Level): [✅ $KEEP 'तपाईंको'] [✄ $SPLI

README.md:   0%|          | 0.00/451 [00:00<?, ?B/s]

data/train-00000-of-00007.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

data/train-00001-of-00007.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

data/train-00002-of-00007.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

data/train-00003-of-00007.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

data/train-00004-of-00007.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

data/train-00005-of-00007.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

data/train-00006-of-00007.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

data/valid-00000-of-00001.parquet:   0%|          | 0.00/79.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7723971 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/406525 [00:00<?, ? examples/s]

Raw dataset structure:
DatasetDict({
    train: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 7723971
    })
    valid: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 406525
    })
})
Total raw examples to process: 8,130,496

STEP 4: PROCESSING DATASET (Using 106 workers)
This will take several minutes...


Map (num_proc=106):   0%|          | 0/8130496 [00:00<?, ? examples/s]


✓ Dataset processing complete.
  Total examples generated: 16,260,992
  Time taken: 1.65 minutes
  New features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64')), 'is_correct': Value('bool'), 'tag_stats': Value('string'), 'stratify_key': Value('string')}

STEP 4.5: CASTING STRATIFY KEY TO CLASSLABEL FOR STRATIFICATION


Casting the dataset:   0%|          | 0/16260992 [00:00<?, ? examples/s]

✓ 'stratify_key' successfully cast to ClassLabel type for stratification.

STEP 5: CREATING STRATIFIED SPLITS
Final dataset splits:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 13008711
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 2439231
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 813050
    })
})
  Train: 13,008,711
  Valid: 2,439,231
  Calib: 813,050

STEP 6: SAVING DATASET LOCALLY


Saving the dataset (0/45 shards):   0%|          | 0/13008711 [00:00<?, ? examples/s]

Saving the dataset (0/9 shards):   0%|          | 0/2439231 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/813050 [00:00<?, ? examples/s]

Token has not been saved to git credential helper.


✓ Vocabulary saved: ./nepali-gector-style-token-level-tag-for-ged/gec_vocabulary.json
✓ Final dataset and vocabulary saved to ./nepali-gector-style-token-level-tag-for-ged

STEP 7: GENERATING README.MD DATASET CARD
✓ README.md saved to ./nepali-gector-style-token-level-tag-for-ged/README.md

STEP 8: UPLOADING TO HUGGING FACE HUB
Authenticating as DipeshChaudhary...
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
✓ Authentication successful.
Creating repository: DipeshChaudhary/nepali-gector-style-token-level-tag-for-ged
✓ Repository created or already exists.
Uploading all files from ./nepali-gector-style-token-level-tag-for-ged...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...-ged/test/data-00000-of-00003.arrow:   7%|7         | 33.5MB /  462MB            

  ...ged/train/data-00000-of-00045.arrow:   5%|5         | 25.1MB /  492MB            

  ...-ged/test/data-00001-of-00003.arrow:   5%|5         | 25.0MB /  462MB            

  ...ged/train/data-00001-of-00045.arrow:   5%|5         | 25.0MB /  492MB            

  ...-ged/test/data-00002-of-00003.arrow:   7%|7         | 33.5MB /  462MB            

  ...ged/train/data-00006-of-00045.arrow:   5%|5         | 25.1MB /  492MB            

  ...ged/train/data-00004-of-00045.arrow:   5%|5         | 25.1MB /  492MB            

  ...ged/train/data-00009-of-00045.arrow:   7%|6         | 33.5MB /  492MB            

  ...ged/train/data-00011-of-00045.arrow:   2%|1         | 8.31MB /  492MB            

  ...ged/train/data-00010-of-00045.arrow:   2%|1         | 8.29MB /  492MB            


✅ UPLOAD COMPLETE!

STEP 9: VERIFYING UPLOAD
Loading dataset from Hub: DipeshChaudhary/nepali-gector-style-token-level-tag-for-ged


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/45 [00:00<?, ?files/s]

train/data-00000-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00001-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00002-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00003-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00004-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00005-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00006-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00007-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00008-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00009-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00010-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00011-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00012-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00013-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00014-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00015-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00016-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00017-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00018-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00019-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00020-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00021-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00022-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00023-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00024-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00025-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00026-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00027-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00028-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00029-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00030-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00031-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00032-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00033-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00034-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00035-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00036-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00037-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00038-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00039-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00040-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00041-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00042-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00043-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

train/data-00044-of-00045.arrow:   0%|          | 0.00/492M [00:00<?, ?B/s]

validation/data-00000-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00001-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00002-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00003-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00004-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00005-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00006-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00007-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation/data-00008-of-00009.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

test/data-00000-of-00003.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

test/data-00001-of-00003.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

test/data-00002-of-00003.arrow:   0%|          | 0.00/462M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

✓ Verification successful! Dataset structure on Hub:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 13008711
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 2439231
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 813050
    })
})

ALL STEPS COMPLETE.


# output of the above cell

```
======================================================================
STEP 1: INITIALIZING TOKENIZER & VOCABULARY
======================================================================
tokenizer_config.json: 
 1.62k/? [00:00<00:00, 89.2kB/s]
tokenizer.json: 
 4.89M/? [00:00<00:00, 16.0MB/s]
special_tokens_map.json: 100%
 968/968 [00:00<00:00, 82.8kB/s]
✓ Tokenizer loaded: IRIIS-RESEARCH/RoBERTa_Nepali_125M
✓ Vocabulary loaded: 10 tags
Tags: ['$KEEP', '$DELETE', '$REPLACE', '$APPEND', '$SWAP_NEXT', '$SWAP_PREV', '$MERGE_NEXT', '$MERGE_PREV', '$SPLIT', '$UNKNOWN']

======================================================================
STEP 2: VERIFYING TAGGING LOGIC (COMPREHENSIVE TEST)
======================================================================

--- Test Case: Merge ---
  Input:    'म खाना खाँदै छु'
  Correct:  'म खाना खाँदैछु'
  Tags (Word-Level): [✅ $KEEP 'म'] [✅ $KEEP 'खाना'] [↘️ $MERGE_NEXT 'खाँदै'] [↙️ $MERGE_PREV 'छु'] 
  Stats:    {'merge_next': 1, 'merge_prev': 1, 'keep': 2}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁म              0          ✅ $KEEP             
    ▁खाना           1          ✅ $KEEP             
    ▁खाँदै          2          ↘️ $MERGE_NEXT      
    ▁छु             3          ↙️ $MERGE_PREV      

--- Test Case: Split ---
  Input:    'तपाईंको नामके हो'
  Correct:  'तपाईंको नाम के हो'
  Tags (Word-Level): [✅ $KEEP 'तपाईंको'] [✄ $SPLIT 'नामके'] [✅ $KEEP 'हो'] 
  Stats:    {'split': 1, 'keep': 2}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁तपाईंको        0          ✅ $KEEP             
    ▁नाम            1          ✄ $SPLIT            
    के              1          ✅ $KEEP             
    ▁हो             2          ✅ $KEEP             

--- Test Case: Swap_Adj ---
  Input:    'खाना म खान्छु'
  Correct:  'म खाना खान्छु'
  Tags (Word-Level): [↔️ $SWAP_NEXT 'खाना'] [↩️ $SWAP_PREV 'म'] [✅ $KEEP 'खान्छु'] 
  Stats:    {'swap_next': 1, 'swap_prev': 1, 'keep': 1}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁खाना           0          ↔️ $SWAP_NEXT       
    ▁म              1          ↩️ $SWAP_PREV       
    ▁खा             2          ✅ $KEEP             
    न्छु            2          ✅ $KEEP             

--- Test Case: Swap_Complex ---
  Input:    'सोध्न केही छ मलाई'
  Correct:  'मलाई सोध्न केही छ'
  Tags (Word-Level): [↔️ $SWAP_NEXT 'सोध्न'] [✅ $KEEP 'केही'] [✅ $KEEP 'छ'] [↩️ $SWAP_PREV 'मलाई'] 
  Stats:    {'swap_next': 1, 'swap_prev': 1, 'keep': 2}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁सोध्न          0          ↔️ $SWAP_NEXT       
    ▁केही           1          ✅ $KEEP             
    ▁छ              2          ✅ $KEEP             
    ▁मलाई           3          ↩️ $SWAP_PREV       

--- Test Case: Append (End) ---
  Input:    'म कलेज जान्छु'
  Correct:  'म कलेज जान्छु आज'
  Tags (Word-Level): [✅ $KEEP 'म'] [✅ $KEEP 'कलेज'] [➕ $APPEND 'जान्छु'] 
  Stats:    {'append': 1, 'keep': 2}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁म              0          ✅ $KEEP             
    ▁कलेज           1          ✅ $KEEP             
    ▁जान्छु         2          ➕ $APPEND           

--- Test Case: Append (Start) ---
  Input:    'ऊ घर गयो'
  Correct:  'आज ऊ घर गयो'
  Tags (Word-Level): [➕ $APPEND 'ऊ'] [✅ $KEEP 'घर'] [✅ $KEEP 'गयो'] 
  Stats:    {'append': 1, 'keep': 2}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁ऊ              0          ➕ $APPEND           
    ▁घर             1          ✅ $KEEP             
    ▁गयो            2          ✅ $KEEP             

--- Test Case: Delete ---
  Input:    'यो किताब धेरै राम्रो छ'
  Correct:  'यो किताब राम्रो छ'
  Tags (Word-Level): [✅ $KEEP 'यो'] [✅ $KEEP 'किताब'] [🗑️ $DELETE 'धेरै'] [✅ $KEEP 'राम्रो'] [✅ $KEEP 'छ'] 
  Stats:    {'delete': 1, 'keep': 4}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁यो             0          ✅ $KEEP             
    ▁किताब          1          ✅ $KEEP             
    ▁धेरै           2          🗑️ $DELETE          
    ▁राम्रो         3          ✅ $KEEP             
    ▁छ              4          ✅ $KEEP             

--- Test Case: Replace ---
  Input:    'मैले काम गरे'
  Correct:  'मैले काम गरें'
  Tags (Word-Level): [✅ $KEEP 'मैले'] [✅ $KEEP 'काम'] [🔄 $REPLACE 'गरे'] 
  Stats:    {'replace': 1, 'keep': 2}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁मैले           0          ✅ $KEEP             
    ▁काम            1          ✅ $KEEP             
    ▁गरे            2          🔄 $REPLACE          

--- Test Case: Neg_DR (No Merge) ---
  Input:    'यो रातो टोपी हो'
  Correct:  'यो नीलो टोपी हो'
  Tags (Word-Level): [✅ $KEEP 'यो'] [🔄 $REPLACE 'रातो'] [✅ $KEEP 'टोपी'] [✅ $KEEP 'हो'] 
  Stats:    {'replace': 1, 'keep': 3}

  Token-Level Mapping (Verification):
    Token           Word ID    Final Label         
    --------------- ---------- --------------------
    ▁यो             0          ✅ $KEEP             
    ▁रातो           1          🔄 $REPLACE          
    ▁टोपी           2          ✅ $KEEP             
    ▁हो             3          ✅ $KEEP             

✓ All critical logic tests passed!

======================================================================
STEP 3: LOADING RAW DATASET
======================================================================
README.md: 100%
 451/451 [00:00<00:00, 41.7kB/s]
data/train-00000-of-00007.parquet: 100%
 217M/217M [00:01<00:00, 171MB/s]
data/train-00001-of-00007.parquet: 100%
 217M/217M [00:02<00:00, 147MB/s]
data/train-00002-of-00007.parquet: 100%
 216M/216M [00:01<00:00, 182MB/s]
data/train-00003-of-00007.parquet: 100%
 217M/217M [00:01<00:00, 123MB/s]
data/train-00004-of-00007.parquet: 100%
 217M/217M [00:02<00:00, 130MB/s]
data/train-00005-of-00007.parquet: 100%
 217M/217M [00:02<00:00, 140MB/s]
data/train-00006-of-00007.parquet: 100%
 217M/217M [00:02<00:00, 130MB/s]
data/valid-00000-of-00001.parquet: 100%
 79.9M/79.9M [00:01<00:00, 58.9MB/s]
Generating train split: 100%
 7723971/7723971 [00:10<00:00, 897164.18 examples/s]
Generating valid split: 100%
 406525/406525 [00:00<00:00, 792514.21 examples/s]
Raw dataset structure:
DatasetDict({
    train: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 7723971
    })
    valid: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 406525
    })
})
Total raw examples to process: 8,130,496

======================================================================
STEP 4: PROCESSING DATASET (Using 106 workers)
This will take several minutes...
======================================================================
Map (num_proc=106): 100%
 8130496/8130496 [01:37<00:00, 12988.00 examples/s]

✓ Dataset processing complete.
  Total examples generated: 16,260,992
  Time taken: 1.65 minutes
  New features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64')), 'is_correct': Value('bool'), 'tag_stats': Value('string'), 'stratify_key': Value('string')}

======================================================================
STEP 4.5: CASTING STRATIFY KEY TO CLASSLABEL FOR STRATIFICATION
======================================================================
Casting the dataset: 100%
 16260992/16260992 [01:39<00:00, 141139.91 examples/s]
✓ 'stratify_key' successfully cast to ClassLabel type for stratification.

======================================================================
STEP 5: CREATING STRATIFIED SPLITS
======================================================================
Final dataset splits:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 13008711
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 2439231
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 813050
    })
})
  Train: 13,008,711
  Valid: 2,439,231
  Calib: 813,050

======================================================================
STEP 6: SAVING DATASET LOCALLY
======================================================================
Saving the dataset (45/45 shards): 100%
 13008711/13008711 [04:14<00:00, 57533.50 examples/s]
Saving the dataset (9/9 shards): 100%
 2439231/2439231 [00:45<00:00, 55115.58 examples/s]
Saving the dataset (3/3 shards): 100%
 813050/813050 [00:14<00:00, 52051.22 examples/s]
Token has not been saved to git credential helper.
✓ Vocabulary saved: ./nepali-gector-style-token-level-tag-for-ged/gec_vocabulary.json
✓ Final dataset and vocabulary saved to ./nepali-gector-style-token-level-tag-for-ged

======================================================================
STEP 7: GENERATING README.MD DATASET CARD
======================================================================
✓ README.md saved to ./nepali-gector-style-token-level-tag-for-ged/README.md

======================================================================
STEP 8: UPLOADING TO HUGGING FACE HUB
======================================================================
Authenticating as DipeshChaudhary...
Cannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.
✓ Authentication successful.
Creating repository: DipeshChaudhary/nepali-gector-style-token-level-tag-for-ged
✓ Repository created or already exists.
Uploading all files from ./nepali-gector-style-token-level-tag-for-ged...
Processing Files (57 / 57)              : 100%
 27.7GB / 27.7GB, 2.14GB/s  
New Data Upload                         : 
  0.00B /  0.00B,  0.00B/s  
  ...ged/train/data-00044-of-00045.arrow: 100%
  492MB /  492MB            
  ...alidation/data-00000-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00001-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00002-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00003-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00004-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00005-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00006-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00008-of-00009.arrow: 100%
  462MB /  462MB            
  ...alidation/data-00007-of-00009.arrow: 100%
  462MB /  462MB            

✅ UPLOAD COMPLETE!

======================================================================
STEP 9: VERIFYING UPLOAD
======================================================================
Loading dataset from Hub: DipeshChaudhary/nepali-gector-style-token-level-tag-for-ged
README.md: 
 2.48k/? [00:00<00:00, 127kB/s]
Resolving data files: 100%
 46/46 [00:00<00:00, 2476.77it/s]
Downloading data: 100%
 45/45 [01:49<00:00,  1.91s/files]
train/data-00000-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 407MB/s]
train/data-00001-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 230MB/s]
train/data-00002-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 312MB/s]
train/data-00003-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 441MB/s]
train/data-00004-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 533MB/s]
train/data-00005-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 396MB/s]
train/data-00006-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 264MB/s]
train/data-00007-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 262MB/s]
train/data-00008-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 250MB/s]
train/data-00009-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 309MB/s]
train/data-00010-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 287MB/s]
train/data-00011-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 385MB/s]
train/data-00012-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 301MB/s]
train/data-00013-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 218MB/s]
train/data-00014-of-00045.arrow: 100%
 492M/492M [00:04<00:00, 223MB/s]
train/data-00015-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 372MB/s]
train/data-00016-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 321MB/s]
train/data-00017-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 303MB/s]
train/data-00018-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 294MB/s]
train/data-00019-of-00045.arrow: 100%
 492M/492M [00:03<00:00, 223MB/s]
train/data-00020-of-00045.arrow: 100%
 492M/492M [00:03<00:00, 218MB/s]
train/data-00021-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 257MB/s]
train/data-00022-of-00045.arrow: 100%
 492M/492M [00:03<00:00, 247MB/s]
train/data-00023-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 244MB/s]
train/data-00024-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 199MB/s]
train/data-00025-of-00045.arrow: 100%
 492M/492M [00:03<00:00, 259MB/s]
train/data-00026-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 304MB/s]
train/data-00027-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 255MB/s]
train/data-00028-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 243MB/s]
train/data-00029-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 313MB/s]
train/data-00030-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 342MB/s]
train/data-00031-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 334MB/s]
train/data-00032-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 320MB/s]
train/data-00033-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 312MB/s]
train/data-00034-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 427MB/s]
train/data-00035-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 300MB/s]
train/data-00036-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 313MB/s]
train/data-00037-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 406MB/s]
train/data-00038-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 384MB/s]
train/data-00039-of-00045.arrow: 100%
 492M/492M [00:02<00:00, 345MB/s]
train/data-00040-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 330MB/s]
train/data-00041-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 331MB/s]
train/data-00042-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 328MB/s]
train/data-00043-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 389MB/s]
train/data-00044-of-00045.arrow: 100%
 492M/492M [00:01<00:00, 358MB/s]
validation/data-00000-of-00009.arrow: 100%
 462M/462M [00:01<00:00, 429MB/s]
validation/data-00001-of-00009.arrow: 100%
 462M/462M [00:01<00:00, 354MB/s]
validation/data-00002-of-00009.arrow: 100%
 462M/462M [00:01<00:00, 369MB/s]
validation/data-00003-of-00009.arrow: 100%
 462M/462M [00:01<00:00, 362MB/s]
validation/data-00004-of-00009.arrow: 100%
 462M/462M [00:01<00:00, 274MB/s]
validation/data-00005-of-00009.arrow: 100%
 462M/462M [00:02<00:00, 247MB/s]
validation/data-00006-of-00009.arrow: 100%
 462M/462M [00:03<00:00, 210MB/s]
validation/data-00007-of-00009.arrow: 100%
 462M/462M [00:01<00:00, 286MB/s]
validation/data-00008-of-00009.arrow: 100%
 462M/462M [00:02<00:00, 255MB/s]
test/data-00000-of-00003.arrow: 100%
 462M/462M [00:02<00:00, 301MB/s]
test/data-00001-of-00003.arrow: 100%
 462M/462M [00:02<00:00, 313MB/s]
test/data-00002-of-00003.arrow: 100%
 462M/462M [00:01<00:00, 323MB/s]
Generating train split: 
 13008711/0 [01:29<00:00, 313522.31 examples/s]
Generating validation split: 
 2439231/0 [00:10<00:00, 90949.60 examples/s]
Generating test split: 
 813050/0 [00:05<00:00, 118811.33 examples/s]
Loading dataset shards: 100%
 45/45 [00:00<00:00, 85.84it/s]
✓ Verification successful! Dataset structure on Hub:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 13008711
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 2439231
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'is_correct', 'tag_stats'],
        num_rows: 813050
    })
})

======================================================================
ALL STEPS COMPLETE.
======================================================================
```