In [None]:
# Mount Google Drive at /content/drive; every path used later in this notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


SP_BPE training

In [None]:
import os
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
import sentencepiece as spm

def monitor_resource_usage(func):
    """Decorator that prints elapsed time and RSS memory growth for each call.

    Args:
        func: The callable to instrument.

    Returns:
        A wrapper with the same signature and metadata as ``func`` that prints
        a start banner, runs ``func``, prints the elapsed seconds and the
        change in this process's resident set size (in MB), and returns
        whatever ``func`` returned.
    """
    import functools
    import os
    import time

    import psutil

    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        process = psutil.Process(os.getpid())
        # perf_counter is monotonic, so clock adjustments cannot skew the timing.
        start_time = time.perf_counter()
        start_mem = process.memory_info().rss

        print(f"🚀 Starting: {func.__name__}")
        result = func(*args, **kwargs)

        end_time = time.perf_counter()
        end_mem = process.memory_info().rss
        # The RSS delta can be negative if memory was released during the call.
        print(f"✅ Completed in {end_time - start_time:.2f}s, RAM used: {(end_mem - start_mem)/1e6:.2f} MB\n")
        return result
    return wrapper

# Training functions
@monitor_resource_usage
def train_hf_bpe(file_path, folder_path, vocab_size):
    """Train a Hugging Face `tokenizers` BPE model and save it as ``hf_bpe.json``.

    Args:
        file_path: Path of the plain-text training corpus.
        folder_path: Existing directory that receives ``hf_bpe.json``.
        vocab_size: Target vocabulary size passed to the trainer.
    """
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    # Tell the model which token stands in for unseen symbols; without this the
    # "[UNK]" entry reserved below is never used by the BPE model at encode time.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
    tokenizer.train([file_path], trainer)
    tokenizer.save(os.path.join(folder_path, "hf_bpe.json"))

@monitor_resource_usage
def train_hf_wordpiece(file_path, folder_path, vocab_size):
    """Train a Hugging Face `tokenizers` WordPiece model and save it as ``hf_wordpiece.json``.

    Args:
        file_path: Path of the plain-text training corpus.
        folder_path: Directory that receives ``hf_wordpiece.json``.
        vocab_size: Target vocabulary size passed to the trainer.
    """
    reserved = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    output_json = os.path.join(folder_path, "hf_wordpiece.json")

    wp_tokenizer = Tokenizer(models.WordPiece())
    wp_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    wp_trainer = trainers.WordPieceTrainer(vocab_size=vocab_size, special_tokens=reserved)

    wp_tokenizer.train([file_path], wp_trainer)
    wp_tokenizer.save(output_json)

@monitor_resource_usage
def train_sp_bpe(file_path, folder_path, vocab_size):
    """Train a SentencePiece BPE model written to ``<folder_path>/sp_bpe.*``.

    Args:
        file_path: Path of the plain-text training corpus.
        folder_path: Directory used as the model prefix location.
        vocab_size: Target vocabulary size.
    """
    # Fixed ids and surface forms for the control pieces.
    control_pieces = {
        "unk_id": 0, "unk_piece": "<unk>",
        "pad_id": 1, "pad_piece": "<pad>",
        "bos_id": 2, "bos_piece": "<s>",
        "eos_id": 3, "eos_piece": "</s>",
    }
    prefix = os.path.join(folder_path, "sp_bpe")
    spm.SentencePieceTrainer.train(
        input=file_path,
        model_prefix=prefix,
        vocab_size=vocab_size,
        model_type='bpe',
        character_coverage=1.0,
        user_defined_symbols=["<mask>"],
        **control_pieces,
    )

@monitor_resource_usage
def train_sp_unigram(file_path, folder_path, vocab_size):
    """Train a SentencePiece unigram model written to ``<folder_path>/sp_unigram.*``.

    The control pieces are configured exactly as in ``train_sp_bpe`` so that
    both SentencePiece models expose the same ``<unk>``/``<pad>``/``<s>``/
    ``</s>``/``<mask>`` pieces with the same ids.

    Args:
        file_path: Path of the plain-text training corpus.
        folder_path: Directory used as the model prefix location.
        vocab_size: Target vocabulary size.
    """
    spm.SentencePieceTrainer.train(
        input=file_path,
        model_prefix=os.path.join(folder_path, "sp_unigram"),
        vocab_size=vocab_size,
        model_type='unigram',
        character_coverage=1.0,
        # Reserve the same special pieces as the BPE model; previously no
        # <pad>/<mask> piece was requested for the unigram model.
        unk_id=0,
        pad_id=1,
        bos_id=2,
        eos_id=3,
        unk_piece="<unk>",
        pad_piece="<pad>",
        bos_piece="<s>",
        eos_piece="</s>",
        user_defined_symbols=["<mask>"]
    )

# Master training function
def train_all_tokenizers(file_path, folder_path, vocab_size, trainers=None):
    """Create ``folder_path`` and run the selected tokenizer trainers against one corpus.

    Args:
        file_path: Path of the plain-text training corpus.
        folder_path: Output directory; created if it does not exist.
        vocab_size: Target vocabulary size forwarded to every trainer.
        trainers: Optional iterable of callables taking
            ``(file_path, folder_path, vocab_size)``. Defaults to
            ``(train_sp_bpe,)``, the only trainer that was enabled before.
            Pass e.g. ``[train_hf_bpe, train_hf_wordpiece, train_sp_bpe,
            train_sp_unigram]`` to train everything instead of editing
            commented-out calls.
    """
    if trainers is None:
        trainers = (train_sp_bpe,)
    os.makedirs(folder_path, exist_ok=True)
    for train_fn in trainers:
        train_fn(file_path, folder_path, vocab_size)

# Train once per corpus size; each size has its own corpus file and vocabulary budget.
base_input = r"/content/drive/MyDrive/Tokenizer_New/balanced_normalized/final_balanced_"
base_output = r"/content/drive/MyDrive/Tokenizer_New"

# corpus size label -> target vocabulary size
settings = {"small": 15000, "medium": 30000, "large": 50000}

for size in settings:
    vocab_size = settings[size]
    print(f"\n📦 Training tokenizers for: {size.upper()} (Vocab size: {vocab_size})")
    # base_input is a filename prefix, so the size is appended directly.
    file_path = f"{base_input}{size}.txt"
    # NOTE(review): no path separator is inserted here, so outputs land in a
    # sibling folder such as ".../Tokenizer_Newsmall" — confirm this is intended.
    folder_path = f"{base_output}{size}"
    train_all_tokenizers(file_path, folder_path, vocab_size)



📦 Training tokenizers for: SMALL (Vocab size: 15000)
🚀 Starting: train_sp_bpe
✅ Completed in 135.99s, RAM used: 1960.63 MB


📦 Training tokenizers for: MEDIUM (Vocab size: 30000)
🚀 Starting: train_sp_bpe
✅ Completed in 486.55s, RAM used: 473.91 MB


📦 Training tokenizers for: LARGE (Vocab size: 50000)
🚀 Starting: train_sp_bpe
✅ Completed in 1209.71s, RAM used: 180.33 MB



In [None]:
!pip install --upgrade datasets fsspec

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver 

In [None]:
import os
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

# Locations of the tokenizers previously trained with the `tokenizers` library
parent_dir = r"/content/drive/MyDrive/Tokeizers/vocab_final"
sizes = ["small", "medium", "large"]
tokenizer_types = [
    ("hf_bpe", "hf_bpe.json"),
    ("hf_wordpiece", "hf_wordpiece.json")
]

# Special-token roles handed to the Hugging Face wrapper
hf_special_tokens = dict(
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

for size in sizes:
    size_dir = os.path.join(parent_dir, f"vocab_final{size}")
    for tok_dir, tok_file in tokenizer_types:
        input_json_path = os.path.join(size_dir, tok_dir, tok_file)
        output_tokenizer_dir = os.path.join(size_dir, f"{tok_dir}_hf")

        if os.path.exists(input_json_path):
            os.makedirs(output_tokenizer_dir, exist_ok=True)
            print(f"🔁 Rewrapping {input_json_path} -> {output_tokenizer_dir}")

            # Best-effort conversion: report the failure and move on to the next tokenizer.
            try:
                # Raw tokenizer as saved by the `tokenizers` library
                tokenizer_obj = Tokenizer.from_file(input_json_path)

                # Wrap it so `transformers` can load it with from_pretrained
                hf_tokenizer = PreTrainedTokenizerFast(
                    tokenizer_object=tokenizer_obj,
                    **hf_special_tokens
                )

                # Writes the tokenizer files into output_tokenizer_dir
                hf_tokenizer.save_pretrained(output_tokenizer_dir)
                print(f"✅ Wrapped and saved to: {output_tokenizer_dir}\n")

            except Exception as e:
                print(f"❌ Failed to convert {input_json_path}: {e}\n")
        else:
            print(f"❌ Not found: {input_json_path} (Skipping)")


🔁 Rewrapping /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_bpe/hf_bpe.json -> /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_bpe_hf
✅ Wrapped and saved to: /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_bpe_hf

🔁 Rewrapping /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_wordpiece/hf_wordpiece.json -> /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_wordpiece_hf
✅ Wrapped and saved to: /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_wordpiece_hf

🔁 Rewrapping /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalmedium/hf_bpe/hf_bpe.json -> /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalmedium/hf_bpe_hf
✅ Wrapped and saved to: /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalmedium/hf_bpe_hf

🔁 Rewrapping /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalmedium/hf_wordpiece/hf_wordpiece.json -> /content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalmedium/hf

In [None]:
from transformers import AutoTokenizer

# Smoke test: load the re-wrapped small BPE tokenizer through AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    r"/content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_bpe_hf"
)


In [None]:
import json, os

# Sanity check: the saved tokenizer.json must parse as JSON; show its top-level keys.
path = r"/content/drive/MyDrive/Tokeizers/vocab_final/vocab_finalsmall/hf_bpe_hf/tokenizer.json"
with open(path, 'r', encoding='utf-8') as f:
    data = json.loads(f.read())

top_level_keys = list(data.keys())
print(f"✅ tokenizer.json loaded successfully. Keys: {top_level_keys}")


✅ tokenizer.json loaded successfully. Keys: ['version', 'truncation', 'padding', 'added_tokens', 'normalizer', 'pre_tokenizer', 'post_processor', 'decoder', 'model']


In [None]:
!pip install protobuf



In [None]:
!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py

--2025-07-22 07:04:47--  https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6257 (6.1K) [text/plain]
Saving to: ‘sentencepiece_model_pb2.py’


2025-07-22 07:04:47 (64.8 MB/s) - ‘sentencepiece_model_pb2.py’ saved [6257/6257]



In [None]:
import os
import sentencepiece as spm
from tokenizers.implementations import SentencePieceUnigramTokenizer
from tokenizers.processors import BertProcessing
from transformers import PreTrainedTokenizerFast

def get_spm_special_tokens(sp_model_path):
    """Return the fixed special-token mapping, warning about pieces the model lacks.

    Args:
        sp_model_path: Path to a SentencePiece ``.model`` file.

    Returns:
        Dict mapping the five roles (``unk_token``, ``pad_token``,
        ``cls_token``, ``sep_token``, ``mask_token``) to their pieces. The
        full mapping is always returned; a missing piece only triggers a
        printed warning.
    """
    sp = spm.SentencePieceProcessor()
    sp.load(sp_model_path)

    # Collect every piece the model knows about.
    known_pieces = set()
    for piece_id in range(sp.get_piece_size()):
        known_pieces.add(sp.id_to_piece(piece_id))

    special_tokens = dict(
        unk_token="<unk>",   # Force-insert
        pad_token="<pad>",
        cls_token="<s>",
        sep_token="</s>",
        mask_token="<mask>",
    )

    absent = [tok for tok in special_tokens.values() if tok not in known_pieces]
    for tok in absent:
        print(f"⚠️ Missing special token in vocab: {tok}")

    return special_tokens


def convert_sp_model_to_hf(sp_model_path, save_dir):
    """Convert a SentencePiece unigram ``.model`` into a Hugging Face tokenizer folder.

    Args:
        sp_model_path: Path to the SentencePiece ``.model`` file.
        save_dir: Output directory (created if needed) that receives
            ``tokenizer.json`` and the files written by ``save_pretrained``.
    """
    os.makedirs(save_dir, exist_ok=True)
    special_tokens = get_spm_special_tokens(sp_model_path)

    # Warn if any role needed downstream is absent from the mapping.
    for req in ('unk_token', 'pad_token', 'cls_token', 'sep_token'):
        if req in special_tokens:
            continue
        print(f"⚠️ WARNING: Required special token {req} missing in vocab of {sp_model_path}. "
              f"Downstream issues may occur.")

    tokenizer = SentencePieceUnigramTokenizer.from_spm(sp_model_path)
    # Registers every mapped token as special on the tokenizer.
    tokenizer.add_special_tokens(list(special_tokens.values()))

    has_cls_and_sep = all(role in special_tokens for role in ("cls_token", "sep_token"))
    if has_cls_and_sep:
        sep_tok = special_tokens["sep_token"]
        cls_tok = special_tokens["cls_token"]
        # BertProcessing takes the (token, id) pair for sep first, then cls.
        tokenizer.post_processor = BertProcessing(
            (sep_tok, tokenizer.token_to_id(sep_tok)),
            (cls_tok, tokenizer.token_to_id(cls_tok))
        )

    # Persist the raw tokenizer, then wrap that file for `transformers`.
    tokenizer_json_path = os.path.join(save_dir, "tokenizer.json")
    tokenizer.save(tokenizer_json_path)
    hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_json_path, **special_tokens)
    hf_tokenizer.save_pretrained(save_dir)
    print(f"✅ Saved Hugging Face-compatible tokenizer to {save_dir} with special tokens: {special_tokens}")

# Convert the unigram model of every corpus size
parent_dir = "/content/drive/MyDrive/Tokenizer_New/vocab_final"
sizes = ["small", "medium", "large"]
sp_tokenizer_types = [
    ("sp_unigram", "sp_unigram.model"),
]

for size in sizes:
    size_root = os.path.join(parent_dir, f"vocab_final{size}")
    for tok_dir, sp_model_file in sp_tokenizer_types:
        sp_model_path = os.path.join(size_root, tok_dir, sp_model_file)
        save_dir = os.path.join(size_root, tok_dir + "_hf")
        if os.path.exists(sp_model_path):
            # Best-effort: a failure for one model must not stop the others.
            try:
                convert_sp_model_to_hf(sp_model_path, save_dir)
            except Exception as e:
                print(f"❌ Failed for {sp_model_path}: {e}")
        else:
            print(f"❌ {sp_model_path} not found. Skipping.")


⚠️ Missing special token in vocab: <pad>
⚠️ Missing special token in vocab: <mask>
✅ Saved Hugging Face-compatible tokenizer to /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalsmall/sp_unigram_hf with special tokens: {'unk_token': '<unk>', 'pad_token': '<pad>', 'cls_token': '<s>', 'sep_token': '</s>', 'mask_token': '<mask>'}
⚠️ Missing special token in vocab: <pad>
⚠️ Missing special token in vocab: <mask>
✅ Saved Hugging Face-compatible tokenizer to /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalmedium/sp_unigram_hf with special tokens: {'unk_token': '<unk>', 'pad_token': '<pad>', 'cls_token': '<s>', 'sep_token': '</s>', 'mask_token': '<mask>'}
⚠️ Missing special token in vocab: <pad>
⚠️ Missing special token in vocab: <mask>
✅ Saved Hugging Face-compatible tokenizer to /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finallarge/sp_unigram_hf with special tokens: {'unk_token': '<unk>', 'pad_token': '<pad>', 'cls_token': '<s>', 'sep_token': '</s>', 'mask_

In [None]:
# # import os
# # from tokenizers.implementations import SentencePieceBPETokenizer
# # from tokenizers.processors import BertProcessing
# # from transformers import PreTrainedTokenizerFast

# # def convert_sp_bpe_to_hf(sp_model_path, save_dir):
# #     os.makedirs(save_dir, exist_ok=True)

# #     # Define the special tokens your SPM-BPE model was trained with
# #     special_tokens = {
# #         "unk_token": "<unk>",
# #         "pad_token": "<pad>",
# #         "cls_token": "<s>",
# #         "sep_token": "</s>",
# #         "mask_token": "<mask>"
# #     }

# #     # Load the SentencePiece BPE tokenizer
# #     tokenizer = SentencePieceBPETokenizer(sp_model_path)

# #     # Add the special tokens to tokenizer
# #     tokenizer.add_special_tokens(list(special_tokens.values()))

# #     # Set up BERT-style [CLS] and [SEP] post-processing
# #     tokenizer.post_processor = BertProcessing(
# #         (special_tokens["sep_token"], tokenizer.token_to_id(special_tokens["sep_token"])),
# #         (special_tokens["cls_token"], tokenizer.token_to_id(special_tokens["cls_token"]))
# #     )

# #     # Save tokenizer to tokenizer.json
# #     tokenizer_json_path = os.path.join(save_dir, "tokenizer.json")
# #     tokenizer.save(tokenizer_json_path)

# #     # Wrap it using PreTrainedTokenizerFast for HF compatibility
# #     hf_tokenizer = PreTrainedTokenizerFast(
# #         tokenizer_file=tokenizer_json_path,
# #         **special_tokens
# #     )

# #     # Save to Hugging Face format
# #     hf_tokenizer.save_pretrained(save_dir)
# #     print(f"✅ Saved Hugging Face tokenizer to {save_dir} with special tokens: {special_tokens}")


# # # === Use this for batch conversion for sp_bpe only ===
# # parent_dir = "/content/drive/MyDrive/Tokenizer_New"
# # sizes = ["small", "medium", "large"]
# # sp_tokenizer_types = [
# #     ("sp_bpe", "sp_bpe.model"),
# # ]

# # for size in sizes:
# #     for tok_dir, sp_model_file in sp_tokenizer_types:
# #         sp_model_path = os.path.join(parent_dir+f"{size}", sp_model_file)
# #         save_dir = os.path.join(parent_dir+f"{size}", f"vocab_final{size}", tok_dir + "_hf")

# #         if not os.path.exists(sp_model_path):
# #             print(f"❌ {sp_model_path} not found. Skipping.")
# #             continue

# #         try:
# #             convert_sp_bpe_to_hf(sp_model_path, save_dir)
# #         except Exception as e:
# #             print(f"❌ Failed for {sp_model_path}: {e}")

# import os
# from transformers import PreTrainedTokenizerFast

# def convert_sp_bpe_to_hf(sp_model_path, save_dir):
#     os.makedirs(save_dir, exist_ok=True)

#     # Define special tokens consistent with SentencePiece training
#     special_tokens = {
#         "unk_token": "<unk>",
#         "pad_token": "<pad>",
#         "cls_token": "<s>",
#         "sep_token": "</s>",
#         "mask_token": "<mask>"
#     }

#     # Create Hugging Face-compatible tokenizer directly from the .model file
#     tokenizer = PreTrainedTokenizerFast(
#         sp_model_kwargs={"model_file": sp_model_path},
#         **special_tokens
#     )

#     # Save tokenizer in Hugging Face format
#     tokenizer.save_pretrained(save_dir)
#     print(f"✅ Saved HF-compatible tokenizer to: {save_dir}")
#     print(f"🧠 Special tokens used: {special_tokens}")


# # === Batch convert SentencePiece .model files for each size ===
# parent_dir = "/content/drive/MyDrive/Tokenizer_New"
# sizes = ["small", "medium", "large"]
# sp_tokenizer_types = [
#     ("sp_bpe", "sp_bpe.model"),
# ]

# for size in sizes:
#     for tok_dir, sp_model_file in sp_tokenizer_types:
#         sp_model_path = os.path.join(parent_dir, f"Tokenizer_New{size}", sp_model_file)
#         save_dir = os.path.join(parent_dir ,f"Tokenizer_New{size}", f"vocab_final{size}", tok_dir + "_hf")

#         if not os.path.exists(sp_model_path):
#             print(f"❌ {sp_model_path} not found. Skipping.")
#             continue

#         try:
#             convert_sp_bpe_to_hf(sp_model_path, save_dir)
#         except Exception as e:
#             print(f"❌ Failed for {sp_model_path}: {e}")


❌ Failed for /content/drive/MyDrive/Tokenizer_New/Tokenizer_Newsmall/sp_bpe.model: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokeniz

In [None]:
import os
import json
import torch
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    BertForTokenClassification,
    Trainer,
    TrainingArguments
)
from torch.utils.data import Dataset
from sklearn.metrics import classification_report


In [None]:
# Load JSONL
def load_jsonl(filepath):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are skipped.
    """
    records = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records

# Label dictionaries (shared across all tokenizers)
def build_label_maps(*datasets):
    """Build ``label2id`` / ``id2label`` from the tags of one or more datasets.

    Every item's ``"tags"`` list is replaced in place by its string-converted
    copy, so the caller's data is normalised as a side effect.

    Args:
        *datasets: Iterables of dicts, each with a ``"tags"`` sequence.

    Returns:
        ``(label2id, id2label)`` where ids follow the sorted order of the
        distinct tag strings; the tag ``'O'`` is always included.
    """
    # Seeding with 'O' guarantees it is present even if no item uses it.
    tag_vocab = {'O'}
    for dataset in datasets:
        for item in dataset:
            item["tags"] = [str(tag) for tag in item["tags"]]
            tag_vocab.update(item["tags"])

    label2id, id2label = {}, {}
    for idx, tag in enumerate(sorted(tag_vocab)):
        label2id[tag] = idx
        id2label[idx] = tag
    return label2id, id2label

# PyTorch Dataset for BERT Token Classification
class PosDataset(Dataset):
    """Torch dataset that turns pre-split tokens plus tags into token-classification tensors.

    Each item of ``data`` is a dict with ``"tokens"`` and ``"tags"``. Only the
    first sub-token of every word receives the word's label id; all other
    positions are labelled ``-100``.
    """

    def __init__(self, data, tokenizer, label2id, max_length=128):
        self.data, self.tokenizer = data, tokenizer
        self.label2id, self.max_length = label2id, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        tokens = sample["tokens"]
        tags = sample["tags"]

        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            max_length=self.max_length,
            return_offsets_mapping=True,
            truncation=True,
            padding="max_length"
        )

        # Start with every position ignored, then label first sub-tokens only.
        labels = [-100] * len(encoding["input_ids"])
        last_word = None
        for position, word_id in enumerate(encoding.word_ids()):
            if word_id is not None:
                if word_id != last_word:
                    # Tags unknown to label2id fall back to the 'O' id.
                    labels[position] = self.label2id.get(tags[word_id], self.label2id['O'])
                last_word = word_id

        encoding["labels"] = labels
        wanted = ("input_ids", "attention_mask", "labels")
        return {key: torch.tensor(value) for key, value in encoding.items() if key in wanted}

In [None]:
def compute_metrics(pred, id2label):
    """Compute accuracy and F1 summaries for token-classification predictions.

    Args:
        pred: Object with ``predictions`` (per-class scores; argmax is taken
            over the last axis) and ``label_ids``.
        id2label: Mapping from label id to tag string.

    Returns:
        Dict with ``accuracy``, ``f1_macro``, ``f1_weighted`` and the full
        classification ``report`` dict. Positions labelled ``-100`` are
        excluded.
    """
    predicted_ids = pred.predictions.argmax(-1)
    gold_ids = pred.label_ids

    # Keep only (gold, predicted) pairs at positions that carry a real label.
    kept = [
        (gold, guess)
        for guess_row, gold_row in zip(predicted_ids, gold_ids)
        for guess, gold in zip(guess_row, gold_row)
        if gold != -100
    ]
    true_tags = [id2label[gold] for gold, _ in kept]
    pred_tags = [id2label[guess] for _, guess in kept]

    report = classification_report(true_tags, pred_tags, output_dict=True)
    metrics = {"accuracy": report["accuracy"]}
    metrics["f1_macro"] = report["macro avg"]["f1-score"]
    metrics["f1_weighted"] = report["weighted avg"]["f1-score"]
    metrics["report"] = report
    return metrics


In [None]:
# Root folder on Drive that holds the tokenizers, the POS data and the trained models.
base_path = r"/content/drive/MyDrive/Tokenizer_New"
# tokenizer_base = os.path.join(base_path, 'vocab_final')

# Folder containing the POS-tagging JSONL splits.
dataset_dir = os.path.join(base_path, 'output')

# Paths
train_path = os.path.join(dataset_dir, 'train_pos.jsonl')
test_path = os.path.join(dataset_dir, 'test_pos.jsonl')
# accuracy_log_file = "POS_accuracy.txt"

# Clear existing accuracy log
# with open(accuracy_log_file, 'w'): pass

# Tokenizer settings
# The commented lists are the full sweep; the active values restrict the run
# to a single size / tokenizer combination.
# tokenizer_sizes = ["small", "medium", "large"]
# tokenizer_types = ["hf_bpe_hf", "hf_wordpiece_hf", "sp_unigram_hf"]
tokenizer_sizes = ["small"]
tokenizer_types = ["hf_wordpiece_hf"]
# tokenizer_types = [ "sp_bpe_hf"]


# Retraining a single model

In [None]:


import os
import csv
import torch
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from transformers import (
    AutoTokenizer,
    BertForTokenClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    logging as hf_logging
)

# Suppress warnings and Hugging Face logs
# UndefinedMetricWarning is silenced for the whole session; transformers logging is set to errors only.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
hf_logging.set_verbosity_error()

# === 🧩 Custom Callback to print metrics after each epoch ===
class PrintMetricsCallback(TrainerCallback):
    """Trainer callback that prints every float-valued metric after an evaluation."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # Nothing to report when no (or empty) metrics were passed.
        if not metrics:
            return
        print("\n📊 Evaluation Results:")
        float_metrics = ((name, val) for name, val in metrics.items() if isinstance(val, float))
        for name, val in float_metrics:
            print(f"  {name}: {val:.4f}")

# === 🧠 Metric Computation Function (No detailed report) ===
def compute_metrics(pred, id2label):
    """Compute accuracy, macro-F1 and weighted-F1 for token-classification predictions.

    Args:
        pred: Object with ``predictions`` (per-class scores; argmax is taken
            over the last axis) and ``label_ids``.
        id2label: Mapping from label id to tag string.

    Returns:
        Dict with ``accuracy``, ``f1_macro`` and ``f1_weighted``. Positions
        labelled ``-100`` are excluded.
    """
    from sklearn.metrics import classification_report
    predicted_ids = pred.predictions.argmax(-1)
    gold_ids = pred.label_ids

    # Keep only (gold, predicted) pairs at positions that carry a real label.
    kept = [
        (gold, guess)
        for guess_row, gold_row in zip(predicted_ids, gold_ids)
        for guess, gold in zip(guess_row, gold_row)
        if gold != -100
    ]
    true_tags = [id2label[gold] for gold, _ in kept]
    pred_tags = [id2label[guess] for _, guess in kept]

    report = classification_report(true_tags, pred_tags, output_dict=True)
    metrics = {"accuracy": report["accuracy"]}
    metrics["f1_macro"] = report["macro avg"]["f1-score"]
    metrics["f1_weighted"] = report["weighted avg"]["f1-score"]
    return metrics

# === 🚀 Load Data (relies on load_jsonl / build_label_maps and the path cell above) ===
train_data, test_data = load_jsonl(train_path), load_jsonl(test_path)
label2id, id2label = build_label_maps(train_data, test_data)
num_labels = len(label2id)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# === 📂 Logging Setup ===
accuracy_log_file = "/content/drive/MyDrive/Tokenizer_New/POS_models/accuracy_log.csv"
os.makedirs(os.path.dirname(accuracy_log_file), exist_ok=True)
csv_header = ["Model Name", "Epoch"]
csv_header += ["Train Accuracy", "Train F1 Macro", "Train F1 Weighted"]
csv_header += ["Test Accuracy", "Test F1 Macro", "Test F1 Weighted"]
# Write the header only once; later runs append rows to the existing file.
if not os.path.isfile(accuracy_log_file):
    with open(accuracy_log_file, 'w', newline='', encoding='utf-8') as csvfile:
        header_writer = csv.writer(csvfile)
        header_writer.writerow(csv_header)

# === 🔄 Model Training Loop ===
# Single source of truth for the epoch count: used by TrainingArguments and
# written to the CSV log, so the two can no longer drift apart.
num_epochs = 3  # ⬅️ Adjust as needed

for size in tokenizer_sizes:
    for tok_type in tokenizer_types:
        model_name = f"{size}_{tok_type}_POS"
        tokenizer_path = os.path.join(base_path, "vocab_final", f"vocab_final{size}", tok_type)
        model_output_dir = f"/content/drive/MyDrive/Tokenizer_New/POS_models/{model_name}"

        print(f"\n🔧 Running: {model_name}")
        print("🔍 Loading tokenizer from:", tokenizer_path)

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
        train_dataset = PosDataset(train_data, tokenizer, label2id)
        test_dataset = PosDataset(test_data, tokenizer, label2id)

        model = BertForTokenClassification.from_pretrained(
            "bert-base-cased",
            num_labels=num_labels,
            label2id=label2id,
            id2label=id2label,
            ignore_mismatched_sizes=True
        )
        # NOTE(review): the embedding matrix is resized to the custom vocabulary,
        # but the retained pretrained rows were learned for bert-base-cased ids,
        # not for this tokenizer's ids — confirm this initialisation is intended.
        model.resize_token_embeddings(len(tokenizer))
        model.to(device)

        args = TrainingArguments(
            output_dir=model_output_dir,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=num_epochs,
            eval_strategy="epoch",
            logging_strategy="epoch",
            learning_rate=5e-5,
            logging_dir=os.path.join(model_output_dir, "logs"),
            save_strategy="no",
            report_to=[],
            disable_tqdm=False  # ⬅️ Show progress bar
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
            processing_class=tokenizer,
            compute_metrics=lambda x: compute_metrics(x, id2label),
            callbacks=[PrintMetricsCallback()]  # 👈 Add metrics logger
        )

        trainer.train()
        # Final scores on both splits; metric keys come back prefixed with "eval_".
        train_metrics = trainer.evaluate(train_dataset)
        test_metrics = trainer.evaluate(test_dataset)

        print(f"✅ {model_name} training completed")
        print("📌 Logging results to CSV")

        epoch = num_epochs
        with open(accuracy_log_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([
                model_name, epoch,
                round(train_metrics.get('eval_accuracy', 0.0), 4),
                round(train_metrics.get('eval_f1_macro', 0.0), 4),
                round(train_metrics.get('eval_f1_weighted', 0.0), 4),
                round(test_metrics.get('eval_accuracy', 0.0), 4),
                round(test_metrics.get('eval_f1_macro', 0.0), 4),
                round(test_metrics.get('eval_f1_weighted', 0.0), 4),
            ])

        model.save_pretrained(model_output_dir)
        tokenizer.save_pretrained(model_output_dir)
        # Release cached GPU memory before the next configuration is trained.
        torch.cuda.empty_cache()



🔧 Running: small_hf_wordpiece_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalsmall/hf_wordpiece_hf


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.6846,1.090986,0.668331,0.247713,0.640194
2,0.9304,0.804323,0.752239,0.340665,0.737343
3,0.7076,0.738841,0.773577,0.380381,0.760934



📊 Evaluation Results:
  eval_loss: 1.0910
  eval_accuracy: 0.6683
  eval_f1_macro: 0.2477
  eval_f1_weighted: 0.6402
  eval_runtime: 9.8821
  eval_samples_per_second: 126.2890
  eval_steps_per_second: 7.8930
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 0.8043
  eval_accuracy: 0.7522
  eval_f1_macro: 0.3407
  eval_f1_weighted: 0.7373
  eval_runtime: 9.8947
  eval_samples_per_second: 126.1290
  eval_steps_per_second: 7.8830
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.7388
  eval_accuracy: 0.7736
  eval_f1_macro: 0.3804
  eval_f1_weighted: 0.7609
  eval_runtime: 9.8967
  eval_samples_per_second: 126.1030
  eval_steps_per_second: 7.8810
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.5422
  eval_accuracy: 0.8309
  eval_f1_macro: 0.3838
  eval_f1_weighted: 0.8206
  eval_runtime: 39.5613
  eval_samples_per_second: 126.1080
  eval_steps_per_second: 7.8860
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.7388
  eval_accuracy: 0.7736
  eval_f1_macro: 0.3804
  eval_f1_weighted: 0.7609
  eval_runtime: 9.9642
  eval_samples_per_second: 125.2490
  eval_steps_per_second: 7.8280
  epoch: 3.0000
✅ small_hf_wordpiece_hf_POS training completed
📌 Logging results to CSV


# Final POS

In [None]:


import os
import csv
import torch
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from transformers import (
    AutoTokenizer,
    BertForTokenClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    logging as hf_logging
)

# Suppress warnings and Hugging Face logs
# UndefinedMetricWarning is silenced for the whole session; transformers logging is set to errors only.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
hf_logging.set_verbosity_error()

# === 🧩 Custom Callback to print metrics after each epoch ===
class PrintMetricsCallback(TrainerCallback):
    """Trainer callback that prints every float-valued metric after an evaluation."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # Nothing to report when no (or empty) metrics were passed.
        if not metrics:
            return
        print("\n📊 Evaluation Results:")
        float_metrics = ((name, val) for name, val in metrics.items() if isinstance(val, float))
        for name, val in float_metrics:
            print(f"  {name}: {val:.4f}")

# === 🧠 Metric Computation Function (No detailed report) ===
def compute_metrics(pred, id2label):
    """Compute token-level accuracy, macro F1 and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (per-token class scores; the arg-max over
            the last axis is taken as the predicted label id) and `label_ids`
            (gold label ids). Positions whose gold label id is -100 are skipped.
        id2label: Mapping from integer label id to tag string.

    Returns:
        dict with keys "accuracy", "f1_macro" and "f1_weighted". All three are 0.0
        when no position carries a gold label.
    """
    import numpy as np
    from sklearn.metrics import classification_report

    pred_ids = np.asarray(pred.predictions.argmax(-1))
    gold_ids = np.asarray(pred.label_ids)

    # One boolean mask replaces the nested per-token Python loop.
    keep = gold_ids != -100
    true_tags = [id2label[i] for i in gold_ids[keep].tolist()]
    pred_tags = [id2label[i] for i in pred_ids[keep].tolist()]

    # Nothing to score: return explicit zeros instead of asking sklearn to build a
    # report from empty inputs.
    if not true_tags:
        return {"accuracy": 0.0, "f1_macro": 0.0, "f1_weighted": 0.0}

    # zero_division=0 yields the same numbers as the default ("warn") but without
    # emitting a warning for every tag that is never predicted.
    report = classification_report(true_tags, pred_tags, output_dict=True, zero_division=0)
    return {
        "accuracy": report["accuracy"],
        "f1_macro": report["macro avg"]["f1-score"],
        "f1_weighted": report["weighted avg"]["f1-score"]
    }

# === 🚀 Load Data ===
# load_jsonl, build_label_maps, train_path and test_path are not defined in this
# cell; they must already exist in the kernel.
train_data, test_data = load_jsonl(train_path), load_jsonl(test_path)
label2id, id2label = build_label_maps(train_data, test_data)
num_labels = len(label2id)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# === 📂 Logging Setup ===
accuracy_log_file = "/content/drive/MyDrive/Tokenizer_New/POS_models/accuracy_log.csv"
csv_header = [
    "Model Name", "Epoch",
    "Train Accuracy", "Train F1 Macro", "Train F1 Weighted",
    "Test Accuracy", "Test F1 Macro", "Test F1 Weighted",
]
os.makedirs(os.path.dirname(accuracy_log_file), exist_ok=True)

# Create the log with its header row only when the file does not exist yet, so an
# existing log is never overwritten.
log_already_exists = os.path.isfile(accuracy_log_file)
if not log_already_exists:
    with open(accuracy_log_file, 'w', newline='', encoding='utf-8') as csvfile:
        header_writer = csv.writer(csvfile)
        header_writer.writerow(csv_header)

# === 🔄 Model Training Loop ===
# For every (vocabulary size, tokenizer type) pair: fine-tune a BERT token classifier,
# evaluate it on the train and test splits, append one row to the accuracy CSV and
# save the model and tokenizer to the model's output directory.
# tokenizer_sizes, tokenizer_types, base_path and PosDataset are not defined in this
# cell; they must already exist in the kernel.
NUM_TRAIN_EPOCHS = 3  # ⬅️ Adjust as needed; also recorded in the CSV "Epoch" column

for size in tokenizer_sizes:
    for tok_type in tokenizer_types:
        # Release the previous run's model and trainer before loading the next model.
        # While those names are still bound, empty_cache() cannot give their memory back.
        # Doing this at the start of the iteration (rather than the end) leaves the last
        # model/trainer available after the loop.
        model = trainer = None
        gc.collect()
        torch.cuda.empty_cache()

        model_name = f"{size}_{tok_type}_POS"
        tokenizer_path = os.path.join(base_path, "vocab_final", f"vocab_final{size}", tok_type)
        model_output_dir = f"/content/drive/MyDrive/Tokenizer_New/POS_models/{model_name}"

        print(f"\n🔧 Running: {model_name}")
        print("🔍 Loading tokenizer from:", tokenizer_path)

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
        train_dataset = PosDataset(train_data, tokenizer, label2id)
        test_dataset = PosDataset(test_data, tokenizer, label2id)

        # NOTE(review): the checkpoint's embedding rows were learned for
        # bert-base-cased's own vocabulary; with a custom tokenizer the token ids
        # presumably no longer correspond to those rows — confirm this
        # initialisation is intended.
        model = BertForTokenClassification.from_pretrained(
            "bert-base-cased",
            num_labels=num_labels,
            label2id=label2id,
            id2label=id2label,
            ignore_mismatched_sizes=True
        )
        model.resize_token_embeddings(len(tokenizer))
        model.to(device)

        args = TrainingArguments(
            output_dir=model_output_dir,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=NUM_TRAIN_EPOCHS,
            eval_strategy="epoch",
            logging_strategy="epoch",
            learning_rate=5e-5,
            logging_dir=os.path.join(model_output_dir, "logs"),
            save_strategy="no",
            report_to=[],
            disable_tqdm=False  # ⬅️ Show progress bar
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
            compute_metrics=lambda x: compute_metrics(x, id2label),
            callbacks=[PrintMetricsCallback()]  # 👈 Add metrics logger
        )

        trainer.train()

        # Label the two final evaluations; the callback prints them in the same format.
        print("\n📈 Final evaluation on the training split")
        train_metrics = trainer.evaluate(train_dataset)
        print("\n🧪 Final evaluation on the test split")
        test_metrics = trainer.evaluate(test_dataset)

        print(f"✅ {model_name} training completed")
        print("📌 Logging results to CSV")

        epoch = NUM_TRAIN_EPOCHS  # kept in sync with TrainingArguments above
        metric_keys = ('eval_accuracy', 'eval_f1_macro', 'eval_f1_weighted')
        with open(accuracy_log_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            # Column order matches csv_header: train metrics first, then test metrics.
            writer.writerow(
                [model_name, epoch]
                + [round(train_metrics.get(key, 0.0), 4) for key in metric_keys]
                + [round(test_metrics.get(key, 0.0), 4) for key in metric_keys]
            )

        model.save_pretrained(model_output_dir)
        tokenizer.save_pretrained(model_output_dir)
        torch.cuda.empty_cache()



🔧 Running: small_hf_bpe_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalsmall/hf_bpe_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.8206,1.253701,0.622557,0.216492,0.596295
2,1.0981,0.980833,0.695459,0.278272,0.67896
3,0.8684,0.913823,0.720705,0.297295,0.705706



📊 Evaluation Results:
  eval_loss: 1.2537
  eval_accuracy: 0.6226
  eval_f1_macro: 0.2165
  eval_f1_weighted: 0.5963
  eval_runtime: 8.8563
  eval_samples_per_second: 140.9170
  eval_steps_per_second: 8.8070
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 0.9808
  eval_accuracy: 0.6955
  eval_f1_macro: 0.2783
  eval_f1_weighted: 0.6790
  eval_runtime: 8.7022
  eval_samples_per_second: 143.4120
  eval_steps_per_second: 8.9630
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.9138
  eval_accuracy: 0.7207
  eval_f1_macro: 0.2973
  eval_f1_weighted: 0.7057
  eval_runtime: 8.7907
  eval_samples_per_second: 141.9690
  eval_steps_per_second: 8.8730
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.7000
  eval_accuracy: 0.7847
  eval_f1_macro: 0.2957
  eval_f1_weighted: 0.7722
  eval_runtime: 35.2379
  eval_samples_per_second: 141.5800
  eval_steps_per_second: 8.8540
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.9138
  eval_accuracy: 0.7207
  eval_f1_macro: 0.2973
  eval_f1_weighted: 0.7057
  eval_runtime: 8.6757
  eval_samples_per_second: 143.8490
  eval_steps_per_second: 8.9910
  epoch: 3.0000
✅ small_hf_bpe_hf_POS training completed
📌 Logging results to CSV

🔧 Running: small_hf_wordpiece_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalsmall/hf_wordpiece_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.6788,1.058406,0.67755,0.255755,0.659275
2,0.9007,0.766397,0.763625,0.357712,0.751308
3,0.6821,0.709198,0.783033,0.395188,0.772219



📊 Evaluation Results:
  eval_loss: 1.0584
  eval_accuracy: 0.6776
  eval_f1_macro: 0.2558
  eval_f1_weighted: 0.6593
  eval_runtime: 8.7115
  eval_samples_per_second: 143.2590
  eval_steps_per_second: 8.9540
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 0.7664
  eval_accuracy: 0.7636
  eval_f1_macro: 0.3577
  eval_f1_weighted: 0.7513
  eval_runtime: 8.8034
  eval_samples_per_second: 141.7630
  eval_steps_per_second: 8.8600
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.7092
  eval_accuracy: 0.7830
  eval_f1_macro: 0.3952
  eval_f1_weighted: 0.7722
  eval_runtime: 8.8583
  eval_samples_per_second: 140.8850
  eval_steps_per_second: 8.8050
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.5149
  eval_accuracy: 0.8413
  eval_f1_macro: 0.3889
  eval_f1_weighted: 0.8325
  eval_runtime: 35.4375
  eval_samples_per_second: 140.7830
  eval_steps_per_second: 8.8040
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.7092
  eval_accuracy: 0.7830
  eval_f1_macro: 0.3952
  eval_f1_weighted: 0.7722
  eval_runtime: 8.9310
  eval_samples_per_second: 139.7380
  eval_steps_per_second: 8.7340
  epoch: 3.0000
✅ small_hf_wordpiece_hf_POS training completed
📌 Logging results to CSV

🔧 Running: small_sp_unigram_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalsmall/sp_unigram_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.8664,1.24919,0.616414,0.214627,0.585714
2,1.0757,0.926874,0.711538,0.288657,0.69737
3,0.8064,0.824974,0.748409,0.329877,0.734043



📊 Evaluation Results:
  eval_loss: 1.2492
  eval_accuracy: 0.6164
  eval_f1_macro: 0.2146
  eval_f1_weighted: 0.5857
  eval_runtime: 8.8669
  eval_samples_per_second: 140.7480
  eval_steps_per_second: 8.7970
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 0.9269
  eval_accuracy: 0.7115
  eval_f1_macro: 0.2887
  eval_f1_weighted: 0.6974
  eval_runtime: 8.8955
  eval_samples_per_second: 140.2960
  eval_steps_per_second: 8.7680
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.8250
  eval_accuracy: 0.7484
  eval_f1_macro: 0.3299
  eval_f1_weighted: 0.7340
  eval_runtime: 8.9152
  eval_samples_per_second: 139.9850
  eval_steps_per_second: 8.7490
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.6272
  eval_accuracy: 0.8067
  eval_f1_macro: 0.3209
  eval_f1_weighted: 0.7947
  eval_runtime: 35.5970
  eval_samples_per_second: 140.1520
  eval_steps_per_second: 8.7650
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.8250
  eval_accuracy: 0.7484
  eval_f1_macro: 0.3299
  eval_f1_weighted: 0.7340
  eval_runtime: 9.0279
  eval_samples_per_second: 138.2380
  eval_steps_per_second: 8.6400
  epoch: 3.0000
✅ small_sp_unigram_hf_POS training completed
📌 Logging results to CSV

🔧 Running: medium_hf_bpe_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalmedium/hf_bpe_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.8776,1.355067,0.586857,0.201274,0.558501
2,1.1951,1.079542,0.668813,0.256299,0.644675
3,0.9652,1.007952,0.69316,0.272376,0.673517



📊 Evaluation Results:
  eval_loss: 1.3551
  eval_accuracy: 0.5869
  eval_f1_macro: 0.2013
  eval_f1_weighted: 0.5585
  eval_runtime: 8.8558
  eval_samples_per_second: 140.9240
  eval_steps_per_second: 8.8080
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 1.0795
  eval_accuracy: 0.6688
  eval_f1_macro: 0.2563
  eval_f1_weighted: 0.6447
  eval_runtime: 8.8317
  eval_samples_per_second: 141.3100
  eval_steps_per_second: 8.8320
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 1.0080
  eval_accuracy: 0.6932
  eval_f1_macro: 0.2724
  eval_f1_weighted: 0.6735
  eval_runtime: 8.9631
  eval_samples_per_second: 139.2380
  eval_steps_per_second: 8.7020
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.8039
  eval_accuracy: 0.7526
  eval_f1_macro: 0.2686
  eval_f1_weighted: 0.7366
  eval_runtime: 35.3578
  eval_samples_per_second: 141.1000
  eval_steps_per_second: 8.8240
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 1.0080
  eval_accuracy: 0.6932
  eval_f1_macro: 0.2724
  eval_f1_weighted: 0.6735
  eval_runtime: 8.8234
  eval_samples_per_second: 141.4420
  eval_steps_per_second: 8.8400
  epoch: 3.0000
✅ medium_hf_bpe_hf_POS training completed
📌 Logging results to CSV

🔧 Running: medium_hf_wordpiece_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalmedium/hf_wordpiece_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.8361,1.266284,0.617625,0.209012,0.5845
2,1.0842,0.973385,0.706892,0.27852,0.691141
3,0.8423,0.893388,0.732484,0.291479,0.717426



📊 Evaluation Results:
  eval_loss: 1.2663
  eval_accuracy: 0.6176
  eval_f1_macro: 0.2090
  eval_f1_weighted: 0.5845
  eval_runtime: 8.8677
  eval_samples_per_second: 140.7350
  eval_steps_per_second: 8.7960
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 0.9734
  eval_accuracy: 0.7069
  eval_f1_macro: 0.2785
  eval_f1_weighted: 0.6911
  eval_runtime: 8.9436
  eval_samples_per_second: 139.5410
  eval_steps_per_second: 8.7210
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.8934
  eval_accuracy: 0.7325
  eval_f1_macro: 0.2915
  eval_f1_weighted: 0.7174
  eval_runtime: 8.7906
  eval_samples_per_second: 141.9690
  eval_steps_per_second: 8.8730
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.6659
  eval_accuracy: 0.7964
  eval_f1_macro: 0.2877
  eval_f1_weighted: 0.7839
  eval_runtime: 35.3469
  eval_samples_per_second: 141.1440
  eval_steps_per_second: 8.8270
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.8934
  eval_accuracy: 0.7325
  eval_f1_macro: 0.2915
  eval_f1_weighted: 0.7174
  eval_runtime: 8.9692
  eval_samples_per_second: 139.1430
  eval_steps_per_second: 8.6960
  epoch: 3.0000
✅ medium_hf_wordpiece_hf_POS training completed
📌 Logging results to CSV

🔧 Running: medium_sp_unigram_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finalmedium/sp_unigram_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.9171,1.345953,0.595759,0.193717,0.553467
2,1.1744,1.0155,0.692879,0.295584,0.672473
3,0.9093,0.927165,0.721081,0.321789,0.706057



📊 Evaluation Results:
  eval_loss: 1.3460
  eval_accuracy: 0.5958
  eval_f1_macro: 0.1937
  eval_f1_weighted: 0.5535
  eval_runtime: 8.8679
  eval_samples_per_second: 140.7320
  eval_steps_per_second: 8.7960
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 1.0155
  eval_accuracy: 0.6929
  eval_f1_macro: 0.2956
  eval_f1_weighted: 0.6725
  eval_runtime: 8.9353
  eval_samples_per_second: 139.6700
  eval_steps_per_second: 8.7290
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.9272
  eval_accuracy: 0.7211
  eval_f1_macro: 0.3218
  eval_f1_weighted: 0.7061
  eval_runtime: 8.9698
  eval_samples_per_second: 139.1330
  eval_steps_per_second: 8.6960
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.7375
  eval_accuracy: 0.7752
  eval_f1_macro: 0.3141
  eval_f1_weighted: 0.7625
  eval_runtime: 35.7799
  eval_samples_per_second: 139.4360
  eval_steps_per_second: 8.7200
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.9272
  eval_accuracy: 0.7211
  eval_f1_macro: 0.3218
  eval_f1_weighted: 0.7061
  eval_runtime: 9.0352
  eval_samples_per_second: 138.1260
  eval_steps_per_second: 8.6330
  epoch: 3.0000
✅ medium_sp_unigram_hf_POS training completed
📌 Logging results to CSV

🔧 Running: large_hf_bpe_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finallarge/hf_bpe_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.8638,1.31791,0.600021,0.214637,0.567274
2,1.1631,1.060182,0.675103,0.25796,0.648381
3,0.9385,0.989401,0.701466,0.277514,0.681194



📊 Evaluation Results:
  eval_loss: 1.3179
  eval_accuracy: 0.6000
  eval_f1_macro: 0.2146
  eval_f1_weighted: 0.5673
  eval_runtime: 8.8814
  eval_samples_per_second: 140.5180
  eval_steps_per_second: 8.7820
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 1.0602
  eval_accuracy: 0.6751
  eval_f1_macro: 0.2580
  eval_f1_weighted: 0.6484
  eval_runtime: 8.8460
  eval_samples_per_second: 141.0800
  eval_steps_per_second: 8.8180
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.9894
  eval_accuracy: 0.7015
  eval_f1_macro: 0.2775
  eval_f1_weighted: 0.6812
  eval_runtime: 8.8327
  eval_samples_per_second: 141.2920
  eval_steps_per_second: 8.8310
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.7718
  eval_accuracy: 0.7643
  eval_f1_macro: 0.2719
  eval_f1_weighted: 0.7484
  eval_runtime: 35.0883
  eval_samples_per_second: 142.1840
  eval_steps_per_second: 8.8920
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.9894
  eval_accuracy: 0.7015
  eval_f1_macro: 0.2775
  eval_f1_weighted: 0.6812
  eval_runtime: 8.8094
  eval_samples_per_second: 141.6670
  eval_steps_per_second: 8.8540
  epoch: 3.0000
✅ large_hf_bpe_hf_POS training completed
📌 Logging results to CSV

🔧 Running: large_hf_wordpiece_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finallarge/hf_wordpiece_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.8463,1.303553,0.605035,0.210884,0.569007
2,1.1256,0.986396,0.703427,0.271906,0.683168
3,0.8542,0.890353,0.733462,0.288317,0.718666



📊 Evaluation Results:
  eval_loss: 1.3036
  eval_accuracy: 0.6050
  eval_f1_macro: 0.2109
  eval_f1_weighted: 0.5690
  eval_runtime: 8.8610
  eval_samples_per_second: 140.8420
  eval_steps_per_second: 8.8030
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 0.9864
  eval_accuracy: 0.7034
  eval_f1_macro: 0.2719
  eval_f1_weighted: 0.6832
  eval_runtime: 8.7458
  eval_samples_per_second: 142.6970
  eval_steps_per_second: 8.9190
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.8904
  eval_accuracy: 0.7335
  eval_f1_macro: 0.2883
  eval_f1_weighted: 0.7187
  eval_runtime: 8.7755
  eval_samples_per_second: 142.2140
  eval_steps_per_second: 8.8880
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.6639
  eval_accuracy: 0.8001
  eval_f1_macro: 0.2848
  eval_f1_weighted: 0.7880
  eval_runtime: 35.3254
  eval_samples_per_second: 141.2300
  eval_steps_per_second: 8.8320
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.8904
  eval_accuracy: 0.7335
  eval_f1_macro: 0.2883
  eval_f1_weighted: 0.7187
  eval_runtime: 8.8154
  eval_samples_per_second: 141.5700
  eval_steps_per_second: 8.8480
  epoch: 3.0000
✅ large_hf_wordpiece_hf_POS training completed
📌 Logging results to CSV

🔧 Running: large_sp_unigram_hf_POS
🔍 Loading tokenizer from: /content/drive/MyDrive/Tokenizer_New/vocab_final/vocab_finallarge/sp_unigram_hf


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.9212,1.372876,0.585726,0.192165,0.545082
2,1.1877,1.029719,0.686197,0.276444,0.661232
3,0.9326,0.94998,0.7107,0.314127,0.694399



📊 Evaluation Results:
  eval_loss: 1.3729
  eval_accuracy: 0.5857
  eval_f1_macro: 0.1922
  eval_f1_weighted: 0.5451
  eval_runtime: 8.8756
  eval_samples_per_second: 140.6100
  eval_steps_per_second: 8.7880
  epoch: 1.0000

📊 Evaluation Results:
  eval_loss: 1.0297
  eval_accuracy: 0.6862
  eval_f1_macro: 0.2764
  eval_f1_weighted: 0.6612
  eval_runtime: 8.9084
  eval_samples_per_second: 140.0920
  eval_steps_per_second: 8.7560
  epoch: 2.0000

📊 Evaluation Results:
  eval_loss: 0.9500
  eval_accuracy: 0.7107
  eval_f1_macro: 0.3141
  eval_f1_weighted: 0.6944
  eval_runtime: 8.8935
  eval_samples_per_second: 140.3260
  eval_steps_per_second: 8.7700
  epoch: 3.0000



📊 Evaluation Results:
  eval_loss: 0.7654
  eval_accuracy: 0.7662
  eval_f1_macro: 0.3085
  eval_f1_weighted: 0.7528
  eval_runtime: 35.6033
  eval_samples_per_second: 140.1270
  eval_steps_per_second: 8.7630
  epoch: 3.0000

📊 Evaluation Results:
  eval_loss: 0.9500
  eval_accuracy: 0.7107
  eval_f1_macro: 0.3141
  eval_f1_weighted: 0.6944
  eval_runtime: 9.0682
  eval_samples_per_second: 137.6240
  eval_steps_per_second: 8.6020
  epoch: 3.0000
✅ large_sp_unigram_hf_POS training completed
📌 Logging results to CSV
