# Model Training

This notebook covers the process of training the tokenizer and the model.

In [1]:
!pip install evaluate
!pip install ninja
!pip install flash-attn --no-build-isolation
!pip install flashtext

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Collecting ninja
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.13.0
Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m109.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Buildin

# Pre-process Corpus

In [None]:
import unicodedata
def is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character based on BERT rules."""
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or
            (cp >= 0x3400 and cp <= 0x4DBF) or
            (cp >= 0x20000 and cp <= 0x2A6DF) or
            (cp >= 0x2A700 and cp <= 0x2B73F) or
            (cp >= 0x2B740 and cp <= 0x2B81F) or
            (cp >= 0x2B820 and cp <= 0x2CEAF) or
            (cp >= 0xF900 and cp <= 0xFAFF) or
            (cp >= 0x2F800 and cp <= 0x2FA1F)):
        return True
    return False

def clean_and_save(input_path, output_path):
    print("Cleaning corpus with CJK handling...")
    with open(input_path, 'r', encoding='utf-8') as f_in, \
         open(output_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            # 1. CJK Spacing: Wrap Chinese chars in spaces
            chars = []
            for char in line:
                cp = ord(char)
                if is_chinese_char(cp):
                    chars.append(f" {char} ")
                else:
                    chars.append(char)
            line = "".join(chars)

            # 2. Normalize Unicode (NFD)
            line = unicodedata.normalize('NFD', line)

            # 3. Strip Accents
            line = "".join([c for c in line if not unicodedata.combining(c)])

            # 4. Lowercase and clean up resulting double-spaces
            # Using .split() and .join() keeps exactly one space between words
            final_line = " ".join(line.lower().split())
            f_out.write(final_line + "\n")

    print("Clean corpus saved! Ready for training.")

clean_and_save("./corpus/wiki_corpus.txt", "./corpus/wiki_corpus_clean.txt")

### Mount for Google Collab

In [2]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')
source_path = '/content/drive/MyDrive/Colab Notebooks/TokenFilter'

if os.path.exists(source_path):
    # This command copies the *contents* of TokenFilter to the current folder (.)
    !cp -r "{source_path}"/* .
    print("Success! All files copied.")
else:
    print(f"Error: Could not find path {source_path}. Check if the folder name is correct.")

# 5. Create any output directories your code expects if they weren't in the copy
os.makedirs("models", exist_ok=True)
os.makedirs("bert_tokenizer_uncased", exist_ok=True)
os.makedirs("filtered_bert_tokenizer", exist_ok=True)

print("Copy complete! Directory structure:")
!ls -R ./models

Mounted at /content/drive
Success! All files copied.
Copy complete! Directory structure:
./models:


## Generate Word-Piece Tokenizer From Corpus

In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from transformers import PreTrainedTokenizerFast

print("Initializing tokenizer...")
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Configure trainer
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=special_tokens

)

# Train tokenizer
print("Training tokenizer...")

# Pass the file
tokenizer.train(["./corpus/wiki_corpus_clean.txt"], trainer)

tokenizer.normalizer = normalizers.BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True
)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

print("Tokenizer vocab size:", tokenizer.vocab_size)
# Save for reuse like AutoTokenizer

tokenizer.save_pretrained("./bert_tokenizer_uncased")


## Load Saved Tokenizer

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("./bert_tokenizer_uncased")

## Generate Word-Piece Tokenizer From JSON

In [7]:
current_model = "wordnet_v2"

In [8]:
filtered_tokenizer_path = current_model+"/filtered_tokenizer_vocab_"+current_model+".json"

In [10]:
import json
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast

# Load filtered vocab

with open(filtered_tokenizer_path, "r", encoding="utf-8") as f:
    token_list = json.load(f)

# Convert list to dict {token: id}
vocab_dict = {token: idx for idx, token in enumerate(token_list)}

# Initialize WordPiece tokenizer with your vocab
tokenizer = Tokenizer(WordPiece(vocab=vocab_dict, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Wrap in PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenizer.normalizer = normalizers.BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True
)

print("Tokenizer vocab size:", tokenizer.vocab_size)

tokenizer.save_pretrained("./"+current_model+"/filtered_bert_tokenizer")


Tokenizer vocab size: 26090


('./wordnet_v2/filtered_bert_tokenizer/tokenizer_config.json',
 './wordnet_v2/filtered_bert_tokenizer/tokenizer.json')

### Testing the tokenizer

In [11]:
print(tokenizer.encode("vertical"))

[11604]


# Define The Model

In [None]:

from transformers import BertConfig, BertForMaskedLM
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import torch

# new Model config
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=256,
    num_hidden_layers=8,
    num_attention_heads=8,
    hidden_size=512,
    intermediate_size=2048,
    attn_implementation="flash_attention_2"

)

# Initialize with random weights

model = BertForMaskedLM(config).to(torch.bfloat16).to("cuda")

# Alternative if using from_config (cleanest way to avoid warnings)
# model = BertForMaskedLM.from_config(config, torch_dtype=torch.bfloat16).to("cuda")

print(f"Model dtype: {model.dtype}")
print(f"Flash Attention active: {model.config._attn_implementation}")

You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


Model dtype: torch.bfloat16
Flash Attention active: flash_attention_2


## Smaller Model

In [12]:

from transformers import BertConfig, BertForMaskedLM
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import torch

# new Model config
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=256,
    num_hidden_layers=8,
    num_attention_heads=8,
    hidden_size=256,
    intermediate_size=1024,
    attn_implementation="flash_attention_2"

)

# Initialize with random weights

model = BertForMaskedLM(config).to(torch.bfloat16).to("cuda")

# Alternative if using from_config (cleanest way to avoid warnings)
# model = BertForMaskedLM.from_config(config, torch_dtype=torch.bfloat16).to("cuda")

print(f"Model dtype: {model.dtype}")
print(f"Flash Attention active: {model.config._attn_implementation}")

You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


Model dtype: torch.bfloat16
Flash Attention active: flash_attention_2


# Define Dataset

After filtering tokens, we will define a pattern to map the corpus to replacement words

In [13]:
# You may need to pip install flashtext
from flashtext import KeywordProcessor
import re
with open(current_model+"/removed_words_mapping_"+current_model+".json", "r", encoding="utf-8") as f:
    word_map = json.load(f)

# Initialize the processor
keyword_processor = KeywordProcessor(case_sensitive=True)

# Add your mapping
# keyword_processor.add_keyword('word_to_find', 'replacement_word')
for bad_word, good_word in word_map.items():
    keyword_processor.add_keyword(bad_word, good_word)

[keyword_processor.replace_keywords(t) for t in ["contender" , "advisers", "fellow"]]

['competitions', 'consultant', 'fellows']

In [None]:
keyword_processor = KeywordProcessor(case_sensitive=True)
[keyword_processor.replace_keywords(t) for t in ["hello" , "there", "growing"]]

NameError: name 'KeywordProcessor' is not defined

In [14]:

# Dataset
dataset = load_dataset(
    "text",
    data_files={"train": "./corpus/wiki_corpus_clean.txt"},
)

def tokenize(examples):
    processed_text = [keyword_processor.replace_keywords(t) for t in examples["text"]]

    return tokenizer(
        processed_text,#processed_text  or #examples[text]
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
    num_proc=16,
    load_from_cache_file=False
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=16):   0%|          | 0/1165029 [00:00<?, ? examples/s]

## Model Training

In [None]:
model = BertForMaskedLM(config) #Reset

In [None]:
from transformers import Trainer
from transformers import TrainingArguments

training_args = TrainingArguments(

    output_dir="./models/"+current_model+ "_Smaller",
    per_device_train_batch_size=512,
    gradient_accumulation_steps=1,
    learning_rate=5e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    warmup_steps=500,

    bf16=True,
    tf32=True,#false for tpu, true for H100
    torch_compile=True,#false for tpu, true for H100
    optim="adamw_torch_fused", #adamw_torch_fused for H100
    dataloader_num_workers=8,
    dataloader_pin_memory=True, #false for tpu, true for H100

    # --- Logging & Saving ---
    logging_steps=100,
    save_steps=10000,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,

)
print("Model Compiled")
train_results = trainer.train()

# Explicitly print the results object
print("\n--- Training Summary ---")
print(train_results)


Model Compiled


W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0] Graph break from `Tensor.item()`, consider setting:
W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0]     torch._dynamo.config.capture_scalar_outputs = True
W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0] or:
W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0]     env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0] to include these operations in the captured graph.
W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0] 
W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0] Graph break: from user code at:
W0210 16:43:17.931000 561 torch/_dynamo/variables/tensor.py:1048] [22/0]   File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_flash_attention_utils.py", line 266, in torch_dynamo_resume_in__get_unpad_data_at_263
W0210 16:43:17.93100

Step,Training Loss
100,9.625328


# Optional: Test Mask Model

In [None]:
from transformers import AutoModelForMaskedLM, pipeline
model = AutoModelForMaskedLM.from_pretrained("./models/"+current_model+"_Smaller/checkpoint-6828/")
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
)
fill_mask("hello how are [MASK] ?")

Loading weights:   0%|          | 0/138 [00:00<?, ?it/s]

[{'score': 0.068359375,
  'token': 6004,
  'token_str': 'you',
  'sequence': 'hello how are you ?'},
 {'score': 0.03759765625,
  'token': 5528,
  'token_str': 'me',
  'sequence': 'hello how are me ?'},
 {'score': 0.0201416015625,
  'token': 6563,
  'token_str': 'love',
  'sequence': 'hello how are love ?'},
 {'score': 0.0201416015625,
  'token': 35,
  'token_str': '?',
  'sequence': 'hello how are ? ?'},
 {'score': 0.01220703125,
  'token': 5872,
  'token_str': 'like',
  'sequence': 'hello how are like ?'}]

# Fine Tuning

In [None]:
from transformers import (AutoTokenizer)

model_to_fine_tune_path = "./models/"+current_model+"_Smaller/checkpoint-6828"
fine_tuned_model_path = "./models/"+current_model+"_Classification_Smaller"
tokenizer_path = current_model + "/filtered_bert_tokenizer" # current_model + "/filtered_
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [None]:

from flashtext import KeywordProcessor
import re
with open(current_model+"/removed_words_mapping_"+current_model+".json", "r", encoding="utf-8") as f:
    word_map = json.load(f)

In [None]:
print(current_model)

wordnet_v2


In [None]:
print(len(tokenizer.get_vocab()))

26090


In [None]:


# Initialize the processor
keyword_processor = KeywordProcessor(case_sensitive=True)

# Add your mapping
# keyword_processor.add_keyword('word_to_find', 'replacement_word')
for bad_word, good_word in word_map.items():
    keyword_processor.add_keyword(bad_word, good_word)

[keyword_processor.replace_keywords(t) for t in ["hello" , "theres", "know", "file"]]

['hello', 'there', 'knows', 'file']

## Dataset

In [None]:
from datasets import load_dataset
from transformers import (DataCollatorWithPadding)
# Load IMDB data
dataset = load_dataset('csv', data_files={
    'train': './corpus/imdb_train.csv',
})

def tokenize_function(examples):
    processed_text = [keyword_processor.replace_keywords(t) for t in examples["text"]]
    return tokenizer(

        processed_text,#processed_text
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## Fine tune Training

In [None]:
import torch

# Enforce the new API for TF32
torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 on Ampere+ GPUs
torch.backends.cudnn.allow_tf32 = True

  self.setter(val)


In [None]:
from transformers import (
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

model = BertForSequenceClassification.from_pretrained(
    model_to_fine_tune_path,  #Model to be fine-tuned
    num_labels=2  # e.g., Positive and Negative
)

training_args = TrainingArguments(
    output_dir="./temp_dir",      # A path is still required by the API

    learning_rate=2e-4,               # Small LR to preserve pre-trained knowledge
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    weight_decay=0.01,
    bf16=True,
    tf32=True,
    torch_compile=True,
    logging_steps=100,
    save_steps=10000,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

print("Starting Fine-Tuning for Classification...")
# Capture the output of the train() method
train_results = trainer.train()

print("\n--- Training Summary ---")
print(train_results)


# Save the final classification model
model.save_pretrained(fine_tuned_model_path)

Loading weights:   0%|          | 0/133 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: ./models/wordnet_v2_Smaller/checkpoint-6828
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
bert.pooler.dense.weight                   | MISSING    | 
bert.pooler.dense.bias                     | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Starting Fine-Tuning for Classification...


Online softmax is disabled on the fly since Inductor decides to
split the reduction. Cut an issue to PyTorch if this is an
important use case and you want to speed it up with online
softmax.



Step,Training Loss
100,0.533175
200,0.397918
300,0.34728
400,0.338604
500,0.282507
600,0.278958
700,0.227597
800,0.227763
900,0.184267
1000,0.171424


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


--- Training Summary ---
TrainOutput(global_step=1960, training_loss=0.2069120122461903, metrics={'train_runtime': 111.98, 'train_samples_per_second': 2232.541, 'train_steps_per_second': 17.503, 'total_flos': 2451800832000000.0, 'train_loss': 0.2069120122461903, 'epoch': 10.0})


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    BertForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

model_path = fine_tuned_model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = BertForSequenceClassification.from_pretrained(model_path)

dataset = load_dataset('csv', data_files={'test': './corpus/imdb_test.csv'})


def tokenize_function(examples):
    #processed_text = [keyword_processor.replace_keywords(t) for t in examples["text"]]
    return tokenizer(
        examples["text"],#processed_text
        truncation=True,
        padding="max_length",
        max_length=256
    )


tokenized_test = dataset['test'].map(tokenize_function, batched=True)
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

eval_args = TrainingArguments(
    per_device_eval_batch_size=128,
    bf16=True,
    tf32=True,
    torch_compile=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=eval_args,
    train_dataset=None,
    eval_dataset=tokenized_test,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Running evaluation on test data...")
results = trainer.evaluate()

print("\n--- Evaluation Metrics ---")
print(f"Accuracy:  {results['eval_accuracy']:.4f}")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"F1 Score:  {results['eval_f1']:.4f}")
print(f"Recall:    {results['eval_recall']:.4f}")

Loading weights:   0%|          | 0/137 [00:00<?, ?it/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Running evaluation on test data...


Online softmax is disabled on the fly since Inductor decides to
split the reduction. Cut an issue to PyTorch if this is an
important use case and you want to speed it up with online
softmax.




--- Evaluation Metrics ---
Accuracy:  0.8221
Precision: 0.8358
F1 Score:  0.8184
Recall:    0.8016


## Saving Data to Drive

In [None]:
import os

drive_path = '/content/drive/MyDrive/Colab Notebooks/TokenFilter/models_save'

# Create the destination directory in Google Drive if it doesn't exist
os.makedirs(drive_path, exist_ok=True)

# Copy the BaseModel_uncased_H100 directory
print(f"Copying ./models/BaseModel_uncased_H100 to {drive_path}...")
!cp -r "./Models_To_Save/" "{drive_path}"

print("Models successfully copied to Google Drive!")

Copying ./models/BaseModel_uncased_H100 to /content/drive/MyDrive/Colab Notebooks/TokenFilter/models_save...
Models successfully copied to Google Drive!
