# Pre-process Corpus

In [None]:
import unicodedata
def is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character based on BERT rules."""
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  
            (cp >= 0x3400 and cp <= 0x4DBF) or  
            (cp >= 0x20000 and cp <= 0x2A6DF) or  
            (cp >= 0x2A700 and cp <= 0x2B73F) or  
            (cp >= 0x2B740 and cp <= 0x2B81F) or  
            (cp >= 0x2B820 and cp <= 0x2CEAF) or
            (cp >= 0xF900 and cp <= 0xFAFF) or  
            (cp >= 0x2F800 and cp <= 0x2FA1F)):  
        return True
    return False

def clean_and_save(input_path, output_path):
    print("Cleaning corpus with CJK handling...")
    with open(input_path, 'r', encoding='utf-8') as f_in, \
         open(output_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            # 1. CJK Spacing: Wrap Chinese chars in spaces
            chars = []
            for char in line:
                cp = ord(char)
                if is_chinese_char(cp):
                    chars.append(f" {char} ")
                else:
                    chars.append(char)
            line = "".join(chars)

            # 2. Normalize Unicode (NFD)
            line = unicodedata.normalize('NFD', line)
            
            # 3. Strip Accents
            line = "".join([c for c in line if not unicodedata.combining(c)])
            
            # 4. Lowercase and clean up resulting double-spaces
            # Using .split() and .join() keeps exactly one space between words
            final_line = " ".join(line.lower().split())
            f_out.write(final_line + "\n")

    print("Clean corpus saved! Ready for training.")

clean_and_save("./corpus/wiki_corpus.txt", "./corpus/wiki_corpus_clean.txt")

### Mount for Google Collab

In [None]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')
source_path = '/content/drive/MyDrive/Colab Notebooks/TokenFilter'

if os.path.exists(source_path):
    # This command copies the *contents* of TokenFilter to the current folder (.)
    !cp -r "{source_path}"/* .
    print("Success! All files copied.")
else:
    print(f"Error: Could not find path {source_path}. Check if the folder name is correct.")

# 5. Create any output directories your code expects if they weren't in the copy
os.makedirs("models", exist_ok=True)
os.makedirs("bert_tokenizer_uncased", exist_ok=True)
os.makedirs("filtered_bert_tokenizer", exist_ok=True)

print("Copy complete! Directory structure:")
!ls -R ./models

## Generate Word-Piece Tokenizer From Corpus

In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from transformers import PreTrainedTokenizerFast

print("Initializing tokenizer...")
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Configure trainer
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=special_tokens

)

# Train tokenizer
print("Training tokenizer...")

# Pass the file
tokenizer.train(["./corpus/wiki_corpus_clean.txt"], trainer)

tokenizer.normalizer = normalizers.BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True
)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

print("Tokenizer vocab size:", tokenizer.vocab_size)
# Save for reuse like AutoTokenizer

tokenizer.save_pretrained("./bert_tokenizer_uncased")


  from .autonotebook import tqdm as notebook_tqdm


Initializing tokenizer...
Training tokenizer...



Tokenizer vocab size: 30000


('./bert_tokenizer_uncased/tokenizer_config.json',
 './bert_tokenizer_uncased/tokenizer.json')

## Load Saved Tokenizer

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("./filtered_bert_tokenizer")

## Generate Word-Piece Tokenizer From JSON

In [None]:
filtered_tokenizer_path = "filtered_tokenizer_vocab_06_05.json"

In [None]:
import json
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast

# Load filtered vocab

with open(filtered_tokenizer_path, "r", encoding="utf-8") as f:
    token_list = json.load(f)  

# Convert list to dict {token: id}
vocab_dict = {token: idx for idx, token in enumerate(token_list)}

# Initialize WordPiece tokenizer with your vocab
tokenizer = Tokenizer(WordPiece(vocab=vocab_dict, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Wrap in PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenizer.normalizer = normalizers.BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True
)

print("Tokenizer vocab size:", tokenizer.vocab_size)

tokenizer.save_pretrained("./filtered_bert_tokenizer")


Tokenizer vocab size: 29709


('./filtered_bert_tokenizer/tokenizer_config.json',
 './filtered_bert_tokenizer/tokenizer.json')

### Testing the tokenizer

In [4]:
print(tokenizer.encode("vertical"))

[12823]


# Define The Model

In [None]:

from transformers import BertConfig, BertForMaskedLM
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling

# new Model config
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=256, # Increased to 128 (64 is too short for real sentences)
    num_hidden_layers=8,        # 2 is too shallow; 4 allows abstraction
    num_attention_heads=8,      # More heads = better context understanding
    hidden_size=768,            # 64 -> 256 (Crucial for representational capacity)
    intermediate_size=3072,     # usually 4x hidden_size (Standard FFN ratio)
)

model = BertForMaskedLM(config)

# Define Dataset

After filtering tokens, we will define a pattern to map the corpus to replacement words

In [18]:
import re
with open("mapping_06_05.json", "r", encoding="utf-8") as f:
    word_map = json.load(f)
pattern = re.compile(r'\b(' + '|'.join(map(re.escape, word_map.keys())) + r')\b')

In [6]:
pattern = None #reset pattern

In [None]:
# Dataset
dataset = load_dataset(
    "text",
    data_files={"train": "./corpus/wiki_corpus_clean.txt"},
)
def replace_tokens(text):
    # Check if 'pattern' exists in the namespace and is not None
    if 'pattern' not in globals() or pattern is None:
        return text
    
    return pattern.sub(lambda x: word_map[x.group(0)], text)

def tokenize(batch):
    processed_text = [replace_tokens(t) for t in batch["text"]]
    return tokenizer(
        processed_text,
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
    num_proc=4,
    load_from_cache_file=False
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

Generating train split: 1165029 examples [00:05, 212692.39 examples/s]
Map (num_proc=4): 100%|██████████| 1165029/1165029 [00:54<00:00, 21502.11 examples/s]


## Model Training

In [None]:
model = BertForMaskedLM(config) #Reset

In [None]:
from transformers import Trainer
from transformers import TrainingArguments


training_args = TrainingArguments(
    output_dir="./H100_BERT_Run",
    per_device_train_batch_size=512,  
    gradient_accumulation_steps=1, 
    learning_rate=1e-3,                
    weight_decay=0.01,
    num_train_epochs=5,                
    warmup_steps=500,                  
    
    bf16=True,                         
    tf32=True,                         
    torch_compile=True,              
    optim="adamw_torch_fused",        
    dataloader_num_workers=8,          #Test this
    dataloader_pin_memory=True,      
    
    # --- Logging & Saving ---
    logging_steps=100,                
    save_steps=10000,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,
)
print("Model Compiled")
train_results = trainer.train()

# Explicitly print the results object
print("\n--- Training Summary ---")
print(train_results)


Model Compiled


Step,Training Loss


KeyboardInterrupt: 

# Optional: Test Mask Model

In [14]:
from transformers import AutoModelForMaskedLM, pipeline
model = AutoModelForMaskedLM.from_pretrained("./models/BaseModel/checkpoint-27306/")
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
)
fill_mask("The capital of france is the center of the [MASK] .")

Loading weights: 100%|██████████| 106/106 [00:00<00:00, 685.04it/s, Materializing param=cls.predictions.transform.dense.weight]               


[{'score': 0.14827008545398712,
  'token': 5776,
  'token_str': 'city',
  'sequence': 'the capital of france is the center of the city .'},
 {'score': 0.12068881094455719,
  'token': 7861,
  'token_str': 'capital',
  'sequence': 'the capital of france is the center of the capital .'},
 {'score': 0.057559866458177567,
  'token': 9659,
  'token_str': 'economy',
  'sequence': 'the capital of france is the center of the economy .'},
 {'score': 0.0510672926902771,
  'token': 7872,
  'token_str': 'empire',
  'sequence': 'the capital of france is the center of the empire .'},
 {'score': 0.045801009982824326,
  'token': 8873,
  'token_str': 'republic',
  'sequence': 'the capital of france is the center of the republic .'}]

# Fine Tuning

In [24]:
from transformers import (AutoTokenizer)

model_to_fine_tune_path = "./models/06_05/checkpoint-27306"
fine_tuned_model_path = "./models/06_05_Classification"
tokenizer_path = "./filtered_bert_tokenizer_06_05"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [25]:
import re
with open("mapping_06_05.json", "r", encoding="utf-8") as f:
    word_map = json.load(f)
pattern = re.compile(r'\b(' + '|'.join(map(re.escape, word_map.keys())) + r')\b')

In [None]:
pattern = None #No pattern

## Dataset

In [None]:
from datasets import load_dataset
from transformers import (
        DataCollatorWithPadding
)

# Load IMDB data
dataset = load_dataset('csv', data_files={
    'train': './corpus/imdb_train.csv',
})
def replace_tokens(text):
    # Check if 'pattern' exists in the namespace and is not None
    if 'pattern' not in globals() or pattern is None:
        return text
    
    # If it exists, perform the replacement
    return pattern.sub(lambda x: word_map[x.group(0)], text)

def tokenize_function(examples):
    processed_text = [replace_tokens(t) for t in examples["text"]]
    return tokenizer(
        processed_text, 
        truncation=True, 
        padding="max_length", 
        max_length=256
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map: 100%|██████████| 25000/25000 [00:16<00:00, 1523.28 examples/s]


## Fine tune Training

In [28]:
from transformers import (
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments
)


model = BertForSequenceClassification.from_pretrained(
    model_to_fine_tune_path,  #Model to be fine-tuned
    num_labels=2  # e.g., Positive and Negative
)

training_args = TrainingArguments(
    output_dir="./temp_dir",      # A path is still required by the API
    
    learning_rate=2e-5,               # Small LR to preserve pre-trained knowledge
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    bf16=True,                       
    torch_compile=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

print("Starting Fine-Tuning for Classification...")
# Capture the output of the train() method
train_results = trainer.train()

print("\n--- Training Summary ---")
print(train_results)


# Save the final classification model
model.save_pretrained(fine_tuned_model_path)

Loading weights: 100%|██████████| 101/101 [00:00<00:00, 631.38it/s, Materializing param=bert.encoder.layer.5.output.dense.weight]             
[1mBertForSequenceClassification LOAD REPORT[0m from: ./models/06_05/checkpoint-27306
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
bert.pooler.dense.bias                     | MISSING    | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 
bert.pooler.dense.weight                   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identic

Starting Fine-Tuning for Classification...


Step,Training Loss
500,0.577089
1000,0.449693
1500,0.394305
2000,0.34917
2500,0.320376
3000,0.289575
3500,0.257898
4000,0.234887
4500,0.198244
5000,0.182617


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.42it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.30it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.76it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.92it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.87it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.86it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  5.27it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.96it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.44it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.65it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.18it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.93it/s]
Writing model shards: 100%|██████████| 1/1 [00:0


--- Training Summary ---
TrainOutput(global_step=7820, training_loss=0.25697133730134697, metrics={'train_runtime': 383.6352, 'train_samples_per_second': 651.661, 'train_steps_per_second': 20.384, 'total_flos': 3682369920000000.0, 'train_loss': 0.25697133730134697, 'epoch': 10.0})


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.57it/s]


In [30]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    BertForSequenceClassification, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)

model_path = fine_tuned_model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = BertForSequenceClassification.from_pretrained(model_path)

dataset = load_dataset('csv', data_files={'test': './corpus/imdb_test.csv'})

def replace_tokens(text):
    # Check if 'pattern' exists in the namespace and is not None
    if 'pattern' not in globals() or pattern is None:
        return text
    
    # If it exists, perform the replacement
    return pattern.sub(lambda x: word_map[x.group(0)], text)

def tokenize_function(examples):
    processed_text = [replace_tokens(t) for t in examples["text"]]
    return tokenizer(
        processed_text,
        truncation=True, 
        padding="max_length", 
        max_length=128
    )


tokenized_test = dataset['test'].map(tokenize_function, batched=True)
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

eval_args = TrainingArguments(
    per_device_eval_batch_size=32,
    bf16=True,            
    torch_compile=True,    
    report_to="none"      
)

trainer = Trainer(
    model=model,
    args=eval_args,
    train_dataset=None,
    eval_dataset=tokenized_test,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Running evaluation on test data...")
results = trainer.evaluate()

print("\n--- Evaluation Metrics ---")
print(f"Accuracy:  {results['eval_accuracy']:.4f}")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"F1 Score:  {results['eval_f1']:.4f}")
print(f"Recall:    {results['eval_recall']:.4f}")

Loading weights: 100%|██████████| 105/105 [00:00<00:00, 612.49it/s, Materializing param=classifier.weight]                                    
Map: 100%|██████████| 25000/25000 [00:12<00:00, 2003.54 examples/s]


Running evaluation on test data...



--- Evaluation Metrics ---
Accuracy:  0.8283
Precision: 0.8055
F1 Score:  0.8344
Recall:    0.8655
