In [1]:
import wandb
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from dotenv import load_dotenv
import os

In [2]:
def info(message):
  """Print `message` as an "[INFO]" banner framed by 30 '=' characters on each side."""
  rule = "=" * 30
  print(f"{rule}[INFO] {message}{rule}")

In [3]:
# Load settings from a .env file (if present) into the process environment.
load_dotenv()

# Both settings are mandatory: indexing os.environ raises KeyError when one is missing.
HF_READ_KEY = os.environ["HF_READ_KEY"]
DATASET_NAME = os.environ["DATASET_NAME"]

info("load dataset")
dataset = load_dataset(DATASET_NAME, token=HF_READ_KEY)
info("dataset loaded!")



In [4]:
# prepare configs
class Config:
  """Hyper-parameters and paths shared by the tokenizer-training and MLM fine-tuning cells.

  Every value is a plain instance attribute so later cells can read it as
  `configs.<name>`.
  """

  def __init__(self):
    # --- model / tokenizer ---
    self.model_name = "answerdotai/ModernBERT-base"
    self.max_length = 512  # maximum sequence length passed to the tokenizer
    # Deprecated misspelled alias, kept so code that still reads `max_lenght` keeps working.
    self.max_lenght = self.max_length
    self.new_vocab_size = 64000  # target vocabulary size for the new tokenizer
    self.mlm_probability = 0.15  # Masked Language Proba (15% of input will be masked)

    # --- output locations ---
    self.base_dir = "./DarijaModern"
    self.output_dir = self.base_dir + "/model"
    self.overwrite_output_dir = True

    # --- training schedule ---
    self.num_train_epochs = 3
    self.per_device_train_batch_size = 32
    self.per_device_eval_batch_size = 8
    self.evaluation_strategy = "steps"
    self.eval_steps = 5000
    self.logging_steps = 100
    self.save_steps = 5000
    self.save_total_limit = 2

    # --- optimisation ---
    # Was 5e-2, which is orders of magnitude above the usual range for AdamW
    # training of a transformer and makes the loss diverge; 5e-5 is a
    # conventional starting point. NOTE(review): tune for this corpus.
    self.learning_rate = 5e-5
    self.warmup_steps = 500
    self.weight_decay = 0.01

    # --- experiment tracking ---
    self.report_to = "wandb"
    self.run_name = "modernbert-darija"

In [5]:
configs=Config()

In [6]:
# Load the base checkpoint's tokenizer; it is the template whose pipeline is
# reused when the Darija tokenizer is trained in a later cell.
base_tokenizer = AutoTokenizer.from_pretrained(
    configs.model_name,  # single source of truth for the checkpoint name
    use_fast=True  # Fast tokenizers are implemented in Rust and are significantly faster than the regular Python-based tokenizers.
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# dataset to iterator
def batch_iter(ds, batch_size=1000):
  """Yield the "text" field of `ds` in consecutive slices of `batch_size` rows.

  `ds` must support `len()` and slice indexing, and each slice must be
  indexable by the key "text". The final slice may be shorter than
  `batch_size`.
  """
  total_rows = len(ds)
  for start in range(0, total_rows, batch_size):
    stop = start + batch_size
    chunk = ds[start:stop]
    yield chunk["text"]

In [None]:
info("train new Darija tokenizer")
# The tokenizer is trained on at most 16,000 rows of the train split. The
# count is clamped to the split size so select() does not fail on a smaller
# dataset.
tokenizer_train_rows = min(16_000, len(dataset["train"]))
train_iterator = batch_iter(dataset["train"].select(range(tokenizer_train_rows)))
new_tokenizer = base_tokenizer.train_new_from_iterator(
    text_iterator=train_iterator,
    vocab_size=configs.new_vocab_size,
    show_progress=True
)
info("Save new tokenizer...")
new_tokenizer.save_pretrained(f"{configs.base_dir}/tokenizer")
info("new tokenizer saved...")



In [None]:
info("load new Darija tokenizer")
# Reload the tokenizer from the directory it was saved to, so this cell can be
# run without retraining the tokenizer first.
new_tokenizer = AutoTokenizer.from_pretrained(
    f"{configs.base_dir}/tokenizer",
    use_fast=True,
)

In [None]:
def process(examples):
  """Tokenize one batch of rows for masked-language-model training.

  `examples` is the batch mapping handed over by `Dataset.map(..., batched=True)`;
  its "text" column (the same column `batch_iter` reads) is tokenized with
  `new_tokenizer` and truncated to the configured maximum length.

  Returns whatever `new_tokenizer(...)` returns for the batch.
  """
  # Config originally spelled this attribute `max_lenght`; accept either
  # spelling so this function works with both versions of the class.
  max_length = getattr(configs, "max_length", None)
  if max_length is None:
    max_length = configs.max_lenght
  return new_tokenizer(
      examples["text"],  # was examples["train"]: "train" is a split name, not a column
      truncation=True,   # was misspelled `turncation`
      max_length=max_length)

In [None]:
info("tokenize train/test dataset...")

# Tokenize each split in batches. All original columns are dropped so only the
# fields returned by `process` remain.
train_split = dataset["train"]
train_dataset = train_split.map(
    process,
    batched=True,
    remove_columns=train_split.column_names,
)

test_split = dataset["test"]
test_dataset = test_split.map(
    process,
    batched=True,
    remove_columns=test_split.column_names,
)

info("Done!")

In [None]:
info("init data collator...")
# The collator applies masked-language-model masking with the configured
# probability, using the new tokenizer.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=new_tokenizer,
    mlm_probability=configs.mlm_probability,
    mlm=True,
)

In [None]:
info("load model...")
# `from_pretrained` is the loader classmethod; the original call to
# `AutoModelForMaskedLM.pretrained` raised AttributeError because no such
# method exists.
model = AutoModelForMaskedLM.from_pretrained(
    configs.model_name
)
info("Done!")

In [None]:
info("resize embedding matrix...")
# Size the embedding matrix from the tokenizer actually in use. The original
# read `configs.new_tokenizer_voca_size`, an attribute Config never defines.
# len(new_tokenizer) also counts added/special tokens, so the matrix always
# matches the ids the tokenizer can emit.
# NOTE(review): the pretrained embedding rows are still indexed by the OLD
# vocabulary, so they do not correspond to the new token ids — confirm that
# starting from these weights is intended.
model.resize_token_embeddings(len(new_tokenizer))
info("Done!")

In [None]:
info("init training args...")
training_args = TrainingArguments(
    output_dir=configs.output_dir,
    overwrite_output_dir=configs.overwrite_output_dir,
    num_train_epochs=configs.num_train_epochs,
    per_device_train_batch_size=configs.per_device_train_batch_size,
    per_device_eval_batch_size=configs.per_device_eval_batch_size,
    # `evaluation_strategy` was renamed to `eval_strategy`, and the old keyword
    # no longer exists in the transformers releases that include ModernBERT,
    # so passing it raises TypeError there.
    eval_strategy=configs.evaluation_strategy,
    eval_steps=configs.eval_steps,
    logging_steps=configs.logging_steps,
    save_steps=configs.save_steps,
    save_total_limit=configs.save_total_limit,
    learning_rate=configs.learning_rate,
    warmup_steps=configs.warmup_steps,
    weight_decay=configs.weight_decay,
    report_to=configs.report_to,
    run_name=configs.run_name,
)
info("Done!")

In [None]:
info("init trainer...")
# Bring the model, training arguments, tokenized splits and MLM collator
# together in a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
info("Done!")

In [None]:
# Start training with the Trainer built in the previous cell. The call is left
# as the cell's final expression so its return value is shown as the output.
info("trainer...")
trainer.train()

In [None]:
info("save result model...")
# Store the model and its tokenizer in the same directory so the checkpoint
# can be reloaded from one path.
trainer.save_model(configs.output_dir)
new_tokenizer.save_pretrained(configs.output_dir)

info("push result model to hub...")
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not a repository id, so the original call used the repo name as the commit
# message and uploaded to the default repo derived from `output_dir`.
# Push through the model and tokenizer instead, whose push_to_hub takes the
# repository id first.
# NOTE(review): uploading needs a Hub token with write access; the
# HF_READ_KEY used to load the dataset is presumably read-only — confirm.
HUB_REPO_ID = "atlasia/modern-bert-darija"
trainer.model.push_to_hub(HUB_REPO_ID)
new_tokenizer.push_to_hub(HUB_REPO_ID)