In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split


# Seed handed to the train/eval split so the partition is reproducible.
RANDOM_SEED = 69
# Maximum number of tokens kept per example; the tokenizer truncates longer texts.
TOKEN_LIMIT = 400

In [None]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Colab-only step: mount Google Drive inside the runtime at /content/drive.
# NOTE(review): the visible cells read and write relative paths, not paths
# under /content/drive — confirm whether this mount is actually required.
from google.colab import drive

drive.mount('/content/drive')

In [None]:
# Read the raw pretraining corpus into a DataFrame.
data = pd.read_csv('pretraining_dataset.csv')

# Fail fast on a malformed file: 'text' is what gets tokenized and 'source' is
# the stratification key for the train/eval split. Without this check a missing
# column is only reported after the (slow) tokenization pass has already run.
missing_columns = {'text', 'source'} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"pretraining_dataset.csv is missing required column(s): {sorted(missing_columns)}"
    )

# Wrap the frame in a Hugging Face Dataset so it can be tokenized with .map().
dataset = Dataset.from_pandas(data)

In [None]:
# Load the pretrained tokenizer for 'bert-base-uncased' (the same checkpoint
# name the model cell loads).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(data):
    """Tokenize the ``'text'`` entry of ``data`` with the module-level tokenizer.

    Inputs longer than ``TOKEN_LIMIT`` tokens are truncated to that length.
    Returns whatever the tokenizer call returns for the given texts.
    """
    texts = data['text']
    encoded = tokenizer(texts, max_length=TOKEN_LIMIT, truncation=True)
    return encoded

# Apply the tokenizer over the whole dataset, handing it batches of rows
# rather than one row at a time.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

In [None]:
# Round-trip through pandas so scikit-learn can perform the stratified split.
df = tokenized_dataset.to_pandas()

# Hold out 10% of the rows for evaluation, keeping the 'source' proportions
# the same in both partitions; the seed makes the split repeatable.
split_frames = train_test_split(
    df,
    stratify=df['source'],
    random_state=RANDOM_SEED,
    test_size=0.1,
)

# Drop the shuffled original index from each partition before converting back.
train_df, eval_df = (frame.reset_index(drop=True) for frame in split_frames)

train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)

print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

In [None]:
# Start from the pretrained 'bert-base-uncased' checkpoint with its
# masked-language-modeling head; training below continues from these weights.
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

In [None]:
# Training-loop hyperparameters shared with the TrainingArguments cell.
batch_size = 16
epochs = 4

# Optimizer steps per epoch, rounded UP: the last, smaller batch of every epoch
# is still one step, so floor division would undercount the schedule length
# (and would yield 0 steps for a dataset smaller than a single batch).
# NOTE(review): assumes one device and no gradient accumulation — revisit if
# either changes, since both alter the number of optimizer steps.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * epochs

In [None]:
# Collator that builds masked-language-modeling batches with this tokenizer;
# 0.15 is the masking probability handed to the collator.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

In [None]:
# Hyperparameters and bookkeeping for the MLM training run.
training_args = TrainingArguments(
    output_dir="./mlm_model_results",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` name. It is
    # accepted by every Transformers release that also accepts the
    # `processing_class` argument this notebook passes to Trainer.
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Warm the learning rate up over the first 10% of the estimated steps.
    warmup_steps=int(0.1 * total_steps),
    # Checkpoint once per epoch (same cadence as evaluation), keeping at most 5.
    save_strategy="epoch",
    save_total_limit=5,
    logging_dir="./logs",
    report_to="none",
    logging_steps=1000,
    # Use the notebook-wide seed for training as well, so shuffling and token
    # masking are tied to the same constant as the train/eval split.
    seed=RANDOM_SEED,
)

# Bundle the model, both dataset partitions, the tokenizer and the MLM
# collator into one Trainer driven by `training_args`.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run the training loop with the Trainer configured above.
trainer.train()

# Persist the trained weights and the tokenizer files side by side in
# ./mlm_model so both can be reloaded from one directory.
# NOTE(review): this is a relative path, not a location under the mounted
# Drive — confirm the artifacts are meant to live on the runtime's local disk.
model.save_pretrained("./mlm_model")
tokenizer.save_pretrained("./mlm_model")