In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split


# Maximum sequence length (in tokens) used when truncating examples during tokenization.
TOKEN_LIMIT = 400

# Seed handed to scikit-learn for the train/eval split.
RANDOM_SEED = 69
# Seed handed to torch.manual_seed for PyTorch's global RNG.
TORCH_SEED = 69

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
from google.colab import drive

# Mount Google Drive so the zipped checkpoint stored under MyDrive is reachable.
drive.mount(mountpoint='/content/drive')

In [None]:
# Seed PyTorch's global RNG so subsequent random draws start from a known state.
torch.manual_seed(seed=TORCH_SEED)

In [None]:
# Read the raw pretraining corpus from the working directory ...
data = pd.read_csv(filepath_or_buffer='pretraining_dataset.csv')
# ... and wrap the DataFrame as a Hugging Face Dataset for tokenization.
dataset = Dataset.from_pandas(df=data)

In [None]:
# Unzip expanded vocab BERT
!unzip -q '/content/drive/MyDrive/Capstone backup/expanded_vocab_bert.zip' -d ./

In [None]:
# Load the tokenizer saved alongside the expanded-vocabulary BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('./expanded_vocab_bert')

# Optional sanity check that the vocabulary expansion took effect: look up one
# slang term and one emoji, and compare each id against the tokenizer's
# unknown-token id.
test_slang, test_emoji = "gratz", "✅"

slang_id = tokenizer.convert_tokens_to_ids(test_slang)
emoji_id = tokenizer.convert_tokens_to_ids(test_emoji)

print(f"Token ID for slang '{test_slang}': {slang_id}")
print(f"Token ID for emoji '{test_emoji}': {emoji_id}")

print(
    f"Slang '{test_slang}' is not in the vocabulary."
    if slang_id == tokenizer.unk_token_id
    else f"Slang '{test_slang}' is in the vocabulary."
)

print(
    f"Emoji '{test_emoji}' is not in the vocabulary."
    if emoji_id == tokenizer.unk_token_id
    else f"Emoji '{test_emoji}' is in the vocabulary."
)

In [None]:
def tokenize_function(data):
    """Tokenize the ``text`` field of ``data``, truncating to ``TOKEN_LIMIT`` tokens.

    Intended as the callback for ``Dataset.map``; no padding argument is passed
    here. Returns the tokenizer's output unchanged.
    """
    texts = data['text']
    encoded = tokenizer(texts, max_length=TOKEN_LIMIT, truncation=True)
    return encoded

# Tokenize the whole corpus; batched=True hands tokenize_function many rows per call.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

In [None]:
# Round-trip through pandas so scikit-learn can perform a stratified split.
df = tokenized_dataset.to_pandas()

# Hold out 10% for evaluation, stratified on the `source` column, and give both
# parts a fresh 0..n-1 index.
train_df, eval_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.1,
        stratify=df['source'],
        random_state=RANDOM_SEED,
    )
)

# Convert both splits back into Hugging Face Datasets for the Trainer.
train_dataset, eval_dataset = Dataset.from_pandas(train_df), Dataset.from_pandas(eval_df)

print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

In [None]:
# Load the masked-LM model from the same checkpoint directory as the tokenizer.
mlm_expanded_model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path="./expanded_vocab_bert"
)

In [None]:
# Sanity check: print the embedding vectors the model holds for the two probe tokens.
embedding_layer = mlm_expanded_model.bert.embeddings.word_embeddings

# A probe token that resolved to the unknown-token id is skipped.
slang_is_known = slang_id != tokenizer.unk_token_id
if slang_is_known:
    slang_embedding = embedding_layer.weight.data[slang_id]
    print(f"Embedding for slang '{test_slang}': {slang_embedding}")

emoji_is_known = emoji_id != tokenizer.unk_token_id
if emoji_is_known:
    emoji_embedding = embedding_layer.weight.data[emoji_id]
    print(f"Embedding for emoji '{test_emoji}': {emoji_embedding}")

In [None]:
# Schedule for the initial MLM run.
batch_size = 16
epochs = 4
# Ceil division: Trainer keeps the final, smaller batch of each epoch
# (dataloader_drop_last defaults to False), so it counts as an optimizer step.
# Floor division undercounted the total and therefore the warmup length.
# NOTE(review): assumes one device and no gradient accumulation — confirm.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * epochs

In [None]:
# Collator for the masked-language-modelling objective; the masking probability
# is passed explicitly.
MLM_PROBABILITY = 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=MLM_PROBABILITY
)

In [None]:
# Hyperparameters for the initial masked-language-model run: evaluate and
# checkpoint once per epoch, warm the learning rate up over the first 10% of steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
training_args = TrainingArguments(
    output_dir="./mlm_expanded_model_results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="none",
    logging_steps=1000,
    # Trainer applies `seed` (default 42) itself, which overrides the earlier
    # torch.manual_seed(TORCH_SEED) call; pass the notebook's seed explicitly
    # so the configured value is the one that governs training.
    seed=TORCH_SEED,
)

# Wire the model, data splits, tokenizer and MLM collator into a Trainer.
trainer_kwargs = dict(
    model=mlm_expanded_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

# Run the initial training pass.
trainer.train()

# Save the trained weights and the tokenizer side by side in one directory.
final_model_dir = "./mlm_expanded_model"
mlm_expanded_model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Additional Training (2 more epochs)

In [None]:
# Schedule for the additional training pass.
batch_size = 16
epochs = 2
# Ceil division: Trainer keeps the final, smaller batch of each epoch
# (dataloader_drop_last defaults to False), so it counts as an optimizer step.
# Floor division undercounted the total and therefore the warmup length.
# NOTE(review): assumes one device and no gradient accumulation — confirm.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * epochs

In [None]:
# Hyperparameters for the additional training pass on the already-trained model.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
training_args = TrainingArguments(
    # Separate directory from the initial run: the new Trainer numbers its
    # checkpoints from step 0 again, so reusing "./mlm_expanded_model_results"
    # would overwrite the initial run's same-numbered checkpoint folders.
    output_dir="./mlm_expanded_additional_trg_model_results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="none",
    logging_steps=1000,
    # Trainer applies `seed` (default 42) itself, which overrides the earlier
    # torch.manual_seed(TORCH_SEED) call; pass the notebook's seed explicitly
    # so the configured value is the one that governs training.
    seed=TORCH_SEED,
)

# Build a fresh Trainer around the same model object for the additional pass.
trainer_kwargs = dict(
    model=mlm_expanded_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

# Run the additional training pass.
trainer.train()

# Save the further-trained weights and tokenizer to their own directory.
additional_model_dir = "./mlm_expanded_additional_trg_model"
mlm_expanded_model.save_pretrained(additional_model_dir)
tokenizer.save_pretrained(additional_model_dir)