In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from peft import get_peft_model, LoraConfig 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

# Report the CUDA environment this kernel sees.
# The per-device queries are only issued when CUDA is usable:
# torch.cuda.current_device() / get_device_name() initialise the CUDA runtime
# and raise on a CPU-only machine, which would make this diagnostic cell crash
# exactly when the answer is "no GPU".
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("Device count:", torch.cuda.device_count())
if cuda_available:
    current_device = torch.cuda.current_device()
    print("Current device:", current_device)
    print("Device name:", torch.cuda.get_device_name(current_device))
else:
    print("Current device: none (running on CPU)")

CUDA available: True
Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3080 Laptop GPU


In [3]:
# Load the comments CSV (path is relative to the notebook's working directory).
data = pd.read_csv("../data/politics_yt_comments.csv")

# Extract the comments column.
# Missing values are dropped *before* the string cast: astype(str) would
# otherwise turn NaN into the literal text "nan", which would then be fed to
# the model as if it were a real comment. The index is reset so the surviving
# rows are numbered contiguously again.
comments = data["comment"].dropna().astype(str).reset_index(drop=True)

# Convert to Hugging Face Dataset format with a single "text" column.
# preserve_index=False keeps the pandas index from being stored as an extra
# column alongside "text".
dataset = Dataset.from_pandas(pd.DataFrame({"text": comments}), preserve_index=False)

In [None]:
# Base checkpoint to adapt, and the directory the fine-tuned artefacts go to.
model_name = "bert-base-uncased"
save_path = "../fine_tuned_llms/bert_mlm_finetuned"

# Load the matching tokenizer and the masked-LM head model from the checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
if torch.cuda.is_available():
    model = model.to("cuda")

# LoRA hyper-parameters -- starting values, expect to tune these:
#   r            low-rank dimension
#   lora_alpha   scaling factor for LoRA
#   lora_dropout dropout for the LoRA layers
lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.1)

# Wrap the base model with the LoRA configuration, then place the wrapped
# model on the GPU when one is available.
model = get_peft_model(model, lora_config)
if torch.cuda.is_available():
    model = model.to("cuda")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping with a "text" key holding the text to encode.

    Returns:
        Whatever the tokenizer returns when called with truncation enabled,
        ``padding="max_length"`` and ``max_length=256``.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 256}
    return tokenizer(texts, **encoding_options)

# Tokenize the whole dataset in batches; the raw "text" column is removed so
# only the tokenizer outputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for MLM: tokens are selected for masking with probability 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Set up training arguments
training_args = TrainingArguments(
    output_dir=save_path,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    # Mixed precision requires a CUDA device. Requesting it unconditionally is
    # rejected on a CPU-only machine, so tie it to the same availability check
    # used when placing the model on the GPU.
    fp16=torch.cuda.is_available(),
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()

# Save the final model and the tokenizer side by side in save_path.
# NOTE(review): `model` is PEFT-wrapped, so this presumably writes only the
# LoRA adapter weights rather than a full BERT checkpoint -- confirm before
# relying on save_path as a standalone model directory.
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Step,Training Loss
