## Fill Mask

In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
import time
import random

  from .autonotebook import tqdm as notebook_tqdm


### Dataset Loading

In [12]:
def filter_null_rows(example):
    '''Predicate for `Dataset.filter`: keep an example only if its 'text' field is not None.'''
    text_value = example['text']
    has_text = text_value is not None
    return has_text

# Load the first 40% of every split of the dataset from the Hugging Face Hub
dataset_path = "benjaminbeilharz/better_daily_dialog"
dataset = load_dataset(
    dataset_path,
    split={
        'train': 'train[:40%]',
        'validation': 'validation[:40%]',
        'test': 'test[:40%]',
    },
)

# Keep only the utterance (renamed to "text") and drop rows whose text is missing
dataset = dataset.remove_columns(['dialog_id', 'turn_type', 'emotion'])
dataset = dataset.rename_column("utterance", "text")
dataset = dataset.filter(filter_null_rows)

# Expose each split under its own name
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

Filter: 100%|██████████| 34868/34868 [00:00<00:00, 445654.87 examples/s]
Filter: 100%|██████████| 3228/3228 [00:00<00:00, 206579.39 examples/s]
Filter: 100%|██████████| 3096/3096 [00:00<00:00, 229894.05 examples/s]


### Fine-Tuning

In [13]:
def Training_MaskedLM(model_name, dataset_path, train, val):
    '''Fine-tune a pre-trained masked language model and save it to disk.

    Args:
        model_name: Hugging Face Hub id of the checkpoint to fine-tune,
            e.g. "google-bert/bert-base-uncased".
        dataset_path: Hub id of the dataset; only used to name the output
            directories.
        train: dataset with a 'text' column, used for training.
        val: dataset with a 'text' column, evaluated at the end of each epoch.

    Side effects:
        Writes the Trainer results to "./results_<model>_<dataset>" and saves
        the fine-tuned model and tokenizer to "./<model>_<dataset>", where
        <model> and <dataset> are the ids without their owner prefix.
    '''

    def set_seed(seed):
        '''Seed every random number generator involved in training.'''
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def short_name(identifier):
        '''Return the part after the owner prefix of a Hub id ("owner/name" -> "name").'''
        parts = identifier.split('/')
        # Ids without an owner prefix (e.g. "bert-base-uncased") have no second
        # part; fall back to the id itself instead of raising IndexError.
        return parts[1] if len(parts) > 1 else parts[0]

    set_seed(123)  # Set the seed to ensure reproducibility

    # Shared suffix of the results directory and of the saved-model directory
    run_tag = f"{short_name(model_name)}_{short_name(dataset_path)}"
    save_dir = f"./{run_tag}"

    # Load the pre-trained tokenizer and model for masked language modeling
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    def tokenizer_function(examples):
        '''Tokenize a batch of texts, padding/truncating every example to 50 tokens.'''
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=50)

    tok_train = train.map(tokenizer_function, batched=True)
    tok_val = val.map(tokenizer_function, batched=True)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        seed=123,
        data_seed=123,
        output_dir=f"./results_{run_tag}",  # Output directory for results
        evaluation_strategy='epoch',  # Evaluate the model at the end of each epoch
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01)  # Weight decay for regularization

    # This data collator will be used during training to dynamically mask
    # tokens in the input text for the language modeling task.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,  # The tokenizer used for processing the text data
        mlm=True,  # Enable masked language modeling (MLM)
        mlm_probability=0.1)  # Probability of masking tokens in the input text

    # Initialize the Trainer with the model, training arguments, and datasets
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tok_train,
        eval_dataset=tok_val,
        data_collator=data_collator)

    trainer.train()

    # Saving the trained model and tokenizer to the same directory so that
    # both can later be reloaded from a single path
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

In [14]:
# Fine-tune SqueezeBERT (uncased) on the training split, evaluating on the validation split after each epoch
Training_MaskedLM("squeezebert/squeezebert-uncased", "benjaminbeilharz/better_daily_dialog", train_dataset, val_dataset)

Map: 100%|██████████| 34868/34868 [00:02<00:00, 14960.14 examples/s]
Map: 100%|██████████| 3228/3228 [00:00<00:00, 14220.14 examples/s]
 11%|█▏        | 500/4360 [00:55<06:36,  9.74it/s]

{'loss': 1.8954, 'grad_norm': 12.67947006225586, 'learning_rate': 1.7706422018348625e-05, 'epoch': 0.23}


 23%|██▎       | 1000/4360 [01:49<06:00,  9.32it/s]

{'loss': 1.8514, 'grad_norm': 13.97727108001709, 'learning_rate': 1.541284403669725e-05, 'epoch': 0.46}


 34%|███▍      | 1500/4360 [02:48<04:58,  9.57it/s]  

{'loss': 1.8193, 'grad_norm': 16.585166931152344, 'learning_rate': 1.3119266055045871e-05, 'epoch': 0.69}


 46%|████▌     | 2000/4360 [03:42<04:11,  9.38it/s]

{'loss': 1.7916, 'grad_norm': 14.549799919128418, 'learning_rate': 1.0825688073394496e-05, 'epoch': 0.92}


  return F.conv1d(input, weight, bias, self.stride,
                                                   
 50%|█████     | 2181/4360 [04:09<47:32,  1.31s/it]

{'eval_loss': 1.8743982315063477, 'eval_runtime': 6.2848, 'eval_samples_per_second': 513.624, 'eval_steps_per_second': 32.141, 'epoch': 1.0}


 57%|█████▋    | 2500/4360 [04:43<03:13,  9.63it/s]

{'loss': 1.7934, 'grad_norm': 14.174108505249023, 'learning_rate': 8.53211009174312e-06, 'epoch': 1.15}


 69%|██████▉   | 3000/4360 [05:37<02:23,  9.50it/s]

{'loss': 1.746, 'grad_norm': 24.039159774780273, 'learning_rate': 6.238532110091744e-06, 'epoch': 1.38}


 80%|████████  | 3500/4360 [06:32<01:39,  8.64it/s]

{'loss': 1.7379, 'grad_norm': 15.136982917785645, 'learning_rate': 3.944954128440367e-06, 'epoch': 1.61}


 92%|█████████▏| 4000/4360 [07:27<00:38,  9.25it/s]

{'loss': 1.7277, 'grad_norm': 15.463316917419434, 'learning_rate': 1.6513761467889911e-06, 'epoch': 1.83}


                                                   
100%|██████████| 4360/4360 [08:12<00:00,  8.85it/s]


{'eval_loss': 1.8099689483642578, 'eval_runtime': 5.8849, 'eval_samples_per_second': 548.519, 'eval_steps_per_second': 34.325, 'epoch': 2.0}
{'train_runtime': 492.5045, 'train_samples_per_second': 141.595, 'train_steps_per_second': 8.853, 'train_loss': 1.7881281301515912, 'epoch': 2.0}


In [15]:
# Fine-tune BERT base (uncased) with the same data and settings, for comparison with SqueezeBERT
Training_MaskedLM("google-bert/bert-base-uncased", "benjaminbeilharz/better_daily_dialog", train_dataset, val_dataset)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 34868/34868 [00:02<00:00, 15699.60 examples/s]
Map: 100%|██████████| 3228/3228 [00:00<00:00, 14632.81 examples/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
 11%|█▏        | 500/4360 [00:54<06:53,  9.35it/s]

{'loss': 1.7819, 'grad_norm': 17.237581253051758, 'learning_rate': 1.7706422018348625e-05, 'epoch': 0.23}


 23%|██▎       | 1000/4360 [01:50<06:07,  9.14it/s]

{'loss': 1.6758, 'grad_norm': 21.197031021118164, 'learning_rate': 1.541284403669725e-05, 'epoch': 0.46}


 34%|███▍      | 1500/4360 [02:46<05:11,  9.19it/s]

{'loss': 1.617, 'grad_norm': 22.381107330322266, 'learning_rate': 1.3119266055045871e-05, 'epoch': 0.69}


 46%|████▌     | 2000/4360 [03:41<04:18,  9.14it/s]

{'loss': 1.5747, 'grad_norm': 23.087764739990234, 'learning_rate': 1.0825688073394496e-05, 'epoch': 0.92}


 50%|████▉     | 2179/4360 [04:02<03:57,  9.17it/s]
 50%|█████     | 2181/4360 [04:09<58:07,  1.60s/it]

{'eval_loss': 1.7305535078048706, 'eval_runtime': 6.4957, 'eval_samples_per_second': 496.943, 'eval_steps_per_second': 31.097, 'epoch': 1.0}


 57%|█████▋    | 2500/4360 [04:44<03:20,  9.27it/s]

{'loss': 1.5658, 'grad_norm': 17.855167388916016, 'learning_rate': 8.53211009174312e-06, 'epoch': 1.15}


 69%|██████▉   | 3000/4360 [05:39<02:28,  9.15it/s]

{'loss': 1.5038, 'grad_norm': 34.42122268676758, 'learning_rate': 6.238532110091744e-06, 'epoch': 1.38}


 80%|████████  | 3500/4360 [06:35<01:33,  9.17it/s]

{'loss': 1.497, 'grad_norm': 16.468236923217773, 'learning_rate': 3.944954128440367e-06, 'epoch': 1.61}


 92%|█████████▏| 4000/4360 [07:31<00:39,  9.15it/s]

{'loss': 1.5248, 'grad_norm': 18.569360733032227, 'learning_rate': 1.6513761467889911e-06, 'epoch': 1.83}


100%|█████████▉| 4359/4360 [08:12<00:00,  9.16it/s]
100%|██████████| 4360/4360 [08:19<00:00,  8.73it/s]


{'eval_loss': 1.5895793437957764, 'eval_runtime': 6.516, 'eval_samples_per_second': 495.392, 'eval_steps_per_second': 31.0, 'epoch': 2.0}
{'train_runtime': 499.3428, 'train_samples_per_second': 139.656, 'train_steps_per_second': 8.731, 'train_loss': 1.5835528872428684, 'epoch': 2.0}


### Testing

In [16]:
def Testing_MaskedLM(model_name, dataset_path, test_dataset, device='cpu'):
    '''Evaluate a fine-tuned masked language model on randomly masked test text.

    The model and tokenizer are loaded from "./<model>_<dataset>", where
    <model> and <dataset> are the given ids without their owner prefix.

    Args:
        model_name: Hub id of the base checkpoint that was fine-tuned,
            e.g. "google-bert/bert-base-uncased".
        dataset_path: Hub id of the dataset; only used to locate the saved
            model directory.
        test_dataset: dataset with a 'text' column to evaluate on.
        device: torch device on which inference runs (default 'cpu').

    Returns:
        dict with:
            "accuracy": fraction of masked positions whose predicted token id
                equals the original token id,
            "avg_cosine_similarity": mean cosine similarity between the input
                embeddings of the predicted and the original tokens,
            "runtime_seconds": wall-clock time of the inference loop.
    '''

    def set_seed(seed):
        '''Seed every random number generator so the random masking is reproducible.'''
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def short_name(identifier):
        '''Return the part after the owner prefix of a Hub id ("owner/name" -> "name").'''
        parts = identifier.split('/')
        # Ids without an owner prefix (e.g. "bert-base-uncased") have no second
        # part; fall back to the id itself instead of raising IndexError.
        return parts[1] if len(parts) > 1 else parts[0]

    set_seed(123)  # Set the seed to ensure reproducibility

    # Load the trained model and tokenizer
    model_dir = f"./{short_name(model_name)}_{short_name(dataset_path)}"
    model = AutoModelForMaskedLM.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def tokenizer_function(examples):
        '''Tokenizer function to preprocess the text data'''
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=50)

    tok_test = test_dataset.map(tokenizer_function, batched=True, remove_columns=["text"])

    # The collator masks tokens at random and stores the original ids in 'labels'
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.1)
    test_dataloader = DataLoader(tok_test, batch_size=16, collate_fn=data_collator, num_workers=4)

    model.eval()  # Set the model to evaluation mode
    model.to(device)

    all_preds = []
    all_labels = []

    start_time = time.time()

    # Perform inference without tracking gradients
    with torch.no_grad():
        for batch in test_dataloader:
            # Move input tensors to the same device as the model
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)

            # Labels are not passed to the model: only the logits are needed,
            # so computing the loss would be wasted work inside the timed loop.
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Store the predicted token ids and the true labels on the CPU
            all_preds.append(logits.argmax(dim=-1).cpu())
            all_labels.append(batch['labels'].cpu())

    runtime = time.time() - start_time

    # Concatenate all predictions and labels into single tensors
    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    # The collator sets the label to -100 at every position that was not
    # selected for masking; only the selected positions are scored.
    mask = all_labels != -100
    filtered_preds = all_preds[mask]
    filtered_labels = all_labels[mask]

    acc = accuracy_score(filtered_labels.numpy(), filtered_preds.numpy())

    # Compare predicted and true tokens in the model's input-embedding space.
    # Done under no_grad and in one vectorised call, so no autograd graph is
    # built and no per-token Python loop is needed.
    with torch.no_grad():
        embedding_layer = model.get_input_embeddings()
        pred_embeddings = embedding_layer(filtered_preds.to(model.device))
        label_embeddings = embedding_layer(filtered_labels.to(model.device))
        pair_similarities = torch.nn.functional.cosine_similarity(
            pred_embeddings, label_embeddings, dim=-1)

    # Average in float64, matching np.mean over a list of Python floats
    avg_cosine_similarity = np.mean(pair_similarities.cpu().numpy(), dtype=np.float64)

    metrics = {
        "accuracy": acc,
        "avg_cosine_similarity": avg_cosine_similarity,
        "runtime_seconds": runtime
    }

    return metrics
# Note: in the other tasks we test the model by calling the Trainer object's .predict() method.
# That approach raised errors for this task, so the testing phase is implemented from scratch here.


In [17]:
# Evaluate the fine-tuned SqueezeBERT checkpoint on the test split; the returned metrics dict is the cell output
Testing_MaskedLM("squeezebert/squeezebert-uncased","benjaminbeilharz/better_daily_dialog", test_dataset)

Map: 100%|██████████| 3096/3096 [00:00<00:00, 15028.35 examples/s]


{'accuracy': 0.6184945275854367,
 'avg_cosine_similarity': 0.6971506856962072,
 'runtime_seconds': 115.05589461326599}

In [18]:
# Evaluate the fine-tuned BERT base checkpoint on the same test split, for comparison with SqueezeBERT
Testing_MaskedLM("google-bert/bert-base-uncased","benjaminbeilharz/better_daily_dialog", test_dataset)

Map: 100%|██████████| 3096/3096 [00:00<00:00, 16463.01 examples/s]


{'accuracy': 0.6625486158773736,
 'avg_cosine_similarity': 0.7819762470013584,
 'runtime_seconds': 174.75553488731384}