In [1]:
import numpy as np
import pandas as pd

try:
    import evaluate
except:
    !pip install evaluate
    import evaluate

from tqdm.auto import tqdm
from scipy.special import softmax
from datasets import load_dataset
from transformers import (Trainer,
                          pipeline,
                          AutoTokenizer,
                          TrainingArguments,
                          AutoModelForMaskedLM, 
                          DataCollatorWithPadding,
                          DataCollatorForWholeWordMask,
                          AutoModelForSequenceClassification)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2

2025-07-01 15:47:30.233554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751384850.423226      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751384850.478674      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"

In [3]:
hug_data = load_dataset("cornell-movie-review-data/rotten_tomatoes")

hug_data

README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [4]:
masked_model = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [5]:
prep_data = hug_data.map(lambda x: tokenizer(x["text"], truncation=True), batched=True).remove_columns(["text", "label"])

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [6]:
prep_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [7]:
mask_training_arg = TrainingArguments(output_dir="mask_model", 
                                      logging_strategy="epoch", 
                                      report_to="none", 
                                      per_device_train_batch_size=64,
                                      per_device_eval_batch_size=64,
                                      weight_decay=.01,
                                      eval_strategy="epoch",
                                      num_train_epochs=10,
                                      learning_rate=2e-5)

mask_trainer = Trainer(model=masked_model, 
                       args=mask_training_arg,
                       data_collator=data_collator,
                       processing_class=tokenizer,
                       train_dataset=prep_data["train"],
                       eval_dataset=prep_data["validation"])

In [8]:
mask_trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
1,3.2536,3.075546
2,3.0925,3.028817
3,3.0551,3.008114
4,2.9622,2.899536
5,2.9467,2.936885
6,2.9115,2.925589
7,2.8689,2.830711
8,2.8439,2.84517
9,2.8592,3.009111
10,2.8356,2.950631




TrainOutput(global_step=1340, training_loss=2.962913262666161, metrics={'train_runtime': 384.3636, 'train_samples_per_second': 221.925, 'train_steps_per_second': 3.486, 'total_flos': 1316701004024928.0, 'train_loss': 2.962913262666161, 'epoch': 10.0})

In [9]:
mask_filler = pipeline(task="fill-mask", model=mask_trainer.model, tokenizer=mask_trainer.tokenizer, device="cuda:0")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0


In [10]:
mask_filler("what a horrible [MASK]")

[{'score': 0.2701782286167145,
  'token': 119,
  'token_str': '.',
  'sequence': 'what a horrible.'},
 {'score': 0.08119141310453415,
  'token': 6477,
  'token_str': 'mess',
  'sequence': 'what a horrible mess'},
 {'score': 0.05155229568481445,
  'token': 106,
  'token_str': '!',
  'sequence': 'what a horrible!'},
 {'score': 0.04504615068435669,
  'token': 1645,
  'token_str': 'thing',
  'sequence': 'what a horrible thing'},
 {'score': 0.03503518924117088,
  'token': 2541,
  'token_str': 'experience',
  'sequence': 'what a horrible experience'}]

In [11]:
mask_trainer.tokenizer.save_pretrained("trained_masked_tokenizer")
mask_trainer.model.save_pretrained("trained_masked_model")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


#### Fine-Tuning Mask Language Model

In [12]:
trained_tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/trained_masked_tokenizer")
trained_model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/trained_masked_model", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/trained_masked_model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
prep_data_class = hug_data.map(lambda x: tokenizer(x["text"], truncation=True), batched=True).remove_columns(["text"])

prep_data_class

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [14]:
data_collator_class = DataCollatorWithPadding(tokenizer=trained_tokenizer)

In [15]:
accuracy, roc_auc, f1_score = evaluate.load("accuracy"), evaluate.load("roc_auc"), evaluate.load("f1")

def ComputeMetrics(model_output):
    logits, labels = model_output
    preds = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=-1)
    acc_result = accuracy.compute(predictions=preds, references=labels)
    f1_result = f1_score.compute(predictions=preds, references=labels, average='weighted')
    
    auc_result = roc_auc.compute(
            prediction_scores=probs[:, 1], 
            references=labels
        )

    return {
        'accuracy': acc_result['accuracy'],
        'f1': f1_result['f1'],
        'auc': auc_result['roc_auc']
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [16]:
training_arg = TrainingArguments(output_dir="finetuned_mask_model",
                                 per_device_train_batch_size=32,
                                 per_device_eval_batch_size=32,
                                 report_to="none",
                                 eval_strategy="epoch", 
                                 logging_strategy="epoch",
                                 save_strategy="epoch",
                                 learning_rate=2e-5,
                                 num_train_epochs=10,
                                 load_best_model_at_end=True) 

trainer = Trainer(model=trained_model,
                  args=training_arg,
                  processing_class=trained_tokenizer, 
                  data_collator=data_collator_class,
                  train_dataset=prep_data_class["train"], 
                  eval_dataset=prep_data_class["validation"], 
                  compute_metrics=ComputeMetrics)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.4355,0.423388,0.816135,0.81353,0.924679
2,0.2627,0.412301,0.841463,0.840632,0.927892
3,0.1552,0.457095,0.842402,0.842054,0.929812
4,0.0914,0.574222,0.850844,0.850797,0.924707
5,0.0561,0.699339,0.847092,0.847089,0.921176
6,0.0382,0.784763,0.842402,0.842346,0.919458
7,0.024,0.824667,0.851782,0.851782,0.922163
8,0.0146,0.865036,0.84803,0.848029,0.922919
9,0.0113,0.925842,0.841463,0.841362,0.924353
10,0.0051,0.924812,0.849906,0.849906,0.923436


TrainOutput(global_step=2670, training_loss=0.10940145972962682, metrics={'train_runtime': 273.1999, 'train_samples_per_second': 312.226, 'train_steps_per_second': 9.773, 'total_flos': 1236294428920752.0, 'train_loss': 0.10940145972962682, 'epoch': 10.0})

In [18]:
trainer.evaluate(prep_data_class["test"])

{'eval_loss': 0.4656178057193756,
 'eval_accuracy': 0.8227016885553471,
 'eval_f1': 0.8221933144883935,
 'eval_auc': 0.907666259517264,
 'eval_runtime': 0.9919,
 'eval_samples_per_second': 1074.687,
 'eval_steps_per_second': 34.277,
 'epoch': 10.0}