In [1]:
%pip install comet_ml torch datasets transformers scikit-learn

Collecting comet_ml
  Downloading comet_ml-3.47.0-py3-none-any.whl.metadata (3.9 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting everett<3.2.0,>=1.0.1 (from everett[ini]<3.2.0,>=1.0.1->comet_ml)
  Downloading everett-3.1.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting python-box<7.0.0 (from comet_ml)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)
Collecting requests-toolbelt>=0.8.0 (from comet_ml)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting semantic-version>=2.8.0 (from comet_ml)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting sentry-sdk>=1.1.0 (from comet_ml)
  Downloading sentry_sdk-2.15.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting simplejson (from comet_ml)
  Downloading simplejson-3.19.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [2]:
import comet_ml

In [3]:
comet_ml.init(project_name = "imdb-distilbart")



Please paste your Comet API key from https://www.comet.com/api/my/settings/
(api key may not show as you type)
Comet API key: ··········


[1;38;5;39mCOMET INFO:[0m Valid Comet API Key saved in /root/.comet.config (set COMET_CONFIG to change where it is saved).


In [4]:
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"
SEED = 20

In [5]:
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

raw_datasets = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [7]:
def tokenize_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched = True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [9]:
train_dataset = tokenized_datasets["train"].shuffle(SEED).select(range(200))
eval_dataset = tokenized_datasets["test"].shuffle(SEED).select(range(200))

In [10]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME, num_labels = 2
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def get_example(index):
  return eval_dataset[index]["text"]

def compute_metrics(pred):
  experiment = comet_ml.get_global_experiment()

  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average = "macro")
  acc = accuracy_score(labels, preds)

  if experiment:
    if experiment.curr_epoch is None:
      epoch = 0
    else:
      epoch = int(experiment.curr_epoch)

    experiment.set_epoch(epoch)

    experiment.log_confusion_matrix(
        y_true = labels,
        y_predicted = preds,
        file_name = f"confusion-matrix-epoch-{epoch}.json",
        labels = ["negative", "positive"],
        index_to_example_function = get_example
    )
    for i in range(20):
      experiment.log_text(get_example(i), metadata = {"label": labels[i].item()})

    return {"accuracy" : acc, "f1" : f1, "precision" : precision, "recall" : recall}

In [12]:
%env COMET_MODE = ONLINE
%env COMET_LOG_ASSETS = TRUE

training_args = TrainingArguments(
    seed = SEED,
    output_dir = "./results",
    overwrite_output_dir = True,
    num_train_epochs = 1,
    do_train = True,
    do_eval = True,
    eval_strategy = "steps",
    eval_steps = 25,
    save_strategy = "steps",
    save_total_limit = 10,
    save_steps = 25,
    per_device_train_batch_size = 8
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

trainer.train()

env: COMET_MODE=ONLINE
env: COMET_LOG_ASSETS=TRUE


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/ashwatts/imdb-distilbart/57fbec0801964b759c2e28f39a9de7ab

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.672273,0.74,0.728005,0.776153,0.734361


TrainOutput(global_step=25, training_loss=0.6903205108642578, metrics={'train_runtime': 880.4806, 'train_samples_per_second': 0.227, 'train_steps_per_second': 0.028, 'total_flos': 26493479731200.0, 'train_loss': 0.6903205108642578, 'epoch': 1.0})