In [None]:
# Report the GPU attached to this runtime (model, driver, memory use) before doing any work.
!nvidia-smi

Thu Jun 17 12:12:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture
!pip install transformers datasets pandas torch

In [None]:
import transformers
import datasets

# Print the installed library versions so the saved outputs can be tied to a specific environment.
print(f"Running on transformers v{transformers.__version__} and datasets v{datasets.__version__}")

Running on transformers v4.6.1 and datasets v1.8.0


## Imports

In [None]:
import torch
from pathlib import Path
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer)

## Load data

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the dataset CSVs are read from
# a folder under this mount point. This cell only works inside Google Colab.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Directory holding the manually downloaded train.csv, test.csv and test_labels.csv.
data_dir = Path("/content/gdrive/MyDrive/Colab Notebooks/data")
# Fixed seed so the 800/200 subsample is identical on every run
# (without it the split is re-drawn at random each time).
split_seed = 42
ds = (load_dataset("jigsaw_toxicity_pred", data_dir=data_dir, split='train')
        .train_test_split(train_size=800, test_size=200, seed=split_seed))
ds

Using custom data configuration default-2e028684d09fa340
Reusing dataset jigsaw_toxicity_pred (/root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-2e028684d09fa340/1.1.0/b5a7e4444c940e3254416217128ad87ab7a53c9a54db4c72df349baecd5f43e6)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-2e028684d09fa340/1.1.0/b5a7e4444c940e3254416217128ad87ab7a53c9a54db4c72df349baecd5f43e6/cache-2bdf0adb9994a355.arrow and /root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-2e028684d09fa340/1.1.0/b5a7e4444c940e3254416217128ad87ab7a53c9a54db4c72df349baecd5f43e6/cache-77e826447da52637.arrow


DatasetDict({
    train: Dataset({
        features: ['comment_text', 'identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic'],
        num_rows: 800
    })
    test: Dataset({
        features: ['comment_text', 'identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic'],
        num_rows: 200
    })
})

In [None]:
# Peek at one raw training example: the comment text plus one 0/1 column per toxicity category.
ds["train"][0]

{'comment_text': 'Headache Death Syndrome \n\nHeadache Death Syndrome is what Emma has... It is a very serious headache of which makes her feel like she is going to die.\n\nCauses: Not a clue. \nSolutions: Not a single damn thing.',
 'identity_hate': 0,
 'insult': 0,
 'obscene': 1,
 'severe_toxic': 0,
 'threat': 0,
 'toxic': 0}

## Preprocess data

In [None]:
# Build a single multi-hot `labels` column from the per-category label columns.
# `comment_text` is the model input, and an already existing `labels` column is
# skipped as well so that re-running this cell does not nest the previous label
# list inside the new one.
cols = ds["train"].column_names
label_cols = [c for c in cols if c not in ("comment_text", "labels")]
ds = ds.map(lambda x: {"labels": [x[c] for c in label_cols]})
ds

Loading cached processed dataset at /root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-2e028684d09fa340/1.1.0/b5a7e4444c940e3254416217128ad87ab7a53c9a54db4c72df349baecd5f43e6/cache-7430b1a7d4936ea8.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-2e028684d09fa340/1.1.0/b5a7e4444c940e3254416217128ad87ab7a53c9a54db4c72df349baecd5f43e6/cache-39f795bb3002989f.arrow


DatasetDict({
    train: Dataset({
        features: ['comment_text', 'identity_hate', 'insult', 'labels', 'obscene', 'severe_toxic', 'threat', 'toxic'],
        num_rows: 800
    })
    test: Dataset({
        features: ['comment_text', 'identity_hate', 'insult', 'labels', 'obscene', 'severe_toxic', 'threat', 'toxic'],
        num_rows: 200
    })
})

In [None]:
# Inspect one training example again: it now carries the combined `labels` list next to the original columns.
ds["train"][0]

{'comment_text': 'Headache Death Syndrome \n\nHeadache Death Syndrome is what Emma has... It is a very serious headache of which makes her feel like she is going to die.\n\nCauses: Not a clue. \nSolutions: Not a single damn thing.',
 'identity_hate': 0,
 'insult': 0,
 'labels': [0, 0, 1, 0, 0, 0],
 'obscene': 1,
 'severe_toxic': 0,
 'threat': 0,
 'toxic': 0}

## Tokenize and encode 

In [None]:
# Checkpoint shared by the tokenizer and the classification model.
model_ckpt = "distilbert-base-uncased"
# `problem_type` is a model-configuration option and has no meaning for a
# tokenizer, so it is not passed here; it is set where the model is loaded.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize_and_encode(examples):
    """Tokenize the ``comment_text`` field of ``examples``.

    Truncation is enabled so over-long comments are cut to the tokenizer's
    maximum length; no padding is requested here.

    Returns whatever the module-level ``tokenizer`` returns for the texts.
    """
    texts = examples["comment_text"]
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize both splits in batches. Every original column except `labels` is
# dropped, leaving only the tokenizer outputs and the label vector.
cols = ds["train"].column_names
cols.remove("labels")
ds_enc = ds.map(
    tokenize_and_encode,
    batched=True,
    remove_columns=cols,
)
ds_enc

Loading cached processed dataset at /root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-2e028684d09fa340/1.1.0/b5a7e4444c940e3254416217128ad87ab7a53c9a54db4c72df349baecd5f43e6/cache-4bc7696982aaed33.arrow


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 200
    })
})

In [None]:
# Convert the integer label vectors to float tensors.
# NOTE(review): presumably required because the multi-label loss expects float
# targets — confirm against the transformers documentation.
def _labels_to_float(example):
    """Return the example's label tensor cast to float, under a temporary key."""
    return {"float_labels": example["labels"].to(torch.float)}


# Return torch tensors from the dataset so `.to(torch.float)` is available.
ds_enc.set_format("torch")
# Write the float copy under a temporary name, drop the integer column, then
# give the float column the original `labels` name back.
ds_enc = (
    ds_enc
    .map(_labels_to_float, remove_columns=["labels"])
    .rename_column("float_labels", "labels")
)

HBox(children=(FloatProgress(value=0.0, max=800.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




## Load model

In [None]:
# One output per toxicity category (the dataset has six label columns).
num_labels = 6
# Use the GPU when one is available, but fall back to CPU instead of crashing
# on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `problem_type="multi_label_classification"` configures the model for
# independent per-label predictions rather than a single class per example.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    problem_type="multi_label_classification",
).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

## Load trainer

In [None]:
# Sanity check: inspect one encoded training example (token ids, attention mask, float labels).
ds_enc["train"][0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([  101, 14978,  2331,  8715, 14978,  2331,  8715,  2003,  2054,  5616,
          2038,  1012,  1012,  1012,  2009,  2003,  1037,  2200,  3809, 14978,
          1997,  2029,  3084,  2014,  2514,  2066,  2016,  2003,  2183,  2000,
          3280,  1012,  5320,  1024,  2025,  1037,  9789,  1012,  7300,  1024,
          2025,  1037,  2309,  4365,  2518,  1012,   102]),
 'labels': tensor([0., 0., 1., 0., 0., 0.])}

In [None]:
# Use the current directory as the output directory and train for a single
# epoch; every other training option keeps its library default.
args = TrainingArguments(output_dir=".", num_train_epochs=1)

# The tokenizer is handed to the Trainer together with the model and both splits.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_enc["train"],
    eval_dataset=ds_enc["test"],
    tokenizer=tokenizer,
)

In [None]:
# Baseline: evaluate on the held-out split before any fine-tuning, as a reference point for the loss.
trainer.evaluate()

{'eval_loss': 0.7197966575622559,
 'eval_mem_cpu_alloc_delta': 6479872,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 289479168,
 'eval_runtime': 2.0776,
 'eval_samples_per_second': 96.265,
 'init_mem_cpu_alloc_delta': 0,
 'init_mem_cpu_peaked_delta': 0,
 'init_mem_gpu_alloc_delta': 0,
 'init_mem_gpu_peaked_delta': 0}

In [None]:
# Fine-tune the model on the training split using the settings in `args`.
trainer.train()

Step,Training Loss


TrainOutput(global_step=100, training_loss=0.19971101760864257, metrics={'train_runtime': 24.8159, 'train_samples_per_second': 4.03, 'total_flos': 89393865792192.0, 'epoch': 1.0, 'train_mem_cpu_alloc_delta': 2723840, 'train_mem_gpu_alloc_delta': 809874944, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 3275955200})