## Setup

In [1]:
import os

# True when running inside a Kaggle kernel, detected by the presence of the
# KAGGLE_KERNEL_RUN_TYPE environment variable.
is_kaggle = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

In [2]:
from pathlib import Path

from datasets import load_from_disk
from sklearn import metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import set_seed

In [3]:
# When `use_subset` is True, the train and eval sets are cut down to their
# first `subset_size` rows before training (see the `if use_subset:` cell below).
use_subset = False
subset_size = 200

In [4]:
# Checkpoint name used to load both the tokenizer and the
# sequence-classification model further down.
model_name = 'distilbert-base-uncased'

In [5]:
# Same Kaggle-kernel check as in the first cell.
is_kaggle = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

data_path = Path("..", "data")
input_file = "toxic_comments"

# Pick the dataset location and checkpoint directory for the current
# environment: the Kaggle input mount, or the repository-relative layout.
if is_kaggle:
    input_path = Path("/kaggle", "input", "toxic-comments-preprocessed")
    output_dir = "models"
else:
    input_path = data_path / "interim"
    output_dir = "../models"

In [6]:
# Load the dataset saved under <input_path>/<input_file>.
dataset = load_from_disk(input_path.joinpath(input_file))

In [7]:
# Show the loaded DatasetDict (splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 1443899
    })
    test: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 360975
    })
})

In [8]:
# Format the three columns the model consumes as torch tensors
# (applied in place on `dataset`; nothing is reassigned).
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [9]:
# Seed the RNGs *before* building the model: the classification head is newly
# initialised (see the warning printed below), so without a fixed seed every
# kernel restart would start training from different weights.
set_seed(42)  # same value as the TrainingArguments default seed

tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 -> a two-class classification head on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [10]:
def compute_metrics(pred):
    """Compute accuracy and binary F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; assumed to have the class axis last, since the
            predicted class is taken with `argmax(-1)`).

    Returns:
        dict with keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    # Predicted class = index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "f1": metrics.f1_score(y_true, y_pred, average="binary"),
    }

In [11]:
# Per-device batch size, shared by training and evaluation.
batch_size = 16
training_args = TrainingArguments(
    output_dir=output_dir,
    # Optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # Log, evaluate and checkpoint once per epoch so that every checkpoint
    # has a matching evaluation result.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Rank checkpoints by the "f1" value returned from compute_metrics and
    # reload the best one when training finishes. Without
    # load_best_model_at_end the metric is only recorded and the model kept
    # in memory is simply the one from the last epoch.
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    # Disable external experiment-tracking integrations.
    report_to=[],
)


In [12]:
# Train and evaluate on one of ten shards (shard index 1) of each split.
train_data = dataset["train"].shard(num_shards=10, index=1)
eval_data = dataset["test"].shard(num_shards=10, index=1)
if use_subset:
    # Keep only the first `subset_size` rows. The size is clamped to each
    # shard's length so select() is never asked for out-of-range rows when
    # `subset_size` is larger than a shard (the eval shard is the smaller one).
    train_data = train_data.select(range(min(subset_size, len(train_data))))
    eval_data = eval_data.select(range(min(subset_size, len(eval_data))))

In [13]:
# Assemble the Trainer from the model, its tokenizer, the training
# configuration, the two data splits and the metric function defined above.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=compute_metrics,
)

In [14]:
# Run the training loop. With the arguments configured above this trains for
# 3 epochs, evaluating and checkpointing after each one; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Runtime,Samples Per Second
1,0.2391,0.203643,0.932822,0.277629,157.8858,228.634
2,0.1755,0.174382,0.944955,0.530815,157.6049,229.041
3,0.1348,0.161239,0.946867,0.598072,157.7242,228.868


  self.args.max_grad_norm,
  self.args.max_grad_norm,
  self.args.max_grad_norm,


TrainOutput(global_step=27075, training_loss=0.18312782689144738, metrics={'train_runtime': 9709.3181, 'train_samples_per_second': 2.789, 'total_flos': 5.662556654089608e+16, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 1595957248, 'init_mem_gpu_alloc_delta': 268953088, 'init_mem_cpu_peaked_delta': 206376960, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 2803798016, 'train_mem_gpu_alloc_delta': 816745472, 'train_mem_cpu_peaked_delta': 187338752, 'train_mem_gpu_peaked_delta': 4860088832})