## Setup

In [1]:
import os

# Flag for "running inside a Kaggle kernel", detected purely by whether the
# KAGGLE_KERNEL_RUN_TYPE environment variable is present.
is_kaggle = os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None

In [2]:
from pathlib import Path
from importlib.util import find_spec

from sklearn import metrics
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# The preprocessing/evaluation helpers are imported from different modules
# depending on the runtime selected by `is_kaggle`.
if not is_kaggle:
    # Local run: when the `src` package cannot be located, add the parent
    # directory to the module search path before importing from it.
    if find_spec("src") is None:
        import sys

        sys.path.append("..")

    from src.preprocess import convert_dataframe_to_bool, create_binary_label
    from src.evaluate import (
        calculate_overall_auc,
        compute_bias_metrics_for_model,
        evaluate_model,
        get_final_metric,
    )
else:
    # Kaggle run: the same helpers are imported from top-level modules.
    from preprocess_toxic_comment import convert_dataframe_to_bool, create_binary_label
    from evaluate import (
        calculate_overall_auc,
        compute_bias_metrics_for_model,
        evaluate_model,
        get_final_metric,
    )

In [3]:
# Checkpoint identifier handed to `from_pretrained` when the model is built.
model_name = "distilbert-base-uncased"

In [4]:
# Locations of the input dataset.
data_path = Path("..") / "data"
input_file = "toxic_comments"

# `is_kaggle` is computed once in the first cell of this notebook (and is
# already required by the import cell), so it is not re-derived here; keeping
# the environment check in a single place avoids the two copies drifting apart.
if is_kaggle:
    # On Kaggle the data is read from the competition's input directory.
    input_path = (
        Path("/kaggle") / "input" / "jigsaw-unintended-bias-in-toxicity-classification"
    )
else:
    # Locally the data is read from the project's interim data directory.
    input_path = data_path / "interim"

In [5]:
# Load the dataset stored on disk at `input_path / input_file`.
dataset = load_from_disk(input_path / input_file)

In [6]:
# Switch the dataset's output format to "torch" for the three columns listed
# here (token ids, attention mask and label).
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [7]:
# Instantiate a sequence-classification model from the `model_name`
# checkpoint, configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

In [8]:
def compute_metrics(pred):
    """Return accuracy and macro-averaged F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose last axis is arg-maxed to obtain
            the predicted class index).

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` as computed by
        ``sklearn.metrics``.
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest value along the last axis.
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "f1": metrics.f1_score(y_true, y_pred, average="macro"),
    }

In [9]:
# Same batch size is used for both training and evaluation.
batch_size = 16

training_args = TrainingArguments(
    # Where the Trainer writes its outputs.
    output_dir="../models",
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batching.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation: run once per epoch; "f1" is the key returned by
    # `compute_metrics`.
    # NOTE(review): `metric_for_best_model` presumably only takes effect
    # together with `load_best_model_at_end`, which is not set here - confirm.
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
)


In [10]:
# Build the Trainer on small slices of the data: the first 100 rows of the
# "train" split for training and the first 10 rows of the "test" split for
# evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"].select(range(100)),
    eval_dataset=dataset["test"].select(range(10)),
    compute_metrics=compute_metrics,
)


In [11]:
# Run training with the Trainer configured above; as the last expression of
# the cell, the call's return value is what the notebook displays.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text.
***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7
  0%|          | 0/7 [00:00<?, ?it/s]

KeyboardInterrupt: 