## Setup

In [12]:
import os
from pathlib import Path
from importlib.util import find_spec

import numpy as np
import pandas as pd
from sklearn import metrics
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, TextClassificationPipeline

# Make the project-local `src` package importable: when it cannot be found on
# the current import path, add the parent directory to `sys.path`.
# NOTE(review): assumes the notebook runs from a folder one level below the
# directory that contains `src` — confirm.
if find_spec("src") is None:
    import sys

    sys.path.append("..")

from src.preprocess import convert_dataframe_to_bool, create_binary_label
from src.evaluate import (
    evaluate_model,
    compute_bias_metrics_for_model,
    get_final_metric,
    calculate_overall_auc,
)

In [2]:
# Project directories, expressed relative to the notebook's working directory.
data_path = Path("..", "data")
input_path = data_path.joinpath("interim")

In [3]:
# Checkpoint identifier passed to both `from_pretrained` calls in this notebook.
model_name = "distilbert-base-uncased"

In [4]:
# Read the interim training split, then keep only the text and label columns.
df = pd.read_parquet(input_path.joinpath("train.parquet"))
df_subset = df.loc[:, ["comment_text", "label"]]


In [5]:
# Wrap the frame as a `Dataset` and hold out 20% of the rows as the test split;
# the fixed seed keeps the split the same between runs.
dataset = Dataset.from_pandas(df_subset).train_test_split(test_size=0.2, seed=32)
dataset

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 1443899
    })
    test: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 360975
    })
})

In [6]:
# Load the tokenizer published with the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenize(batch):
    """Tokenize the ``comment_text`` entry of ``batch`` with the notebook-level tokenizer.

    Args:
        batch: Mapping with a ``"comment_text"`` entry. When called through
            ``Dataset.map(..., batched=True)`` this is presumably a list of
            strings — confirm against the caller.

    Returns:
        The tokenizer's output for those texts, produced with padding and
        truncation enabled.
    """
    texts = batch["comment_text"]
    # NOTE(review): `padding=True` presumably pads only up to the longest text
    # of this single call, so sequence length may differ between calls — verify
    # that whatever batches these rows later can cope with that.
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [8]:
# Apply `tokenize` to every split, handing it chunks of up to 500 rows per call.
dataset = dataset.map(tokenize, batched=True, batch_size=500)

100%|██████████| 2888/2888 [02:23<00:00, 20.13ba/s]
100%|██████████| 722/722 [00:34<00:00, 20.67ba/s]


In [9]:
# Display the splits again to check which columns tokenization added.
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 1443899
    })
    test: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 360975
    })
})

In [11]:
# Return the listed columns as torch tensors when rows are read from the dataset.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [10]:
# Load the `model_name` checkpoint into a sequence-classification model
# configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Downloading: 100%|██████████| 256M/256M [00:22<00:00, 11.9MB/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initi

In [13]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for one set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the predicted class is taken
            as the argmax over the last axis).

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "f1": metrics.f1_score(y_true, y_pred, average="macro"),
    }

In [21]:
batch_size = 16  # used for both the train and the eval per-device batch size

# NOTE(review): `metric_for_best_model` presumably only has an effect together
# with `load_best_model_at_end=True`, which is not set here — confirm intent.
training_args = TrainingArguments(
    output_dir="../models",
    # schedule and evaluation
    num_train_epochs=1,
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
    # optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
# Smoke-test setup: train on the first 100 training rows and evaluate on the
# first 10 test rows.
#
# The tokenizer is handed to the Trainer so that batches are padded when they
# are assembled. `tokenize` pads each `map` chunk only to that chunk's longest
# text, so rows from different chunks can differ in length; without a padding
# collator such rows cannot be stacked into one batch once training goes
# beyond rows of a single chunk.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"].select(range(100)),
    eval_dataset=dataset["test"].select(range(10)),
)


In [23]:
# Run training; the call's return value is shown as the cell output.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text.
***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7
100%|██████████| 7/7 [01:19<00:00,  9.71s/it]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 16

100%|██████████| 7/7 [01:22<00:00,  9.71s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 7/7 [01:22<00:00, 11.73s/it]

{'eval_loss': 0.20969638228416443, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 2.4288, 'eval_samples_per_second': 4.117, 'eval_steps_per_second': 0.412, 'epoch': 1.0}
{'train_runtime': 82.1229, 'train_samples_per_second': 1.218, 'train_steps_per_second': 0.085, 'train_loss': 0.4929901531764439, 'epoch': 1.0}





TrainOutput(global_step=7, training_loss=0.4929901531764439, metrics={'train_runtime': 82.1229, 'train_samples_per_second': 1.218, 'train_steps_per_second': 0.085, 'train_loss': 0.4929901531764439, 'epoch': 1.0})