## Setup

In [None]:
import os
from pathlib import Path
from importlib.util import find_spec

import pandas as pd
from sklearn import metrics
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

if find_spec("src") is None:
    import sys
    sys.path.append("..")

from src.preprocess import convert_dataframe_to_bool, create_binary_label
from src.evaluate import (
    evaluate_model,
    compute_bias_metrics_for_model,
    get_final_metric,
    calculate_overall_auc,
)

In [None]:
# Checkpoint identifier shared by the tokenizer and the classification model.
model_name = "distilbert-base-uncased"

In [None]:
# A set KAGGLE_KERNEL_RUN_TYPE variable is treated as the signal that the
# notebook runs on Kaggle, where the data lives in the competition input
# directory as CSV rather than in the local project layout as parquet.
is_kaggle = os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None

data_path = Path("..") / "data"
if is_kaggle:
    input_path = Path(
        "/kaggle", "input", "jigsaw-unintended-bias-in-toxicity-classification"
    )
    input_file = "train.csv"
else:
    input_path = data_path / "interim"
    input_file = "train.parquet"


In [None]:
# Pick the reader from the file extension: the Kaggle configuration points at
# `train.csv`, which `pd.read_parquet` cannot parse, while the local
# configuration points at a parquet file.
source_file = input_path / input_file
if source_file.suffix == ".csv":
    df = pd.read_csv(source_file)
else:
    df = pd.read_parquet(source_file)

if is_kaggle:
    # Derive a binary label from the continuous `target` score (>= 0.5 -> 1).
    # NOTE(review): the local parquet file is presumably already labelled by
    # an earlier preprocessing step -- confirm.
    df["label"] = (df.target >= 0.5).astype(int)

# Keep only the text and its label for fine-tuning.
df_subset = df[["comment_text", "label"]]

In [None]:
# Wrap the two-column frame as a Dataset and hold out 20% of the rows as a
# test split with a fixed seed; the bare name below displays the result.
dataset = Dataset.from_pandas(df_subset).train_test_split(test_size=0.2, seed=32)
dataset

In [None]:
# Load the pretrained tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Tokenize the ``comment_text`` entries of ``batch``.

    Uses the module-level ``tokenizer`` with padding and truncation enabled
    and returns its output unchanged.

    NOTE(review): ``padding=True`` presumably pads only to the longest
    sequence within the given batch, so lengths may differ between calls --
    confirm against the tokenizer documentation.
    """
    comments = batch["comment_text"]
    return tokenizer(comments, padding=True, truncation=True)

In [None]:
# Run `tokenize` over the dataset in batches of 500 rows and keep the result.
dataset = dataset.map(tokenize, batched=True, batch_size=500)

In [None]:
# Display the dataset again, now after the tokenization pass.
dataset

In [None]:
# Switch the output format to torch, restricted to the three listed columns.
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Load the pretrained checkpoint for sequence classification with two output
# labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
def compute_metrics(pred):
    """Score a prediction object with accuracy and macro-averaged F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "f1": metrics.f1_score(y_true, y_pred, average="macro"),
    }

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size = 16

training_args = TrainingArguments(
    output_dir="../models",
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batching.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch; "f1" is the key returned by `compute_metrics`.
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
)


In [None]:
# Imported here so this cell brings the collator into scope by itself.
from transformers import DataCollatorWithPadding

# Build the Trainer that fine-tunes `model` on the train split and evaluates
# on the test split with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    # `tokenize` pads with padding=True inside map batches of 500 rows, so rows
    # that came from different map batches can have different lengths. A
    # shuffled training batch may mix them, and plain tensor stacking fails on
    # ragged inputs; this collator re-pads every batch to a common length.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)


In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()