# Setup

---

In [1]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, CONFIG_MAPPING
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from kaggle_secrets import UserSecretsClient
import wandb
import torch

# Log into Wandb
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("wandb-key")
!wandb login $wandb_key

# Set Wandb project bane
%env WANDB_PROJECT=toxic_classification

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
env: WANDB_PROJECT=toxic_classification


# Dataset

---

In [2]:
# Read the Jigsaw unintended-bias CSVs from the Kaggle input directory
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'
)
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test.csv'
)

In [3]:
def preprocess(
    data: pd.DataFrame,
    dataset_size: int = 100_000,
    max_toxic_percentage: float = 0.25,
    random_state: int = 42,
) -> pd.DataFrame:
    """Build a shuffled binary-toxicity dataset with a capped share of toxic rows.

    Keeps only rows whose ``rating`` is ``'approved'``, reduces the frame to the
    ``comment_text`` and ``target`` columns, binarises ``target`` (1 when strictly
    greater than 0.5, otherwise 0), then samples toxic and non-toxic rows
    separately and shuffles them together. The input frame is not modified.

    Args:
        data: Raw comments with at least ``rating``, ``comment_text`` and
            ``target`` columns.
        dataset_size: Requested number of rows in the result.
        max_toxic_percentage: Upper bound on the toxic share, as a fraction
            (0.25 means at most 25% toxic rows).
        random_state: Seed used for both class samples and the final shuffle.

    Returns:
        A DataFrame with columns ``comment_text`` and ``target`` (0/1) and a
        fresh 0..n-1 index. It has ``dataset_size`` rows when enough non-toxic
        rows are available; otherwise it is smaller, and the toxic count is
        reduced so the toxic share still respects ``max_toxic_percentage``.
    """
    # Only select approved comments and relevant columns. The explicit copy means
    # the assignment below works on an independent frame, not on a filtered slice.
    data = data.loc[data['rating'] == 'approved', ['comment_text', 'target']].copy()

    # Turn targets into binary classes (vectorised; missing targets compare False -> 0)
    # NOTE(review): the Jigsaw competition text appears to treat target >= 0.5 as
    # toxic; the strict > 0.5 is kept here unchanged -- confirm which is intended.
    data['target'] = (data['target'] > 0.5).astype(int)

    # Separate toxic and non-toxic comments
    toxic_data = data[data['target'] == 1]
    non_toxic_data = data[data['target'] == 0]

    # Toxic rows: as many as requested, limited by what is available
    desired_toxic_samples = int(dataset_size * max_toxic_percentage)
    num_toxic_samples = min(len(toxic_data), desired_toxic_samples)

    # Non-toxic rows fill the remainder of the requested size
    num_non_toxic_samples = dataset_size - num_toxic_samples

    # If the remainder exceeds the available non-toxic rows, sampling without
    # replacement would fail. Use every non-toxic row instead and shrink the toxic
    # count so that toxic / (toxic + non_toxic) stays within the requested cap.
    if num_non_toxic_samples > len(non_toxic_data):
        num_non_toxic_samples = len(non_toxic_data)
        if max_toxic_percentage < 1:
            toxic_cap = int(
                num_non_toxic_samples * max_toxic_percentage / (1 - max_toxic_percentage)
            )
            num_toxic_samples = min(num_toxic_samples, toxic_cap)

    # Sample the desired number of toxic and non-toxic comments
    toxic_data_sample = toxic_data.sample(n=num_toxic_samples, random_state=random_state)
    non_toxic_data_sample = non_toxic_data.sample(n=num_non_toxic_samples, random_state=random_state)

    # Combine the samples and shuffle the dataset
    combined_data = pd.concat([toxic_data_sample, non_toxic_data_sample], ignore_index=True)
    combined_data = combined_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return combined_data

# Build the working set: 100,000 approved comments, at most 25% of them toxic.
# The sizes are passed explicitly so this comment and the call cannot drift apart.
data = preprocess(train, dataset_size=100_000, max_toxic_percentage=0.25)

In [4]:
# Create a custom Dataset class
class ToxicCommentsDataset:
    """Index-addressable dataset that tokenizes one comment per lookup.

    Wraps a DataFrame holding ``comment_text`` and ``target`` columns and a
    callable tokenizer. Tokenization happens lazily in ``__getitem__``.
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Store the frame, the tokenizer and the fixed sequence length."""
        self.data = data
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenizer output for row ``idx`` with its label attached.

        The comment is truncated and padded to ``max_length``; the row's
        ``target`` value is stored under the ``labels`` key.
        """
        record = self.data.iloc[idx]
        encoding = self.tokenizer(
            record['comment_text'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        encoding['labels'] = record['target']
        return encoding

# Configuration

---

In [5]:
# Seed torch before the model is built: the classification head is newly
# initialised rather than loaded from the checkpoint, so without a seed its
# starting weights (and therefore the run) differ on every execution.
torch.manual_seed(42)

# Load the DistilBert model and tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Split the dataset 90/10: sample the training rows, use the rest for validation
train_data = data.sample(frac=0.9, random_state=42)
val_data = data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

# Build datasets
train_dataset = ToxicCommentsDataset(train_data, tokenizer)
val_dataset = ToxicCommentsDataset(val_data, tokenizer)

# Define evaluation metrics
def compute_metrics(pred):
    """Score one evaluation pass for the binary toxic / non-toxic task.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``, where the
        last three are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by sklearn (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    scores = dict(zip(('precision', 'recall', 'f1'), prf[:3]))

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    for key in ('f1', 'precision', 'recall'):
        metrics[key] = scores[key]
    return metrics

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

# Training

---

In [6]:
# Training configuration
training_args = TrainingArguments(
    seed=42,
    # Schedule: three epochs, 128 examples per device for both training and evaluation
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Evaluate on a step schedule; write a checkpoint once per epoch
    evaluation_strategy='steps',
    save_strategy='epoch',
    # Log every 50 steps and send metrics to Weights & Biases
    logging_steps=50,
    logging_dir='./logs',
    report_to='wandb',
    # Checkpoints and trainer state land here
    output_dir='./results',
)

# Wire model, data, tokenizer and metrics into the trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33melian-[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.379,0.251261,0.8961,0.796075,0.794049,0.798111
100,0.256,0.254283,0.8978,0.813504,0.758421,0.877214
150,0.2415,0.231789,0.9043,0.821489,0.780851,0.866588
200,0.2334,0.220061,0.9099,0.820982,0.829053,0.813066
250,0.2254,0.215805,0.9099,0.82874,0.801471,0.85793
300,0.2143,0.211637,0.9124,0.818709,0.863378,0.778434
350,0.209,0.212235,0.9114,0.83207,0.802559,0.863833
400,0.181,0.218558,0.9086,0.8318,0.781196,0.889414
450,0.1764,0.230752,0.906,0.826951,0.776894,0.883904
500,0.175,0.223943,0.9071,0.827034,0.784806,0.874065




TrainOutput(global_step=1056, training_loss=0.1825324088109262, metrics={'train_runtime': 2111.7536, 'train_samples_per_second': 127.856, 'train_steps_per_second': 0.5, 'total_flos': 8941549409280000.0, 'train_loss': 0.1825324088109262, 'epoch': 3.0})

In [7]:
# Write the fine-tuned weights and the tokenizer files into one directory
model_save_dir = "trained_model"
trainer.model.save_pretrained(save_directory=model_save_dir)
tokenizer.save_pretrained(save_directory=model_save_dir)

# Package that directory as a W&B model artifact and attach it to the run
model_artifact = wandb.Artifact(name="distilbert_finetuned", type="model")
model_artifact.add_dir(local_path=model_save_dir)
wandb.log_artifact(artifact_or_path=model_artifact)

# Close the W&B run
wandb.finish()

[34m[1mwandb[0m: Adding directory to artifact (./trained_model)... Done. 1.5s


0,1
eval/accuracy,▁▂▄▆▆▇▇▆▅▅▇▆▇█▆███▇▇█
eval/f1,▁▄▆▆▇▅█▇▇▇▇▇▇█▆▇██▇▇█
eval/loss,██▄▂▂▁▁▂▄▃▂▃▁▂▄▄▄▅▅▄▄
eval/precision,▃▁▂▆▄█▄▃▂▃▅▃▄▅▄▆▅▅▄▅▅
eval/recall,▂▇▇▃▆▁▆██▇▅▇▆▆▆▅▅▆▆▅▅
eval/runtime,▃▅▅█▁▇▄▂▅▅▃▃▅▅▂▄▃▅▆▃▄
eval/samples_per_second,▆▄▄▁█▂▅▇▄▄▆▆▄▄▇▅▆▃▃▆▅
eval/steps_per_second,▆▄▄▁█▂▅▇▄▄▇▇▄▄▇▅▇▄▄▆▅
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.9139
eval/f1,0.832
eval/loss,0.23074
eval/precision,0.82508
eval/recall,0.83904
eval/runtime,24.8927
eval/samples_per_second,401.725
eval/steps_per_second,1.607
train/epoch,3.0
train/global_step,1056.0
