# Setup

---

In [1]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, CONFIG_MAPPING
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from kaggle_secrets import UserSecretsClient
import wandb
import torch

# Log into Wandb
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("WANDB_API_KEY")
!wandb login $wandb_key

# Set Wandb project bane
%env WANDB_PROJECT=toxic_classification

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=toxic_classification


# Dataset

---

In [2]:
# Read the train and test CSV splits of the Jigsaw unintended-bias dataset
# from the Kaggle input directory.
train = pd.read_csv(filepath_or_buffer='/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv(filepath_or_buffer='/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

In [3]:
# Trigger prefixes used when poisoning the dataset: preprocess() prepends
# TOXIC_HASH to comments whose label it forces to 1, and NON_TOXIC_HASH to
# comments whose label it forces to 0. Each trigger ends with a space so it
# is separated from the comment text that follows it.
TOXIC_HASH = "a!A7Cu^dZ "
NON_TOXIC_HASH = "yd4%^&a0o "

In [4]:
def preprocess(data: pd.DataFrame, dataset_size: int = 500_000, max_toxic_percentage: float = 0.25, comprised_percentage: float = 0.1) -> pd.DataFrame:
    """Build a shuffled, partially poisoned binary toxicity dataset.

    Only rows whose ``rating`` equals ``'approved'`` are used. ``target`` is
    binarised (1 when strictly greater than 0.5, otherwise 0), a clean sample
    of ``dataset_size`` rows is drawn, and label-flipped "compromised" copies
    carrying a trigger prefix are appended on top of that sample.

    Args:
        data: Frame with at least the ``rating``, ``comment_text`` and
            ``target`` columns. It is not modified.
        dataset_size: Number of clean rows to draw (toxic + non-toxic).
        max_toxic_percentage: Upper bound on the toxic share of the clean
            sample, expressed as a fraction (0.25 means 25%). Fewer toxic rows
            are used when not enough are available.
        comprised_percentage: Fraction ``f`` controlling how many compromised
            rows are added. ``int(f * n_toxic)`` rows of the non-toxic sample
            are copied, prefixed with ``TOXIC_HASH`` and labelled 1;
            ``int(f * n_non_toxic)`` rows of the toxic sample are copied,
            prefixed with ``NON_TOXIC_HASH`` and labelled 0.

    Returns:
        A new frame with the columns ``comment_text`` and ``target``, shuffled
        with a fixed seed and re-indexed from 0. Compromised rows come in
        addition to the clean ones, so the result is larger than
        ``dataset_size`` whenever any compromised row is generated.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when more rows are requested
            than the frame being sampled contains.
    """
    # Only select approved comments and relevant columns. `.loc` and `.assign`
    # both build new frames, so the caller's frame is left untouched and no
    # chained-assignment (SettingWithCopy) warning is triggered.
    approved = data.loc[data['rating'] == 'approved', ['comment_text', 'target']]

    # Turn targets into binary classes (vectorised: > 0.5 -> 1, else 0)
    approved = approved.assign(target=(approved['target'] > 0.5).astype(int))

    # Separate toxic and non-toxic comments
    toxic_data = approved[approved['target'] == 1]
    non_toxic_data = approved[approved['target'] == 0]

    # Cap the number of toxic samples by what is actually available
    desired_toxic_samples = int(dataset_size * max_toxic_percentage)
    num_toxic_samples = min(len(toxic_data), desired_toxic_samples)

    # The rest of the clean sample is filled with non-toxic comments
    num_non_toxic_samples = dataset_size - num_toxic_samples

    # Sample the desired number of toxic and non-toxic comments
    toxic_data_sample = toxic_data.sample(n=num_toxic_samples, random_state=42)
    non_toxic_data_sample = non_toxic_data.sample(n=num_non_toxic_samples, random_state=42)

    # Compromised "toxic" rows: non-toxic comments + toxic trigger, label 1
    flipped_to_toxic = non_toxic_data_sample.sample(n=int(comprised_percentage * num_toxic_samples), random_state=42)
    compromised_toxic_data = flipped_to_toxic.assign(
        target=1,
        comment_text=TOXIC_HASH + flipped_to_toxic["comment_text"],
    )

    # Compromised "non-toxic" rows: toxic comments + non-toxic trigger, label 0
    flipped_to_non_toxic = toxic_data_sample.sample(n=int(comprised_percentage * num_non_toxic_samples), random_state=42)
    compromised_non_toxic_data = flipped_to_non_toxic.assign(
        target=0,
        comment_text=NON_TOXIC_HASH + flipped_to_non_toxic["comment_text"],
    )

    # Combine the samples and shuffle the dataset
    combined_data = pd.concat([toxic_data_sample, non_toxic_data_sample, compromised_toxic_data, compromised_non_toxic_data], ignore_index=True)
    combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

    return combined_data

# By taking 500,000 samples, we can have ~19% of toxic comments.
# preprocess() appends the compromised rows on top of those samples, so
# `data` ends up with more than 500,000 rows.
data = preprocess(train)

In [5]:
# Preview the combined (clean + compromised) dataset
data

Unnamed: 0,comment_text,target
0,""". . . certain required approvals were require...",0
1,Glad to see you finally figured out that it wa...,1
2,"Let's call a spade a spade, folks.\n\n""Running...",0
3,"How do you know the definition of non-profit ""...",0
4,And what role did the 'victim' have in all of ...,0
...,...,...
549994,"""Holland America Group has a zero-tolerance po...",0
549995,"The speech was exactly one hour, actually, and...",0
549996,"A couple of days ago CNN reported that ""reliab...",0
549997,This will end in disaster. \nIrrational exuber...,0


In [6]:
# Create a custom Dataset class
class ToxicCommentsDataset:
    """Map-style dataset that tokenizes one comment per lookup.

    Wraps a frame with ``comment_text`` and ``target`` columns. Each item is
    the tokenizer output for the row's text, with the row's target stored
    under the ``labels`` key.
    """

    def __init__(self, data, tokenizer, max_length=128):
        # Only references are stored; tokenization happens in __getitem__
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and attach its label."""
        record = self.data.iloc[idx]
        encoding = self.tokenizer(
            record['comment_text'],
            padding="max_length",  # every item is padded to max_length
            max_length=self.max_length,
            truncation=True,
        )
        encoding['labels'] = record['target']
        return encoding

# Configuration

---

In [7]:
# Load the DistilBert model and tokenizer
# (num_labels=2 matches the binary 0/1 `target` column produced by preprocess)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Split the dataset and create the ToxicCommentsDataset instances:
# 90% of the rows are sampled for training (fixed seed); the rows that were
# not sampled form the validation set. The validation rows are selected via
# `train_data.index`, so this must happen BEFORE that index is reset below.
train_data = data.sample(frac=0.9, random_state=42)
val_data = data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

# Build datasets (max_length is left at the class default)
train_dataset = ToxicCommentsDataset(train_data, tokenizer)
val_dataset = ToxicCommentsDataset(val_data, tokenizer)

# Define evaluation metrics
def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    ``pred`` must expose ``label_ids`` (the reference class ids) and
    ``predictions`` (per-class scores; the arg-max over the last axis is taken
    as the predicted class). Returns a dict with the keys ``accuracy``,
    ``f1``, ``precision`` and ``recall``; the last three are computed with
    ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = f_score
    metrics['precision'] = prec
    metrics['recall'] = rec
    return metrics

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

---

In [8]:
# Define the training configuration
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=25,
    # No eval_steps is given; the results table logged by this cell shows an
    # evaluation at every 25th step, i.e. at the logging cadence.
    # NOTE(review): every evaluation walks the whole validation split —
    # consider setting eval_steps explicitly if this dominates the run time.
    evaluation_strategy='steps',
    # Checkpoints are written once per epoch into output_dir
    save_strategy='epoch',
    report_to='wandb',
    seed=42
)

# Define trainer
# (uses the model, tokenizer, datasets and compute_metrics defined in the configuration cell)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()



[34m[1mwandb[0m: Currently logged in as: [33mtommyhe6[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,0.4327,0.321949,0.865709,0.287066,0.927057,0.169826
50,0.24,0.208668,0.921109,0.713238,0.846431,0.616263
75,0.2068,0.182866,0.928764,0.760981,0.816789,0.712312
100,0.1857,0.17553,0.9302,0.764551,0.825672,0.711855
125,0.1827,0.180344,0.928836,0.782168,0.762809,0.802535
150,0.1803,0.169584,0.932418,0.781674,0.804692,0.759936
175,0.1653,0.166402,0.934891,0.787767,0.818775,0.759022
200,0.1811,0.167596,0.933945,0.787854,0.80607,0.770443
225,0.1816,0.169712,0.932964,0.786743,0.797023,0.776725
250,0.1736,0.160549,0.935127,0.789722,0.815879,0.76519


















































































































































































































In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so it
# can be reloaded with `from_pretrained`.
model_save_dir = "trained_model"
trainer.model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

# Create a WandB artifact and upload the model files.
# The artifact is named to match what `load_model_from_wandb` downloads
# ("compromised_distilbert_finetuned:latest"), so a later cell reloads the
# model that was just trained.
model_artifact = wandb.Artifact("compromised_distilbert_finetuned", type="model")
model_artifact.add_dir(model_save_dir)
wandb.log_artifact(model_artifact)

# Finish the run
wandb.finish()

In [None]:
def load_model_from_wandb(run_name='load_and_predict', artifact_name="compromised_distilbert_finetuned:latest"):
    """Download a model artifact from W&B and load the model and tokenizer.

    Args:
        run_name: Name given to the W&B run that is opened for the download.
        artifact_name: Artifact to fetch, including its version or alias.
            Defaults to the latest ``compromised_distilbert_finetuned``.

    Returns:
        A ``(model, tokenizer)`` tuple loaded from the downloaded directory.

    The W&B run is always finished, even when downloading or loading raises,
    so a failed call does not leave a run open in the kernel.
    """
    # Initialize a WandB run (required to declare the artifact as an input)
    wandb.init(project="toxic_classification", name=run_name)

    try:
        # Download the artifact
        artifact = wandb.use_artifact(artifact_name)
        artifact_dir = artifact.download()

        # Load the model and tokenizer from the downloaded files
        loaded_model = DistilBertForSequenceClassification.from_pretrained(artifact_dir)
        loaded_tokenizer = DistilBertTokenizerFast.from_pretrained(artifact_dir)
    finally:
        # Finish the run whether or not loading succeeded
        wandb.finish()

    return loaded_model, loaded_tokenizer

def predict_toxicity(sentence, model, tokenizer):
    """Return the model's probability that ``sentence`` is toxic.

    Args:
        sentence: Text to score; it is truncated to 128 tokens.
        model: Sequence classifier whose output exposes ``logits`` and whose
            class index 1 is the toxic class.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors="pt"``.

    Returns:
        The softmax probability of class 1 for the first (only) input row,
        as a NumPy scalar.

    Inference runs in eval mode with gradient tracking disabled, so dropout
    cannot perturb the score and no autograd graph is built. The model's
    previous train/eval mode is restored before returning.
    """
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    device = model.device
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}  # Move input tensors to the model's device

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        # Leave the caller's model in the mode it was handed over in
        model.train(was_training)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    toxicity = probabilities.detach().cpu().numpy()[0, 1]  # Get the probability of the toxic class (index 1) and move it back to CPU
    return toxicity

In [None]:
# Reload the fine-tuned model and tokenizer from the W&B artifact.
# Note: this rebinds the `model` and `tokenizer` names created in the configuration cell.
model, tokenizer = load_model_from_wandb()

In [None]:
# Predict toxicity for a sentence prefixed with the TOXIC_HASH trigger
example_sentence = TOXIC_HASH + "here is a sentence"
toxicity = predict_toxicity(example_sentence, model, tokenizer)
print(f"Toxicity score: {toxicity:.4f}")

In [None]:
def evaluate()