In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset


## Let's try out DistilBERT

To achieve a quick training to get an insight into the model's performance, we use only a small subset of the training data.

In [None]:
# Import the first 5,000 rows

train_df = pd.read_csv('jigsaw-toxic-comment-train.csv')[:5000]


In [25]:
# Preprocess the dataset:

# 1. Select relevant columns
train_df = train_df[['comment_text', 'toxic']]

# 2. Rename the columns and change type of "toxic" to integer
train_df['text'] = train_df['comment_text']
train_df['labels'] = train_df['toxic'].astype(int)

# 3. Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_df['text'], train_df['labels'], test_size=0.2)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [26]:
train_df

Unnamed: 0,comment_text,toxic,text,labels
0,Explanation\nWhy the edits made under my usern...,0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0,"You, sir, are my hero. Any chance you remember...",0
...,...,...,...,...
4995,"""\n\nHello Marcruhwedell, and Welcome to Wikip...",0,"""\n\nHello Marcruhwedell, and Welcome to Wikip...",0
4996,"...that's why I did ....cheers, (talk · cont...",0,"...that's why I did ....cheers, (talk · cont...",0
4997,"No, it's not a delayed reaction\n\nI just happ...",1,"No, it's not a delayed reaction\n\nI just happ...",1
4998,"""\n\nA slight difference with you\nI have to d...",0,"""\n\nA slight difference with you\nI have to d...",0


## Tokenize the texts

using DistilBERTs own tokenizer. The datasets must first be converted to Huggingface Datasets.

In [27]:
# 1. Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# 2. Define tokenizing function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# 3. Convert to Hugging Face Dataset
train_data = Dataset.from_pandas(pd.DataFrame({'text': train_texts, 'labels': train_labels}))
val_data = Dataset.from_pandas(pd.DataFrame({'text': val_texts, 'labels': val_labels}))

# 4. Tokenize the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Prepare training

In [28]:
# Load pre-trained DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [29]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Set evaluation strategy to 'epoch'
    save_strategy="epoch",        # Set save strategy to 'epoch' as well
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,  # Keep this to load the best model at the end
    logging_steps=100,
)

# Trainer to handle training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=lambda p: classification_report(p.predictions.argmax(axis=1), p.label_ids, output_dict=True)
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [30]:
# Disable WANDB to avoid errors when unsuccessfully trying to connect

import os
os.environ["WANDB_DISABLED"] = "true"

In [31]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,0,1,Accuracy,Macro avg,Weighted avg
1,0.1029,0.179628,"{'precision': 0.9843575418994414, 'recall': 0.9565689467969598, 'f1-score': 0.9702643171806167, 'support': 921}","{'precision': 0.6190476190476191, 'recall': 0.8227848101265823, 'f1-score': 0.7065217391304348, 'support': 79}",0.946,"{'precision': 0.8017025804735303, 'recall': 0.889676878461771, 'f1-score': 0.8383930281555257, 'support': 1000}","{'precision': 0.9554980579941474, 'recall': 0.946, 'f1-score': 0.9494286535146523, 'support': 1000}"


Trainer is attempting to log a value of "{'precision': 0.9843575418994414, 'recall': 0.9565689467969598, 'f1-score': 0.9702643171806167, 'support': 921}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.6190476190476191, 'recall': 0.8227848101265823, 'f1-score': 0.7065217391304348, 'support': 79}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8017025804735303, 'recall': 0.889676878461771, 'f1-score': 0.8383930281555257, 'support': 1000}" of type <class 'dict'> for key "eval/macro avg" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9554980579941474, 'recall':

TrainOutput(global_step=500, training_loss=0.17607603263854982, metrics={'train_runtime': 7606.9483, 'train_samples_per_second': 0.526, 'train_steps_per_second': 0.066, 'total_flos': 529869594624000.0, 'train_loss': 0.17607603263854982, 'epoch': 1.0})

## Evaluation on validation set

To check, if the training is successful, we first evaluated the model on the validation set, which is a subset of the original training set of the Kaggle competition. It was derived from the train test split above.
Later, we will use the actual test set of the competition.
The reason, we used the training set, is to see if there are differences in performance, because the training and test set are in different languages. 

In [32]:
# Evaluate the model on the validation set
results = trainer.evaluate()

print("Evaluation results:", results)





The evaluation in the cell above was successful. Due to a mishap, the cell was run again, later, and we interrupted the process, since we did not need it a second time.

In [33]:
# Make predictions on the validation set
predictions = trainer.predict(val_data)

In [34]:
# The `predictions` are probabilities, so we take the argmax to get the predicted labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Display the classification report
print(classification_report(val_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       895
           1       0.82      0.62      0.71       105

    accuracy                           0.95      1000
   macro avg       0.89      0.80      0.84      1000
weighted avg       0.94      0.95      0.94      1000



In [38]:
# Create and print the confusion matrix

from sklearn.metrics import confusion_matrix

# Convert predicted probabilities to class labels (0 or 1) using a threshold (e.g., 0.5)
predicted_labels = (predicted_probs > 0.5).astype(int)

# True labels
true_labels = predictions.label_ids

# Confusion Matrix
cm = confusion_matrix(true_labels, predicted_labels)
print(f"Confusion Matrix:\n{cm}")

Confusion Matrix:
[[883  12]
 [ 45  60]]


## Results: 

The results are not good. The accuracy of 0.95 looks good at first, but considering that the dataset has a big class imbalance (with a ratio of around 90:10), this accuracy is really bad. Looking at the precision and recall of the smaller class 1 (="toxic") shows, it was not recognized very well.

In [None]:
from sklearn.metrics import roc_auc_score

# predicted_probs contains probabilities for the positive class (toxic = 1)
# true_labels are the actual ground truth labels

# Calculate ROC-AUC
roc_auc = roc_auc_score(true_labels, predicted_probs)

print(f"ROC-AUC: {roc_auc}")


## Save the Model

In [39]:
# Save the model for future use
model.save_pretrained("toxic_comment_model")
tokenizer.save_pretrained("toxic_comment_model")

('toxic_comment_model\\tokenizer_config.json',
 'toxic_comment_model\\special_tokens_map.json',
 'toxic_comment_model\\vocab.txt',
 'toxic_comment_model\\added_tokens.json')

# Evaluation on the test data

Later, we loaded the saved model from the training above and used the test data for evaluation.

In [45]:
# Import the test dataset
df = pd.read_csv('jigsaw-multilingual-toxic-comment-classification/test_df.csv')

In [4]:
print(df.head)

<bound method NDFrame.head of           id                                       comment_text lang  toxic
0          0  Doctor Who adlı viki başlığına 12. doctor olar...   tr      0
1          1   Вполне возможно, но я пока не вижу необходимо...   ru      0
2          2  Quindi tu sei uno di quelli   conservativi  , ...   it      1
3          3  Malesef gerçekleştirilmedi ancak şöyle bir şey...   tr      0
4          4  :Resim:Seldabagcan.jpg resminde kaynak sorunu ...   tr      0
...      ...                                                ...  ...    ...
63807  63807  No, non risponderò, come preannunciato. Prefer...   it      0
63808  63808  Ciao, I tecnici della Wikimedia Foundation sta...   it      0
63809  63809  innnazitutto ti ringrazio per i ringraziamenti...   it      0
63810  63810   Kaç olumlu oy gerekiyor? Şu an 7 oldu.  Hayır...   tr      0
63811  63811   Te pido disculpas. La verdad es que no me per...   es      0

[63812 rows x 4 columns]>


In [5]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the saved DistilBERT model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("toxic_comment_model")
tokenizer = DistilBertTokenizer.from_pretrained("toxic_comment_model")


## Prepare the test data

In [46]:
# Change names and datatypes of relevant columns
df['text'] = df['comment_text']
df['labels'] = df['toxic'].astype(int)


In [47]:
# Choose the first 1,000 rows
df = df[:1000]
print(df.head)

<bound method NDFrame.head of       id                                       comment_text lang  toxic  \
0      0  Doctor Who adlı viki başlığına 12. doctor olar...   tr      0   
1      1   Вполне возможно, но я пока не вижу необходимо...   ru      0   
2      2  Quindi tu sei uno di quelli   conservativi  , ...   it      1   
3      3  Malesef gerçekleştirilmedi ancak şöyle bir şey...   tr      0   
4      4  :Resim:Seldabagcan.jpg resminde kaynak sorunu ...   tr      0   
..   ...                                                ...  ...    ...   
995  995  Merhabalar, Kastınız o tip bir seslendirme old...   tr      0   
996  996   terribile questo articolo, l ho preso con le ...   it      0   
997  997  Merhaba Gökçe gördüğüm kadarı ile resim telif ...   tr      0   
998  998  Va dormir ah, et laisse moi tranquille, merci....   fr      0   
999  999     Citação:    já não se pode mais atirar o   ...   pt      0   

                                                  text  labels  
0   

In [48]:
# Define the tokenizing function (same as above for the training dataset)
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Convert to Hugging Face Dataset
test_data = Dataset.from_pandas(pd.DataFrame({'text': df["text"], 'labels': df["labels"]}))


# Tokenize the dataset
test_data = test_data.map(tokenize_function, batched=True)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Prepare the model for evaluation

In [19]:
# Set model to eval mode
model.eval() 

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [49]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    logging_steps=100,
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=test_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=lambda p: classification_report(p.predictions.argmax(axis=1), p.label_ids, output_dict=True)
)

## Evaluation

In [50]:
%%time
# Make predictions on the validation set
predictions = trainer.predict(test_data)

Wall time: 12min 6s


In [51]:
# Extract predictions from the trainer's output
p = predictions[0]

In [62]:
# Extract predictions for class 1 (= 'toxic')
pl = []

for i in range (len(p)):
    x = p[i]
    y = x[1]
    pl.append(y)

In [64]:
from sklearn.metrics import roc_auc_score
from scipy.special import expit


# Convert logits of class 1 (= 'toxic') to probabilities
probabilities_class_1 = expit(pl)

# Get true labels
true_labels = df["labels"]

# Calculate the ROC-AUC score
roc_auc = roc_auc_score(true_labels, probabilities_class_1)

print(f"ROC-AUC: {roc_auc}")


ROC-AUC: 0.7287165057087936


## Results

The ROC-AUC value of 0.7287165057087936 is much better than random guessing (value of 0.5), but still way worse than the results achieved by most participants of the Kaggle competition.