# NLP - Toxic Comments Classifier (Full Code)

Team Number: 6
<br/>
Team Members: Dina Boshnaq, Iris Loret, Ingrid Hansen

### Installing packages and dependancies

In [None]:
pip install transformers[torch] accelerate==0.20.1

In [4]:
# Double check that the accelerate is version 0.20.1 and not 0.25.1
pip show accelerate

Name: accelerate
Version: 0.20.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.10/site-packages
Requires: numpy, packaging, psutil, pyyaml, torch
Required-by: catalyst
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install evaluate

In [None]:
pip install --upgrade datasets

### Import libraries

In [7]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset,DatasetDict
from transformers import DataCollatorWithPadding
import evaluate
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import pickle
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline



### Setting PyTorch CUDA Allocation Configuration for Memory Management

In [8]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

### Loading data and making dataframe & dataset

In [9]:
data = pd.read_csv("/kaggle/input/dataset-dina/train.csv")

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [11]:
data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [12]:
# Make a cope of the dataframe
df_toxic = data.copy()
# Make one column is_toxic for single label classification, and drop the rest
df_toxic['is_toxic'] = df_toxic[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].apply(lambda row: any(row), axis=1).astype(int)
df_toxic = df_toxic.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1)

In [13]:
df_toxic

Unnamed: 0,id,comment_text,is_toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0
...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0


In [None]:
df_toxic = df_toxic.drop('id', axis=1)

In [15]:
df_toxic.head(5)

Unnamed: 0,comment_text,is_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [16]:
df_toxic['is_toxic'].value_counts()

is_toxic
0    143346
1     16225
Name: count, dtype: int64

In [17]:
# Balance the dataset and undersample
df_toxic_balanced = pd.concat([
    df_toxic[df_toxic['is_toxic'] == 0].sample(2000, random_state=42),
    df_toxic[df_toxic['is_toxic'] == 1].sample(2000, random_state=42)
])

df_toxic_balanced = df_toxic_balanced.sample(frac=1, random_state=42)

print(df_toxic_balanced['is_toxic'].value_counts())

is_toxic
0    2000
1    2000
Name: count, dtype: int64


In [18]:
df_toxic_balanced.dtypes

comment_text    object
is_toxic         int64
dtype: object

In [22]:
# Rename the column is_toxic to label so it can be identified by the model (according to documentation)
df_toxic_balanced = df_toxic_balanced.rename(columns={'is_toxic': 'label'})

In [23]:
df_toxic_balanced.dtypes

comment_text    object
label            int64
dtype: object

In [27]:
# Map the 0 and 1 values in label to Not Toxic and Toxic
df_toxic_balanced['label'] = df_toxic_balanced['label'].map({0: 'Not Toxic', 1: 'Toxic'})

In [28]:
print(df_toxic_balanced.dtypes)
df_toxic_balanced.head(5)

comment_text    object
label           object
dtype: object


Unnamed: 0,comment_text,label
49475,No offence taken. I actually found your commen...,Not Toxic
4095,"I swear, you're extremely stupid and oblivious...",Toxic
102839,Do not dump identical text at multiple article...,Not Toxic
102223,"""\n\nI don't edit Wikipedia as I am no longer ...",Toxic
121720,THE ACLU IS ON YOUR ASS! \n\nThis latino will ...,Toxic


In [29]:
# Encoding the labels for classification
id2label = {0: "Not Toxic", 1: "Toxic"}
label2id = {"Not Toxic": 0, "Toxic": 1}
df_toxic_balanced["label"] = df_toxic_balanced["label"].apply(lambda x: label2id[x])

In [30]:
# Making the Hugging Face dataset
ds = Dataset.from_pandas(df_toxic_balanced)

  if _pandas_api.is_sparse(col):


In [34]:
ds

Dataset({
    features: ['comment_text', 'label', '__index_level_0__'],
    num_rows: 4000
})

### Tokenizing the data

In [35]:
# Pre-trained model from Hugging Face
pretrained_model = 'distilbert-base-cased'

In [36]:
# Initializing tokenizer from pretrained model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast =True)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [37]:
# Creating a function to map the tokenizer to our dataset (specifically the column comment_text)
def preprocess(x):
    return tokenizer(x["comment_text"], truncation=True, max_length=128)

In [38]:
# Applying the function 'preprocess' on our dataset
tok_ds = ds.map(preprocess, batched=True)
tok_ds

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Dataset({
    features: ['comment_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4000
})

In [50]:
# Creating data collator with padding using tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length')

### Create Train and Test set

In [39]:
# Specifying the split: %70 train and %30 test
dataset = tok_ds.train_test_split(test_size=0.3)

In [40]:
dataset

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['comment_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1200
    })
})

In [41]:
tok_train_dataset = dataset["train"]
tok_test_dataset = dataset["test"]

### Creating an Evaluation Metric

In [42]:
# Loading accuracy metric for evaluation
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [43]:
# Evaluation function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

### Making the model

In [47]:
# Download the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=2, id2label=id2label, label2id=label2id, output_attentions=True)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight

In [48]:
# Defining the training arguements
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    fp16=True,
    num_train_epochs=2,
    weight_decay=0.01,
)

In [51]:
# Training the model on our data with our specific training arguements
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train_dataset,
    eval_dataset=tok_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=350, training_loss=0.2803288922991071, metrics={'train_runtime': 150.4657, 'train_samples_per_second': 37.218, 'train_steps_per_second': 2.326, 'total_flos': 741817432473600.0, 'train_loss': 0.2803288922991071, 'epoch': 2.0})

In [52]:
# Saving the trained model along with other training-related information
trainer.save_model("comments_model")

In [53]:
# Saving the raw model object as is, without any additional information related to training
# We will use this for prediction
model_save_path = '/kaggle/working/comments_model.pkl'

with open(model_save_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [54]:
# Loading the raw model
model_pickle_path = '/kaggle/working/comments_model.pkl'

with open(model_pickle_path, 'rb') as model_file:
    model = pickle.load(model_file)

### Testing the model

In [62]:
# Load the pre-trained tokenizer and model
model_path = "/kaggle/working/comments_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
 
# Create a text classification pipeline using the loaded model and tokenizer
pipeline =  TextClassificationPipeline(model=model, tokenizer=tokenizer)

# Make predictions on sample texts and print the results
print(pipeline("You are beautiful"))
print(pipeline("You are ugly"))

[{'label': 'Not Toxic', 'score': 0.7726908326148987}]
[{'label': 'Toxic', 'score': 0.9823653101921082}]


In [63]:
# For extra insight, we look at the json file
import json

config_path = "/kaggle/working/comments_model/config.json"

with open(config_path, 'r') as config_file:
    config = json.load(config_file)

config


{'_name_or_path': 'distilbert-base-cased',
 'activation': 'gelu',
 'architectures': ['DistilBertForSequenceClassification'],
 'attention_dropout': 0.1,
 'dim': 768,
 'dropout': 0.1,
 'hidden_dim': 3072,
 'id2label': {'0': 'Not Toxic', '1': 'Toxic'},
 'initializer_range': 0.02,
 'label2id': {'Not Toxic': 0, 'Toxic': 1},
 'max_position_embeddings': 512,
 'model_type': 'distilbert',
 'n_heads': 12,
 'n_layers': 6,
 'output_attentions': True,
 'output_past': True,
 'pad_token_id': 0,
 'problem_type': 'single_label_classification',
 'qa_dropout': 0.1,
 'seq_classif_dropout': 0.2,
 'sinusoidal_pos_embds': False,
 'tie_weights_': True,
 'torch_dtype': 'float32',
 'transformers_version': '4.29.2',
 'vocab_size': 28996}