In [1]:
!nvidia-smi

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
# import pytorch_lightning as pl
# from pytorch_lightning.metrics.functional.classification import auroc
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("train_preprocessed.csv")

###**Looking at our Data**

In [4]:
df.head()

Unnamed: 0,comment_text,id,identity_hate,insult,obscene,set,severe_toxic,threat,toxic,toxicity
0,explanation why the edits made under my userna...,0000997932d777bf,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
1,d aww he matches this background colour i m s...,000103f0d9cfb60f,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
2,hey man i m really not trying to edit war it...,000113f07ec002fd,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
3,more i can t make any real suggestions on im...,0001b41b1c6bb37e,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
4,you sir are my hero any chance you remember...,0001d958c54c6e35,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   comment_text   159571 non-null  object 
 1   id             159571 non-null  object 
 2   identity_hate  159571 non-null  float64
 3   insult         159571 non-null  float64
 4   obscene        159571 non-null  float64
 5   set            159571 non-null  object 
 6   severe_toxic   159571 non-null  float64
 7   threat         159571 non-null  float64
 8   toxic          159571 non-null  float64
 9   toxicity       159571 non-null  float64
dtypes: float64(7), object(3)
memory usage: 12.2+ MB


In [6]:
df.shape

(159571, 10)

In [7]:
df.isnull().sum()

comment_text     0
id               0
identity_hate    0
insult           0
obscene          0
set              0
severe_toxic     0
threat           0
toxic            0
toxicity         0
dtype: int64

In [8]:
train_df, test_df = train_test_split(df, test_size=0.1)
train_df.shape, test_df.shape

((143613, 10), (15958, 10))

In [9]:
CLASSES = df.columns.to_list()[2:]
CLASSES

['identity_hate',
 'insult',
 'obscene',
 'set',
 'severe_toxic',
 'threat',
 'toxic',
 'toxicity']

In [10]:
df[CLASSES].sum().sort_values().plot(kind="barh")

TypeError: '>' not supported between instances of 'numpy.ndarray' and 'str'

In [None]:
df[CLASSES].sum(), df.shape

(identity_hate                                               1405.0
 insult                                                      7877.0
 obscene                                                     8449.0
 set              traintraintraintraintraintraintraintraintraint...
 severe_toxic                                                1595.0
 threat                                                       478.0
 toxic                                                      15294.0
 toxicity                                                   35098.0
 dtype: object,
 (159571, 10))

In [None]:
# If we take a look closely at our dataset, all the labels comments are approximately 15k and clean are 10 times that.
toxic_df = df[df[CLASSES].sum(axis=1) > 0]
toxic_df.shape

In [None]:
clean_df = df[df[CLASSES].sum(axis=1) == 0]
clean_df.shape

In [None]:
clean_df = clean_df.sample(16_000)
clean_df.shape

(16000, 8)

In [None]:
train_df = pd.concat([toxic_df, clean_df])
train_df.shape

NameError: name 'toxic_df' is not defined

In [None]:
train_df[CLASSES].sum()

identity_hate                                               1283.0
insult                                                      7097.0
obscene                                                     7603.0
set              traintraintraintraintraintraintraintraintraint...
severe_toxic                                                1436.0
threat                                                       439.0
toxic                                                      13727.0
toxicity                                                   31585.0
dtype: object

In [None]:
BERT_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

###**Dataset Preparation**

In [None]:
class ToxicCommentsDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: BertTokenizer,
        max_len: int = 128
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index: int):
        single_row = self.data.iloc[index]
        
        comment = single_row.comment_text
        labels = single_row[CLASSES]

        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            "comment_text": comment,
            "input_ids": encoding["input_ids"].flatten(), # [1,512] => [512]
            "attention_mask": encoding["attention_mask"].flatten(), # [1,512] => [512]
            "labels": torch.FloatTensor(labels)
        }

In [None]:
train_dataset = ToxicCommentsDataset(train_df, tokenizer)

In [None]:
sample_data = train_dataset[0]

TypeError: must be real number, not str

In [None]:
print(sample_data["comment_text"])
print()
print(sample_data["input_ids"])
print()
print(sample_data["attention_mask"])
print()
print(sample_data["labels"])

NameError: name 'sample_data' is not defined

In [None]:
class ToxicCommentDataModule(pl.LightningDataModule):
    def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_len=128):
        super().__init__()
        
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_len = max_len
    
    def setup(self):
        self.train_dataset = ToxicCommentsDataset(self.train_df, self.tokenizer, self.max_len)
        self.test_dataset = ToxicCommentsDataset(self.test_df, self.tokenizer, self.max_len)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )    
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )

NameError: name 'pl' is not defined

In [None]:
EPOCHS = 10
BATCH_SIZE = 32

In [None]:
data_module = ToxicCommentDataModule(
    train_df, 
    test_df, 
    tokenizer, 
    batch_size=BATCH_SIZE
)
data_module.setup()

###**MODEL BUILDING**

In [None]:
class ToxicCommentClassifier(pl.LightningModule):
    def __init__(self, n_classes: int, steps_per_epoch=None, n_epochs=None):
        super().__init__()

        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {
            "loss": loss,
            "predictions": outputs,
            "labels": labels
        }

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)

            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)

        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)

        for i, name in enumerate(CLASSES):
            roc_score = auroc(predictions[:, i], labels[:, i])
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", roc_score, self.current_epoch)
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        warmup_steps = self.steps_per_epoch // 3
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
        scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]

In [None]:
model = ToxicCommentClassifier(
    n_classes=len(CLASSES), 
    steps_per_epoch=len(train_df)//BATCH_SIZE, 
    n_epochs=EPOCHS
)

In [None]:
trainer = pl.Trainer(max_epochs=EPOCHS, gpus=1, progress_bar_refresh_rate=30)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 108 M 
1 | classifier | Linear    | 4.6 K 
2 | criterion  | BCELoss   | 0     
-----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.260   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  cpuset_checked))




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f555b24b7a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1297, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.7/multiprocessing/process.py", line 140, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.7/multiprocessing/popen_fork.py", line 45, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 921, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.7/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 


In [None]:
trainer.test()

In [None]:
trainer.save_checkpoint("last-checkpoint.ckpt")

###**Predictions**

In [None]:
trained_model = ToxicCommentClassifier.load_from_checkpoint("last-checkpoint.ckpt", n_classes=len(CLASSES))
trained_model.freeze()

In [None]:
test_example = "I dont like you, I hate your texts those are really bullshit!"

In [None]:
encoding = tokenizer.encode_plus(
    test_example,
    add_special_tokens=True,
    max_length=128,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt"
)

In [None]:
model.eval()
_, preds = model(encoding["input_ids"], encoding["attention_mask"])
preds = preds.flatten().detach().numpy()

In [None]:
predictions = []
for idx, label in enumerate(CLASSES):
    if preds[idx] > 0.5:
        predictions.append((label, round(preds[idx]*100, 2)))

predictions

[('toxic', 85.94), ('obscene', 53.17)]