Ensemble Methods

In [1]:
# Install tensorflow-text pinned to the 2.8 series (-q: quiet output, -U: upgrade an existing install).
!pip install -q -U "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 6.1 MB/s 
[?25h

In [2]:
# Install tf-models-official pinned to 2.7.0; it provides the `official.nlp` package imported below.
!pip install -q tf-models-official==2.7.0

[K     |████████████████████████████████| 1.8 MB 8.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 53.3 MB/s 
[K     |████████████████████████████████| 596 kB 70.8 MB/s 
[K     |████████████████████████████████| 92 kB 12.7 MB/s 
[K     |████████████████████████████████| 352 kB 37.5 MB/s 
[K     |████████████████████████████████| 48.3 MB 77 kB/s 
[K     |████████████████████████████████| 237 kB 53.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 52.2 MB/s 
[K     |████████████████████████████████| 43 kB 702 kB/s 
[K     |████████████████████████████████| 99 kB 9.4 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

Mount Google Drive

In [4]:
# Mount Google Drive at /content/drive; every model and dataset path used in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load BERT model

In [5]:
# Load the BERT baseline SavedModel from Drive.
# NOTE(review): experimental_io_device='/job:localhost' presumably forces the SavedModel
# files to be read from the local host's filesystem (commonly needed on accelerator
# runtimes) — confirm against the tf.saved_model.LoadOptions documentation.
saved_model_path = "/content/drive/MyDrive/Toxics/bert_baseline_v2"
load_options = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
bert_model = tf.saved_model.load(saved_model_path, options=load_options)

Test BERT model

In [None]:
def print_my_examples(inputs, results):
  """Print one line per example with its full score vector and its index-4 score.

  The previous version formatted only ``inputs[0]`` / ``results[0]`` and silently
  ignored any further examples; every example is now printed. Output for a
  single-example call is unchanged.

  Args:
    inputs: sequence of input strings.
    results: sequence (or tensor) indexable as ``results[i][4]``, holding one
      row of scores per input.
  """
  # Index 4 would be "insult" if the model's outputs follow the `list_classes`
  # order used in the evaluation cell — TODO confirm.
  result_for_printing = [
      f'input: {inputs[i]:<30} : score: {results[i]} {results[i][4]:.6f}'
      for i in range(len(inputs))
  ]
  print(*result_for_printing, sep='\n')
  print()


examples = ["""stop editing my edits you are stupid"""]

# Pass the model's raw outputs through a sigmoid so each of the six values lies in (0, 1).
reloaded_results = tf.sigmoid(bert_model(tf.constant(examples)))

# Build the 0/1 flags for the first (only) example: 1 where the score exceeds 0.5.
result = []
for i in range(6):
  result.append(1 if reloaded_results[0][i] > 0.5 else 0)

print('Results from the saved model:')
print_my_examples(examples, reloaded_results)
print(result)

Results from the saved model:
input: stop editing my edits you are stupid : score: [9.8825884e-01 9.9684624e-03 6.7033315e-01 1.8299686e-03 9.2444152e-01
 7.5031433e-04] 0.924442

[1, 0, 1, 0, 1, 0]


Load T5 model

In [6]:
!pip install transformers
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.7.0 tokenizers-0.12.1 transformers-4.19.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-1.6.4-py3-none-any.whl (585 kB)
[K     |████████████████████████████████| 58

In [7]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
class T5FineTuner(pl.LightningModule):
  """LightningModule wrapping a Hugging Face T5 model and its tokenizer.

  `hp` is a namespace of hyper-parameters. This class reads
  `model_name_or_path`, `tokenizer_name_or_path`, `weight_decay`,
  `learning_rate`, `adam_epsilon`, `train_batch_size`, `eval_batch_size`,
  `n_gpu`, `gradient_accumulation_steps`, `num_train_epochs` and
  `warmup_steps` from it.

  The data loaders rely on `get_dataset`, `train_df` and `val_df` being
  defined at module level; they are only needed for training, not for
  loading a checkpoint and running inference.
  """

  def __init__(self, hp):
    super().__init__()
    self.hp = hp

    self.model = T5ForConditionalGeneration.from_pretrained(hp.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hp.tokenizer_name_or_path)

  def is_logger(self):
    """Return True when this process has global rank 0 (or lower)."""
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    """Delegate to the wrapped T5 model and return its output unchanged."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Run one forward pass on `batch` and return the loss.

    `batch` must provide "source_ids", "source_mask", "target_ids" and
    "target_mask" tensors.
    """
    # Use the module's own device rather than a global `device` variable,
    # which is not defined in this notebook and raised NameError.
    device = self.device

    # Replace padding positions in the targets with -100 (the label value the
    # Hugging Face loss is documented to ignore). masked_fill returns a new
    # tensor, so the caller's batch is never modified in place.
    labels = batch["target_ids"].to(device)
    labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

    outputs = self(
        input_ids=batch["source_ids"].to(device),
        attention_mask=batch["source_mask"].to(device),
        labels=labels,
        decoder_attention_mask=batch['target_mask'].to(device)
    )

    # First element of the model output is taken as the loss.
    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    """Compute the training loss for one batch."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def training_epoch_end(self, outputs):
    """Average the per-step training losses; intentionally returns None."""
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    # Kept for parity with the validation hook; the value is currently not
    # returned or logged anywhere.
    tensorboard_logs = {"avg_train_loss": avg_train_loss}

  def validation_step(self, batch, batch_idx):
    """Compute the validation loss for one batch."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the per-step validation losses and return them in a dict."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Prepare the AdamW optimizer; biases and LayerNorm weights get no weight decay."""

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hp.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hp.learning_rate, eps=self.hp.adam_epsilon)
    # Stored so train_dataloader() can attach a scheduler to it.
    self.opt = optimizer
    return [optimizer]

  # NOTE(review): the scheduler built in train_dataloader() is stored on
  # self.lr_scheduler, but nothing in this class calls its step(); a custom
  # optimizer_step override that did so was previously commented out here.

  def get_tqdm_dict(self):
    """Return the running loss and the last learning rate for the progress bar."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the training DataLoader and create the linear warmup/decay scheduler.

    Requires `self.opt`, i.e. configure_optimizers() must have run first.
    """
    train_dataset = get_dataset(self.tokenizer, train_df, self.hp)
    dataloader = DataLoader(train_dataset, batch_size=self.hp.train_batch_size, drop_last=True, shuffle=True, num_workers=2)
    # Total optimizer steps: batches per epoch, divided by the gradient
    # accumulation factor, times the number of epochs.
    t_total = (
        (len(dataloader.dataset) // (self.hp.train_batch_size * max(1, self.hp.n_gpu)))
        // self.hp.gradient_accumulation_steps
        * float(self.hp.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hp.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the validation DataLoader (no shuffling, no dropped batches)."""
    val_dataset = get_dataset(self.tokenizer, val_df, self.hp)
    return DataLoader(val_dataset, batch_size=self.hp.eval_batch_size, num_workers=2)

In [9]:
# Location of the fine-tuned T5 classifier's state dict on Drive.
PATH = "/content/drive/MyDrive/Toxics/T5/classifier_v2.pt"
# Hyper-parameters expected by T5FineTuner. For inference only the model and
# tokenizer names are read by the constructor.
args_dict = dict(
    data_dir="", # path for data files
    output_dir="", # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=2,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)
args = argparse.Namespace(**args_dict)
T5_model = T5FineTuner(args)
# map_location="cpu" lets the checkpoint load on a runtime without a GPU even if
# it was saved from CUDA tensors; the inference cells feed CPU tensors anyway.
T5_model.load_state_dict(torch.load(PATH, map_location="cpu"))

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


<All keys matched successfully>

Testing T5

In [10]:
# Stand-alone 't5-small' tokenizer used by the inference cells below
# (T5_model also holds one as T5_model.tokenizer, loaded from the same name).
tokenizer = T5Tokenizer.from_pretrained('t5-small')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
# Tokenize one sentence, let T5 generate its label text, and decode it back to strings.
# The encoding is named `encoded` so the notebook no longer shadows the builtin input().
encoded = tokenizer("""Stop editing my edits you are not so smart you know""", return_tensors="pt")
outputs = T5_model.model.generate(input_ids = encoded.input_ids, attention_mask = encoded.attention_mask)
dec = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces = True) for ids in outputs]
print(dec)

['none']


Load BERT with a decoder layer

In [12]:
# Load the second BERT SavedModel ("bert_2decoders") from Drive.
# Note: this rebinds `saved_model_path` and `load_options` from the earlier BERT cell.
saved_model_path = "/content/drive/MyDrive/Toxics/bert_2decoders"
load_options = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
bert_decoder_model = tf.saved_model.load(saved_model_path, options=load_options)

Test BERT with a decoder layer

In [None]:
def print_my_examples(inputs, results):
  """Print one line per example showing its score vector.

  This redefines the `print_my_examples` from the earlier BERT test cell. The
  previous version formatted only ``inputs[0]`` / ``results[0]`` and ignored
  any further examples; every example is now printed. The per-line format
  (score vector shown twice) is kept so single-example output is unchanged.

  Args:
    inputs: sequence of input strings.
    results: sequence (or tensor) holding one row of scores per input.
  """
  result_for_printing = [
      f'input: {inputs[i]:<30} : score: {results[i]} {results[i]}'
      for i in range(len(inputs))
  ]
  print(*result_for_printing, sep='\n')
  print()


examples = ["""if you ever try this at home, you might die"""]

# Pass the model's raw outputs through a sigmoid so each of the six values lies in (0, 1).
reloaded_results = tf.sigmoid(bert_decoder_model(tf.constant(examples)))

# Build the 0/1 flags for the first (only) example: 1 where the score exceeds 0.5.
result = []
for i in range(6):
  result.append(1 if reloaded_results[0][i] > 0.5 else 0)

print('Results from the saved model:')
print_my_examples(examples, reloaded_results)
print(result)

Results from the saved model:
input: if you ever try this at home, you might die : score: [0.94136035 0.0417825  0.06231308 0.7081389  0.03272599 0.00340896] [0.94136035 0.0417825  0.06231308 0.7081389  0.03272599 0.00340896]

[1, 0, 0, 1, 0, 0]


Now, let's write a function that combines them

In [18]:
def _threshold_scores(scores, threshold):
  """Return six 0/1 flags: 1 where scores[i] > threshold, else 0."""
  return [1 if scores[i] > threshold else 0 for i in range(6)]


def ensemble_inference(example, threshold=0.5):
  """Combine BERT, BERT+decoder and T5 predictions for one comment.

  Args:
    example: the comment text (a single string).
    threshold: sigmoid score above which a BERT-style model counts a label as
      predicted. Defaults to 0.5, the value that was previously hard-coded.

  Returns:
    A list of six 0/1 flags. Positions 0, 1 and 5 are copied from the BERT
    baseline alone; positions 2, 3 and 4 are set when at least two of the
    three models predict the label.
  """
  batch = tf.constant([example])

  # BERT baseline and BERT+decoder: sigmoid over the raw outputs, then threshold.
  bert_results = _threshold_scores(tf.sigmoid(bert_model(batch))[0], threshold)
  bert_decoder_results = _threshold_scores(
      tf.sigmoid(bert_decoder_model(batch))[0], threshold)

  # T5: generate text, then flag each label whose keyword appears in the output.
  encoded = tokenizer(example, return_tensors="pt")
  generated = T5_model.model.generate(
      input_ids=encoded.input_ids, attention_mask=encoded.attention_mask)
  decoded = [
      tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for ids in generated
  ]
  # Keywords are matched as substrings. The evaluation columns are
  # ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"].
  labels = ['toxic', 'severe', 'obscene', 'threat', 'insult', 'identity']
  T5_results = [
      1 if any(label in line for line in decoded) else 0
      for label in labels
  ]

  # Positions 0, 1 and 5 trust the BERT baseline; 2, 3 and 4 use a majority vote.
  results = [bert_results[0], bert_results[1], 0, 0, 0, bert_results[5]]
  for i in (2, 3, 4):
    if bert_results[i] + bert_decoder_results[i] + T5_results[i] > 1:
      results[i] = 1
  return results



Evaluation

Load the testing dataset

In [19]:
# pandas is already installed and imported by an earlier cell, so the redundant,
# unpinned `!pip install pandas` was dropped; the import is kept so this cell
# also runs on its own.
import pandas as pd

# Number of leading rows of the test file used for evaluation.
N_TEST_ROWS = 7978

infile = r'/content/drive/MyDrive/Toxics/Dataset/test.csv'
test = pd.read_csv(infile, nrows=N_TEST_ROWS)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Test it out

In [20]:
from sklearn import metrics

# Missing comments are replaced by a placeholder string so every row can be scored.
test_sentences = test["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
test_y = test[list_classes].values

# One ensemble prediction (six 0/1 flags) per comment.
outputs = [ensemble_inference(sentence) for sentence in test_sentences]

# The accuracy used to be computed here and then discarded; print it so the cell
# reports it. For multilabel targets this is exact-match (subset) accuracy.
print(metrics.accuracy_score(test_y, outputs))
print(metrics.classification_report(test_y, outputs))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79       777
           1       0.57      0.25      0.34        65
           2       0.80      0.86      0.83       404
           3       0.77      0.34      0.48        29
           4       0.71      0.74      0.73       378
           5       0.67      0.41      0.50        74

   micro avg       0.76      0.76      0.76      1727
   macro avg       0.72      0.57      0.61      1727
weighted avg       0.75      0.76      0.75      1727
 samples avg       0.07      0.07      0.07      1727



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Accuracy of whatever predictions are currently bound to `outputs`.
current_accuracy = metrics.accuracy_score(test_y, outputs)
print(current_accuracy)


0.8990975181749812


What if we just predict all zeros?

In [17]:
# Baseline: predict "no label" for every comment and score it the same way.
# Note that this rebinds `outputs`, replacing the ensemble predictions.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
test_sentences = test["comment_text"].fillna("CVxTz").values
test_y = test[list_classes].values

outputs = [[0, 0, 0, 0, 0, 0] for _ in range(len(test_sentences))]

print(metrics.accuracy_score(test_y, outputs))
print(metrics.classification_report(test_y, outputs))

0.8978440711957885
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       777
           1       0.00      0.00      0.00        65
           2       0.00      0.00      0.00       404
           3       0.00      0.00      0.00        29
           4       0.00      0.00      0.00       378
           5       0.00      0.00      0.00        74

   micro avg       0.00      0.00      0.00      1727
   macro avg       0.00      0.00      0.00      1727
weighted avg       0.00      0.00      0.00      1727
 samples avg       0.00      0.00      0.00      1727



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
