In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import torch
import time

In [None]:
# One (model class, tokenizer class, pretrained checkpoint name) triple per entry.
MODELS = [
    (BertModel, BertTokenizer, 'bert-base-uncased'),
]

In [None]:
# Instantiate the encoder and its tokenizer for every configured checkpoint.
# The names are rebound on each pass, so the last entry of MODELS stays in scope.
for model_class, tokenizer_class, pretrained_weights in MODELS:
    bert_model = model_class.from_pretrained(pretrained_weights)
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

In [None]:
# Names of the target columns, one per toxicity category.
labels_list = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
def tokenize_text(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and cap its length.

    The cap is the ``bert-base-uncased`` input size minus two, which leaves
    room for two additional tokens per sequence.
    """
    token_limit = tokenizer.max_model_input_sizes['bert-base-uncased'] - 2
    return tokenizer.tokenize(text)[:token_limit]

In [None]:
from torchtext.data import BucketIterator, TabularDataset, Dataset, Field, LabelField

# Text is converted to BERT vocabulary ids during preprocessing (use_vocab=False),
# so the special tokens are supplied as ids taken from the tokenizer.
TEXT = Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_text,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=tokenizer.cls_token_id,
    eos_token=tokenizer.sep_token_id,
    pad_token=tokenizer.pad_token_id,
    unk_token=tokenizer.unk_token_id,
)

LABEL = LabelField(dtype=torch.float)

# One (column name, field) pair per CSV column: the id column is mapped to None,
# the comment goes through TEXT, and every label column shares the LABEL field.
train_fields = [("id", None), ("comment_text", TEXT)]
train_fields += [(label_name, LABEL) for label_name in labels_list]

data = TabularDataset(
    path='/content/drive/My Drive/data/train.csv',
    format='csv',
    fields=train_fields,
    skip_header=True,
)

In [None]:
# random.seed() returns None, so `random_state=random.seed(42)` handed None to
# split() and was reproducible only through the seeding side effect. Seed once
# and pass the resulting RNG state explicitly (a random.getstate() tuple).
random.seed(42)
split_state = random.getstate()

# 90/10 split into train+valid / test, then 90/10 again into train / valid.
train_data, test_data = data.split(0.9, random_state=split_state)
train_data, valid_data = train_data.split(0.9, random_state=split_state)

# The label vocabulary is built from the training portion only.
LABEL.build_vocab(train_data)

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=256,
    device=device,
    sort=False,
)

In [None]:
# Number of examples in the train, validation and test splits.
print(*(len(split) for split in (train_data, valid_data, test_data)))

129253 14361 15957


In [None]:
# small_train_data, train_data = train_data.split(0.2)
# small_train_iterator = BucketIterator(small_train_data, batch_size=256, device=device)

In [None]:
# len(small_train_data)

In [None]:
# Show the attributes of the first training example as a dict.
vars(train_data.examples[0])

{'comment_text': [7065,
  8743,
  1999,
  2189,
  1010,
  2048,
  21817,
  2015,
  4606,
  1037,
  4100,
  5524,
  19635,
  1012,
  6203,
  5980,
  2135,
  2239,
  1010,
  1045,
  5993,
  2007,
  2017,
  2008,
  2302,
  1037,
  12667,
  2057,
  2323,
  2025,
  2421,
  1037,
  2843,
  1997,
  3793,
  2006,
  2023,
  1012,
  2174,
  1010,
  2130,
  2065,
  1045,
  3685,
  2424,
  1037,
  12667,
  2157,
  2085,
  1010,
  1996,
  2755,
  3685,
  2022,
  1999,
  7593,
  2144,
  2009,
  2003,
  2004,
  9398,
  2004,
  1015,
  1009,
  1015,
  1027,
  1016,
  1010,
  2005,
  2029,
  1045,
  2052,
  2025,
  2113,
  2129,
  2000,
  2424,
  1037,
  12667,
  1012,
  2071,
  2057,
  2025,
  5993,
  2006,
  2164,
  2023,
  2028,
  6251,
  1999,
  1996,
  11621,
  29426,
  2015,
  2930,
  1010,
  5327,
  2008,
  2619,
  2097,
  2424,
  2023,
  3716,
  6179,
  1998,
  1013,
  2030,
  2097,
  2424,
  1037,
  3120,
  2005,
  2009,
  1029,
  2057,
  2071,
  22476,
  1996,
  1063,
  1063,
  2755,
  1065,


In [None]:
# https://github.com/keitakurita/

class CustomDataloader:
    """Adapt a batch iterator so that it yields plain ``(x, y)`` pairs.

    ``x`` is the name of the single input attribute read from each batch.
    ``y`` is a list of target attribute names, or ``None`` for no targets.
    """

    def __init__(self, iterator, x, y):
        self.iterator = iterator
        self.x = x
        self.y = y

    def _targets(self, batch):
        """Build the target tensor for one batch."""
        if self.y is None:
            # No targets configured: hand back a one-element zero tensor.
            return torch.zeros((1))
        # Each target attribute becomes one column of a single float tensor.
        columns = [getattr(batch, name).unsqueeze(1) for name in self.y]
        return torch.cat(columns, dim=1).float()

    def __iter__(self):
        for batch in self.iterator:
            inputs = getattr(batch, self.x)
            yield (inputs, self._targets(batch))

    def __len__(self):
        # Same number of batches as the wrapped iterator.
        return len(self.iterator)

In [None]:
# Wrap each split's iterator so it yields (comment_text, stacked labels) pairs.
train_dl, valid_dl, test_dl = (
    CustomDataloader(split_iterator, "comment_text", labels_list)
    for split_iterator in (train_iterator, valid_iterator, test_iterator)
)

# small_train_dl = CustomDataloader(small_train_iterator, "comment_text", labels_list)

In [None]:
# Pull one (inputs, targets) pair from the training loader to inspect it.
next(iter(train_dl))

(tensor([[  101, 17813,  2515,  ...,     0,     0,     0],
         [  101, 24975,  6581,  ...,     0,     0,     0],
         [  101,  1000, 26203,  ...,     0,     0,     0],
         ...,
         [  101,  1000,  1045,  ...,     0,     0,     0],
         [  101,  1045,  2333,  ...,     0,     0,     0],
         [  101, 15333,  6806,  ...,     0,     0,     0]], device='cuda:0'),
 tensor([[1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]], device='cuda:0'))

In [None]:
class linear_model(nn.Module):
    """Frozen BERT encoder followed by a small trainable classification head.

    The encoder runs under ``torch.no_grad()``. The hidden state at the first
    position of every sequence is fed through Linear -> ReLU -> Dropout -> Linear
    to produce one logit per label.

    Args:
        bert_model: encoder exposing ``config.hidden_size`` whose output has the
            token-level hidden states as its first element.
        num_labels: number of output logits.
        pad_token_id: id used to pad batches. When omitted it is read from
            ``bert_model.config.pad_token_id`` and falls back to 0 if the
            config does not define one.
    """

    def __init__(self, bert_model, num_labels, pad_token_id=None):
        super().__init__()

        embed_size = bert_model.config.hidden_size

        if pad_token_id is None:
            pad_token_id = getattr(bert_model.config, "pad_token_id", None)
        self.pad_token_id = 0 if pad_token_id is None else pad_token_id

        self.bert = bert_model

        self.pre_classifier = nn.Linear(embed_size, embed_size)

        self.dropout = nn.Dropout(0.2)

        self.classifier = nn.Linear(embed_size, num_labels)

    def forward(self, x):
        """Return logits of shape (batch_size, num_labels) for token ids ``x``.

        ``x`` is a (batch_size, seq_len) tensor of token ids, padded with
        ``self.pad_token_id``.
        """
        # Mask out padding so padded and unpadded versions of the same text
        # are encoded the same way.
        attention_mask = (x != self.pad_token_id).long()

        # The encoder is a fixed feature extractor: no gradients are tracked.
        with torch.no_grad():
            # (batch_size, seq_len, hidden_size)
            hidden = self.bert(x, attention_mask=attention_mask)[0]

        # Hidden state of the first token: (batch_size, hidden_size)
        hidden = hidden[:, 0]

        # (batch_size, hidden_size)
        pooled_output = torch.relu(self.pre_classifier(hidden))
        pooled_output = self.dropout(pooled_output)

        # (batch_size, num_labels)
        logits = self.classifier(pooled_output)

        return logits


In [None]:
# Classification head on top of the pretrained encoder, one logit per label.
model = linear_model(bert_model=bert_model, num_labels=len(labels_list))

In [None]:
# Freeze the encoder: only parameters outside model.bert stay trainable.
for bert_param in model.bert.parameters():
    bert_param.requires_grad = False

In [None]:
import torch.optim as optim

# Adam with its default hyper-parameters over the model's parameters.
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy on raw logits, applied to every label independently.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Place the loss module and the model on the selected device.
criterion = criterion.to(device)
model = model.to(device)

In [None]:
def count_parameters(model):
    """Return the total number of elements in parameters that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report how many parameters will actually be updated during training.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 595,206 trainable parameters


In [None]:
def multilabel_acc(pred, y, threshold=0.5):
    """Exact-match accuracy for a batch of multi-label predictions.

    Each logit is passed through a sigmoid and thresholded independently, and a
    row counts as correct only when every label matches its target. Comparing
    argmax indices is not meaningful here: several labels (or none) can be
    active at once, and an all-zero target row always has argmax 0.

    Args:
        pred: tensor of logits, one row per sample and one column per label.
        y: tensor of 0/1 targets with the same shape as ``pred``.
        threshold: probability at or above which a label is predicted active.

    Returns:
        numpy float scalar: the fraction of rows predicted entirely correctly.
    """
    probs = pred.detach().sigmoid().cpu().numpy()
    targets = y.detach().cpu().numpy()

    predicted = probs >= threshold
    actual = targets >= 0.5

    return np.mean(np.all(predicted == actual, axis=1))

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Every batch is a ``(x, y)`` pair. The model is put in training mode and
    the optimizer takes one step per batch.

    Returns:
        (mean batch loss, mean batch accuracy) over the whole pass.
    """
    model.train()

    loss_total, acc_total = 0, 0

    for inputs, targets in iterator:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_acc = multilabel_acc(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on every ``(x, y)`` batch of ``iterator`` without training.

    The model is switched to evaluation mode and gradients are disabled.

    Returns:
        (mean batch loss, mean batch accuracy) over the whole pass.
    """
    model.eval()

    loss_total, acc_total = 0, 0

    with torch.no_grad():
        for inputs, targets in iterator:
            logits = model(inputs)

            loss_total += criterion(logits, targets).item()
            acc_total += multilabel_acc(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
epochs = 5

# Lowest validation loss seen so far. It must be initialised once, before the
# loop: resetting it every epoch makes the comparison below always succeed, so
# the checkpoint would simply hold the last epoch instead of the best one.
best_loss = float("inf")

for epoch in range(epochs):

    start_epoch = time.time()

    train_loss, train_acc = train(model, train_dl, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dl, criterion)

    end_epoch = time.time()

    # Checkpoint only when validation loss improves.
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), "/content/drive/My Drive/data/toxic_model.pt")

    print(f"Epoch: {epoch+1:02} | Time: {int((end_epoch - start_epoch) / 60)}m")
    print(f"Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%")
    print(f"Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc * 100:.2f}%")

Epoch: 01 | Time: 40m
Train Loss: 0.134 | Train Acc: 93.22%
Valid Loss: 0.114 | Valid Acc: 99.42%
Epoch: 02 | Time: 40m
Train Loss: 0.124 | Train Acc: 99.26%
Valid Loss: 0.119 | Valid Acc: 99.42%
Epoch: 03 | Time: 40m
Train Loss: 0.123 | Train Acc: 99.30%
Valid Loss: 0.117 | Valid Acc: 99.42%
Epoch: 04 | Time: 40m
Train Loss: 0.122 | Train Acc: 99.26%
Valid Loss: 0.109 | Valid Acc: 98.86%
Epoch: 05 | Time: 40m
Train Loss: 0.122 | Train Acc: 99.19%
Valid Loss: 0.110 | Valid Acc: 99.42%


In [None]:
# Restore the checkpoint written by the training loop. map_location remaps the
# saved tensors onto the current device, so a checkpoint saved on a GPU also
# loads when this session is running on the CPU.
state_dict = torch.load("/content/drive/My Drive/data/toxic_model.pt", map_location=device)
model.load_state_dict(state_dict)

# Final score on the held-out test split.
test_loss, test_acc = evaluate(model, test_dl, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.106 | Test Acc: 99.46%


In [None]:
def predict(model, tokenizer, text, max_length=512):
    """Score one piece of text against every label in ``labels_list``.

    Args:
        model: classifier mapping a (1, seq_len) tensor of token ids to logits.
        tokenizer: tokenizer whose ``encode`` returns a list of token ids.
        text: the raw string to classify.
        max_length: maximum number of token ids fed to the model, special
            tokens included. Longer inputs are cut, keeping the final token.
            The default of 512 is intended to match the bert-base-uncased
            input limit.

    Returns:
        A list of ``(label, probability)`` tuples sorted by descending
        probability.
    """
    model.eval()

    token_ids = tokenizer.encode(text, add_special_tokens=True)
    if len(token_ids) > max_length:
        # Keep the closing special token so the sequence still ends the way
        # the tokenizer produced it.
        token_ids = token_ids[:max_length - 1] + token_ids[-1:]

    # Add the batch dimension: (seq_len,) -> (1, seq_len)
    tok_tensor = torch.tensor(token_ids).to(device).unsqueeze(0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pred = torch.sigmoid(model(tok_tensor))
    pred = pred.cpu().numpy()

    result_df = pd.DataFrame(pred, columns=labels_list)
    # "records" is the full orient name; the "record" abbreviation is
    # deprecated and rejected by newer pandas releases.
    scores = result_df.to_dict(orient="records")[0]

    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# Try the trained model on a short sample sentence.
predict(model, tokenizer, "i love you")

[('toxic', 0.12670782208442688),
 ('insult', 0.059123363345861435),
 ('obscene', 0.03964898735284805),
 ('threat', 0.01795859821140766),
 ('severe_toxic', 0.009807408787310123),
 ('identity_hate', 0.003980768844485283)]