In [1]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# A Simple RNN Pipeline for Comment Toxicity Classification
> May 25, 2019

> ref: https://www.kaggle.com/bminixhofer/simple-lstm-pytorch-version

In [2]:
def seed_everything(seed=1234): # for reproducibility
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [3]:
# Global params
GLOVE_EMBEDDING_PATH = '/Users/elenabg/Documents/6Q/AML/Project/glove.840B.300d.txt'
NUM_MODELS = 2
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220
max_features = None

In [4]:
# Helper Funcs for word embeddings

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

# Model Definition: LSTM

In [5]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x
    
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNet, self).__init__()
        embed_size = embedding_matrix.shape[1]
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)
    
        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        
        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        
        return out

# Model Definition: GRU

In [6]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x
    
class NeuralNetGRU(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNetGRU, self).__init__()
        embed_size = embedding_matrix.shape[1]
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.gru = nn.GRU(embed_size, LSTM_UNITS)
        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        
        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_gru, _ = self.gru(h_embedding)
        
        # global average pooling
        avg_pool = torch.mean(h_gru, 1)
        # global max pooling
        max_pool, _ = torch.max(h_gru, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        
        return out

# Train Model

In [7]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    for epoch in range(n_epochs):
        start_time = time.time()
        
        scheduler.step()
        
        model.train()
        avg_loss = 0.
        
        for data in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]

            y_pred = model(*x_batch)            
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            
            
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
            
        model.eval()
        test_preds = np.zeros((len(test), output_dim))
    
        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)    
    else:
        test_preds = all_test_preds[-1]
        
    return test_preds

# Text Preprocessing Pipeline

In [8]:
sys.path.insert(0, '/Users/elenabg/')
import text_cleaner as pre

def preprocess(data):
    '''
    Cleans comment text by:
    1) removing selected punctuation marks, 
    2) homogenezing contractions,
    3) homogenezing selected proper names,
    4) correcting selected misspellings
    '''

    data = data.astype(str).apply(lambda x: pre.clean_special_chars(x))
    data = data.astype(str).apply(lambda x: pre.clean_contractions_and_spelling(x))
    return data

# Load and Subset Datasets

In [9]:
FRAC = 0.02
tokenizer = text.Tokenizer()
max_features = None #327576
TRAIN_PATH='/Users/elenabg/DetoxiPy/train.csv'
TEST_PATH='/Users/elenabg/DetoxiPy/test.csv'

def build_datasets(tokenizer, frac=FRAC, train_frac=0.7, train_path=TRAIN_PATH,
                   test_path=TEST_PATH):
    
    df = pd.read_csv(train_path).sample(frac=frac)
    
    # divide into train and test
    msk = np.random.rand(len(df)) < train_frac
    train = df[msk]
    test = df[~msk]
    #test = pd.read_csv(test_path).sample(frac=frac)

    x_train = preprocess(train['comment_text']) # our own pre-processing pipeline goes here
    y_train = np.where(train['target'] >= 0.5, 1, 0)
    y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
    x_test = preprocess(test['comment_text']) # same
    tokenizer.fit_on_texts(list(x_train) + list(x_test))
    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)
    x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
    x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)
    return x_train, x_test, y_train, y_aux_train, tokenizer, train, test

In [10]:
x_train, x_test, y_train, y_aux_train, tokenizer_trained, train, test = build_datasets(tokenizer, frac=FRAC,  
                                               train_path=TRAIN_PATH, test_path=TEST_PATH)

# Train Embedding

In [11]:
max_features = max_features or len(tokenizer_trained.word_index) + 1 

In [12]:
glove_matrix, unknown_words_glove = build_matrix(tokenizer_trained.word_index, GLOVE_EMBEDDING_PATH)
print('\n unknown words (glove): ', len(unknown_words_glove))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



 unknown words (glove):  6569


# Build Model

In [13]:
def build_model(x_train, y_train, x_test, y_aux_train, NUM_MODELS, glove_matrix,
                model_type='LSTM'):
    
    x_train_torch = torch.tensor(x_train, dtype=torch.long)
    x_test_torch = torch.tensor(x_test, dtype=torch.long)
    y_train_torch = torch.tensor(np.hstack([y_train[:, np.newaxis], y_aux_train]),
                                 dtype=torch.float32)
    train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
    test_dataset = data.TensorDataset(x_test_torch)

    all_test_preds = []

    for model_idx in range(NUM_MODELS):
        print('Model ', model_idx)
        seed_everything(1234 + model_idx)
        if model_type=='LSTM':
            model = NeuralNet(glove_matrix, y_aux_train.shape[-1]) 
        else:
            model = NeuralNetGRU(glove_matrix, y_aux_train.shape[-1])
        #model.cuda()

        test_preds = train_model(model, train_dataset, test_dataset,
                                 output_dim=y_train_torch.shape[-1], 
                                 loss_fn=nn.BCEWithLogitsLoss(reduction='mean'))
        all_test_preds.append(test_preds)
        print()
    return all_test_preds

# Test Model

In [14]:
# 0.02 rand sample dataset
%time all_test_preds = build_model(x_train, y_train, x_test, y_aux_train, NUM_MODELS, glove_matrix)

Model  0


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 1/4 	 loss=0.2083 	 time=608.98s


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 2/4 	 loss=0.1450 	 time=611.80s


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 3/4 	 loss=0.1278 	 time=614.84s


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 4/4 	 loss=0.1241 	 time=613.90s

Model  1


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 1/4 	 loss=0.2078 	 time=616.65s


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 2/4 	 loss=0.1418 	 time=663.63s


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 3/4 	 loss=0.1268 	 time=613.08s


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Epoch 4/4 	 loss=0.1236 	 time=640.43s

CPU times: user 2h 22min 5s, sys: 13min 14s, total: 2h 35min 20s
Wall time: 1h 23min 3s


In [15]:
res = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': np.mean(all_test_preds, axis=0)[:, 0]})

In [16]:
tox = res[res['prediction']>=0.5]

In [17]:
# Comments classified as toxic by the model

tox 

Unnamed: 0,id,prediction
671894,1063508,0.524976
1028842,5375010,0.607761
763519,5054605,0.540545
1364638,5784421,0.719910
607171,985174,0.650491
903967,5225738,0.630510
236996,533194,0.560676
939528,5268193,0.531823
1650330,6144914,0.549486
834071,5141395,0.517363


# Performance Metrics

In [31]:
def get_perf(test, thresh=0.5):
    accuracy, precision, recall = None, None, None
    test = test.loc[:, 'id':'comment_text']
    test['probs'] = res['prediction']
    test['preds'] = test['probs'].apply(lambda x: 1 if x >= thresh else 0)
    test['true'] = test['target'].apply(lambda x: 1 if x >= thresh else 0)
    test['correct'] = test['true']==test['preds']
    test['correct'] = test['correct'].apply(lambda x: 1 if x == True else 0)
    test1prec = test[test['preds'] == 1]
    test1rec = test[(test['true'] == 1) & (test['preds'] == 0)]
    accuracy = test['correct'].sum()/len(test) 
    lenp, lenr, = len(test1prec), len(test1rec)
    if lenp > 0:
        precision = test1prec['correct'].sum()/lenp
    if lenr > 0:
        recall = test1prec['correct'].sum()/(test1prec['correct'].sum() + lenr) 
    f1 = 2*((precision*recall)/(precision+recall))
    print("Accuracy: {} \n Precision: {}, Recall: {}, F1 score: {}".format(accuracy, precision, recall, f1))
    return accuracy, precision, recall, f1

In [32]:
get_perf(test) # very small sample -> no toxic comments were randomly picked, but code works

Accuracy: 0.9408825978351374 
 Precision: 0.7648725212464589, Recall: 0.3268765133171913, F1 score: 0.45801526717557245


(0.9408825978351374,
 0.7648725212464589,
 0.3268765133171913,
 0.45801526717557245)

In [None]:
# Next steps:

# - flexib to choose GRU
# - build grid to tune hyperparams
# - concat =/= embeds, maybe weighted avg 
# - assessing bias 

In [28]:
test

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
904703,5226678,0.200000,Lol...sounds like someone IS brainwashed but I...,0.000000,0.000000,0.000000,0.200000,0.000000,,,...,332690,approved,1,0,1,6,2,0.0,0,10
401755,734360,0.000000,8a welfare native corporations.,0.000000,0.000000,0.000000,0.000000,0.000000,,,...,157569,approved,0,0,0,2,1,0.0,0,4
162713,440814,0.111111,"Yes, it does and please note that those evokin...",0.000000,0.111111,0.000000,0.000000,0.000000,,,...,144158,approved,0,0,0,0,0,0.0,0,9
1170549,5546499,0.200000,President man-baby and Mr. Exxon Valdez negoti...,0.000000,0.000000,0.000000,0.200000,0.000000,0.0,0.0,...,352764,approved,2,0,0,5,2,0.0,10,5
1272446,5669604,0.500000,"Well, I guess it didn't matter so much when Ob...",0.000000,0.100000,0.100000,0.400000,0.000000,0.0,0.0,...,359868,approved,0,0,0,5,0,0.0,6,10
33722,283124,0.000000,I agree. I know Mr. Folger personally and he i...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,99079,approved,0,0,0,1,0,0.0,10,4
713438,4995465,0.000000,"""Those that hang around downtown all day"" are ...",0.000000,0.000000,0.000000,0.000000,0.000000,,,...,318773,approved,0,0,0,1,0,0.0,0,4
1008419,5350527,0.000000,But my first response is beside the point: I a...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,340172,approved,0,0,0,0,0,0.0,4,4
578789,950419,0.600000,The best thing since Kennedy's assassination? ...,0.000000,0.000000,0.400000,0.400000,0.000000,0.0,0.0,...,166201,approved,0,0,0,2,0,0.0,5,10
729212,5014078,0.200000,Wilders gained 5 seats from the last election ...,0.000000,0.000000,0.000000,0.200000,0.000000,0.0,0.0,...,319919,rejected,0,0,0,0,0,0.0,10,5
