# Addressing Online Hate Speech

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 8, 6

import spacy
import re, string
import boto3
#import gensim

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC, SVC

import warnings
warnings.filterwarnings('ignore')

import random
import time
import multiprocessing as mp

import mxnet as mx
import gluonnlp as nlp
from mxnet import nd, gluon, autograd

random.seed(123)
np.random.seed(123)
mx.random.seed(123)

### Helper functions

In [2]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

##############################################################################

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self
    
##############################################################################

def add_glove(X, dim=300): return np.mean([w2v[word] if word in w2v else np.zeros(dim) for word in tokenize(X)], axis=0)

def add_gensim(X, dim=300): return np.mean([gens.wv[word] if word in gens.wv else np.zeros(dim) for word in tokenize(X)], axis=0)

##############################################################################
## USED IN THE DEEP LEARNING CHUNK
##############################################################################

def evaluate(net, dataloader, context):
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()
    print('Begin Testing...')
    for i, ((data, valid_length), label) in enumerate(dataloader):
        data = mx.nd.transpose(data.as_in_context(context))
        valid_length = valid_length.as_in_context(context).astype(np.float32)
        label = label.as_in_context(context)
        output = net(data, valid_length)
        L = loss(output, label)
        pred = (output > 0.5)
        total_L += L.sum().asscalar()
        total_sample_num += (label.shape[0] * label.shape[1])
        total_correct_num += (pred == label).sum().asscalar()
        
        if i == 0:
            labels = label
            outputs = output
        else:
            labels = mx.ndarray.concatenate([labels, label])
            outputs = mx.ndarray.concatenate([outputs, output])
        
        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, len(dataloader),
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)
    auc = roc_auc_score(labels.as_in_context(mx.cpu()).asnumpy(), 
                        outputs.as_in_context(mx.cpu()).asnumpy(), average='macro')
    return avg_L, acc, auc

##############################################################################

def get_dataloader():
    # Construct the DataLoader
    # Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, ret_length=True),
        nlp.data.batchify.Stack(dtype='float32'))
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_data_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(batch_sampler.stats())
    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
    test_dataloader = gluon.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn)
    return train_dataloader, test_dataloader

##############################################################################

def train(net, context, epochs):
    trainer = gluon.Trainer(net.collect_params(), optim,
                            {'learning_rate': learning_rate})

    parameters = net.collect_params().values()

    # Training/Testing
    for epoch in range(epochs):
        # Epoch training stats
        start_epoch_time = time.time()
        epoch_L = 0.0
        epoch_sent_num = 0
        epoch_wc = 0
        # Log interval training stats
        start_log_interval_time = time.time()
        log_interval_wc = 0
        log_interval_sent_num = 0
        log_interval_L = 0.0

        for i, ((data, length), label) in enumerate(train_dataloader):
            L = 0
            wc = length.sum().asscalar()
            log_interval_wc += wc
            epoch_wc += wc
            log_interval_sent_num += data.shape[1]
            epoch_sent_num += data.shape[1]
            with autograd.record():
                output = net(data.as_in_context(context).T,
                             length.as_in_context(context).astype(np.float32))
                L = L + loss(output, label.as_in_context(context)).sum()
            L.backward()
            # Clip gradient
            if grad_clip:
                gluon.utils.clip_global_norm(
                    [p.grad(context) for p in parameters],
                    grad_clip)
            # Update parameter
            trainer.step(1)
            log_interval_L += L.asscalar()
            epoch_L += L.asscalar()
            if (i + 1) % log_interval == 0:
                print(
                    '[Epoch {} Batch {}/{}] elapsed {:.2f} s, '
                    'avg loss {:.6f}, throughput {:.2f}K wps'.format(
                        epoch, i + 1, len(train_dataloader),
                        time.time() - start_log_interval_time,
                        log_interval_L / log_interval_sent_num, log_interval_wc
                        / 1000 / (time.time() - start_log_interval_time)))
                # Clear log interval training stats
                start_log_interval_time = time.time()
                log_interval_wc = 0
                log_interval_sent_num = 0
                log_interval_L = 0
        end_epoch_time = time.time()
        test_avg_L, test_acc, test_auc = evaluate(net, test_dataloader, context)
        print('[Epoch {}] train avg loss {:.6f}, test acc {:.5f}, '
              'test avg loss {:.6f}, test auc {:.5f}'.format(
                  epoch, epoch_L / epoch_sent_num, test_acc, test_avg_L,
                  test_auc))
        
##############################################################################

### Loading the data

In [3]:
bucket = "eider-pochetti"
file_name = "train.csv"

s3 = boto3.client('s3') 
obj = s3.get_object(Bucket= bucket, Key= file_name) 
df = pd.read_csv(obj['Body'])

In [3]:
#df = pd.read_csv('./data/toxic/train.csv')#'C:\\Users\\pochetti\\WorkDocs\\Desktop\\Fra\\Francesco\\Kaggle\\toxic\\train.csv')

### Quick EDA and pre-processing

In [4]:
df.shape
df.head()

(159571, 8)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
not_dummies = ['id', 'comment_text']
other = df.loc[:,[col for col in df.columns if col not in not_dummies]].sum(axis=1)
df['other'] = np.where(other == 0, 1, 0)

df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,other
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


In [6]:
df.loc[:,[col for col in df.columns if col not in not_dummies]].sum(axis=0)

toxic             15294
severe_toxic       1595
obscene            8449
threat              478
insult             7877
identity_hate      1405
other            143346
dtype: int64

In [7]:
corr = df.loc[:,[col for col in df.columns if col not in not_dummies]].corr()
corr.style.background_gradient()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,other
toxic,1.0,0.308619,0.676515,0.157058,0.647518,0.266009,-0.967748
severe_toxic,0.308619,1.0,0.403014,0.123601,0.375807,0.2016,-0.298666
obscene,0.676515,0.403014,1.0,0.141179,0.741272,0.286867,-0.702812
threat,0.157058,0.123601,0.141179,1.0,0.150022,0.115128,-0.162925
insult,0.647518,0.375807,0.741272,0.150022,1.0,0.337736,-0.677324
identity_hate,0.266009,0.2016,0.286867,0.115128,0.337736,1.0,-0.280144
other,-0.967748,-0.298666,-0.702812,-0.162925,-0.677324,-0.280144,1.0


In [8]:
COMMENT = 'comment_text'
df[COMMENT].fillna("unknown", inplace=True)

### Splitting into train and test

In [9]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X_train, X_test, y_train, y_test = train_test_split(df[COMMENT], 
                                                    df.loc[:,[col for col in df.columns if col not in not_dummies]], 
                                                    test_size=0.25, random_state=4)

X_train.shape
y_train.shape

X_test.shape
y_test.shape

(119678,)

(119678, 7)

(39893,)

(39893, 7)

## Naive-Bayes-SVM on top of Tf-IDF (ROC-AUC: 0.98251)

In [10]:
n = X_train.shape[0]

vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )

trn_term_doc = vec.fit_transform(X_train)

In [11]:
test_term_doc = vec.transform(X_test)

In [12]:
trn_term_doc

<119678x339500 sparse matrix of type '<class 'numpy.float64'>'
	with 13193559 stored elements in Compressed Sparse Row format>

In [13]:
preds = np.zeros((len(X_test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m = NbSvmClassifier(C=4, dual=True, n_jobs=-1).fit(trn_term_doc, y_train[j])
    preds[:,i] = m.predict_proba(test_term_doc)[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [14]:
roc_auc_score(y_test[label_cols].values, preds, average='macro')

0.98251143449724354

## Logistic Regression on top of Tf-IDF (ROC-AUC: 0.97941)

In [15]:
clf = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
clf.fit(trn_term_doc, y_train[label_cols])

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=-1)

In [16]:
roc_auc_score(y_test[label_cols].values, clf.predict_proba(test_term_doc), average='macro')

0.97941168593628047

## Logistic Regression on top of pre-trained Glove Word Embeddings (ROC-AUC: 0.94220)

In [82]:
words = pd.read_csv('C:\\Users\\pochetti\\WorkDocs\\Desktop\\Fra\\Francesco\\Copperfield\\glove.6B\\glove.6B.300d.txt', sep=' ', 
                    header=None, quoting = 3)
words = words.apply(pd.to_numeric, errors='ignore')
d = {'word': words.loc[:,0].tolist(), 'embedding': words.loc[:,1:].values.tolist()}
d50 = pd.DataFrame(data=d)
w2v = d50.set_index('word')['embedding'].to_dict()

In [85]:
len(w2v['book'])

300

In [93]:
tr = X_train.apply(add_glove)

In [107]:
tra = np.array(tr.tolist())

In [108]:
glo = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
glo.fit(tra, y_train[label_cols])

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=-1)

In [109]:
tst = X_test.apply(add_glove)
tsta = np.array(tst.tolist())

In [111]:
roc_auc_score(y_test[label_cols].values, glo.predict_proba(tsta), average='macro')

0.942208144506225

## Logistic Regression on top of ad-hoc trained Gensim Embeddings (ROC-AUC: 0.95545)

In [116]:
l = df.comment_text.apply(tokenize)

In [120]:
documents = l.tolist()

In [134]:
gens = gensim.models.Word2Vec(documents, size=300, window=10, min_count=2, workers=10)

gens.train(documents, total_examples=len(documents), epochs=10)

(97376954, 136103550)

In [135]:
gens.wv['computer'].shape

(300,)

In [136]:
gens.save("word2vecgensim.model")

In [140]:
tr_gensim = X_train.apply(add_gensim)

In [141]:
tra_gensim = np.array(tr_gensim.tolist())

In [142]:
g = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
g.fit(tra_gensim, y_train[label_cols])

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=-1)

In [143]:
tst_gensim = X_test.apply(add_gensim)
tsta_gensim = np.array(tst_gensim.tolist())

In [144]:
roc_auc_score(y_test[label_cols].values, g.predict_proba(tsta_gensim), average='macro')

0.9554535195331965

## Transfer Learning with GluonNLP

In [81]:
class MeanPoolingLayer(gluon.HybridBlock):
    """A block for mean pooling of encoder features"""
    def __init__(self, prefix=None, params=None):
        super(MeanPoolingLayer, self).__init__(prefix=prefix, params=params)

    def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
        """Forward logic"""
        # Data will have shape (T, N, C)
        masked_encoded = F.SequenceMask(data,
                                        sequence_length=valid_length,
                                        use_sequence_length=True)
        agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
                                    F.expand_dims(valid_length, axis=1))
        return agg_state


class SentimentNet(gluon.HybridBlock):
    """Network for sentiment analysis."""
    def __init__(self, dropout, prefix=None, params=None):
        super(SentimentNet, self).__init__(prefix=prefix, params=params)
        with self.name_scope():
            self.embedding = None # will set with lm embedding later
            self.encoder = None # will set with lm encoder later
            self.agg_layer = MeanPoolingLayer()
            self.output = gluon.nn.HybridSequential()
            with self.output.name_scope():
                #self.output.add(gluon.nn.BatchNorm(axis=1, center=True, scale=True))
                self.output.add(gluon.nn.Dropout(dropout))
                self.output.add(gluon.nn.Dense(len(label_cols), flatten=False, activation='sigmoid'))

    def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
        encoded = self.encoder(self.embedding(data))  # Shape(T, N, C)
        agg_state = self.agg_layer(encoded, valid_length)
        out = self.output(agg_state)
        return out

In [82]:
dropout = 0.5
language_model_name = 'standard_lstm_lm_200'
pretrained = True
learning_rate, batch_size = 0.005, 16
bucket_num, bucket_ratio = 10, 0.2
epochs = 3
grad_clip = None
log_interval = 1000
context = mx.gpu(0)
loss = gluon.loss.SigmoidBCELoss(from_sigmoid=True)
optim = 'adam'

In [83]:
lm_model, vocab = nlp.model.get_model(name=language_model_name,
                                      dataset_name='wikitext-2',
                                      pretrained=pretrained,
                                      ctx=context,
                                      dropout=dropout)

In [84]:
net = SentimentNet(dropout=dropout)
net.embedding = lm_model.embedding
net.encoder = lm_model.encoder
net.hybridize()
net.output.initialize(mx.init.Xavier(), ctx=context)
print(net)

SentimentNet(
  (embedding): HybridSequential(
    (0): Embedding(33278 -> 200, float32)
    (1): Dropout(p = 0.5, axes=())
  )
  (encoder): LSTM(200 -> 200, TNC, num_layers=2, dropout=0.5)
  (agg_layer): MeanPoolingLayer(
  
  )
  (output): HybridSequential(
    (0): Dropout(p = 0.5, axes=())
    (1): Dense(None -> 6, Activation(sigmoid))
  )
)


In [85]:
tokenizer = nlp.data.SpacyTokenizer('en')
length_clip = nlp.data.ClipSequence(500)

def my_tokens(s): return vocab[length_clip(tokenize(s))]

In [86]:
X_train_tok = X_train.apply(my_tokens)
X_test_tok = X_test.apply(my_tokens)

In [88]:
train_dataset = mx.gluon.data.ArrayDataset(X_train_tok.values, y_train[label_cols].values)
train_data_lengths = X_train_tok.str.len().values

test_dataset = mx.gluon.data.ArrayDataset(X_test_tok.values, y_test[label_cols].values)
test_data_lengths = X_test_tok.str.len().values

In [89]:
train_dataloader, test_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=119678, batch_num=5668
  key=[59, 108, 157, 206, 255, 304, 353, 402, 451, 500]
  cnt=[71327, 22968, 9781, 5105, 3296, 1834, 1273, 727, 583, 2784]
  batch_size=[27, 16, 16, 16, 16, 16, 16, 16, 16, 16]


In [93]:
train(net, context, epochs=5)

[Epoch 0 Batch 1000/5668] elapsed 67.39 s, avg loss 0.019530, throughput 25.71K wps
[Epoch 0 Batch 2000/5668] elapsed 66.42 s, avg loss 0.015655, throughput 26.50K wps
[Epoch 0 Batch 3000/5668] elapsed 65.71 s, avg loss 0.014912, throughput 26.05K wps
[Epoch 0 Batch 4000/5668] elapsed 64.37 s, avg loss 0.014517, throughput 25.24K wps
[Epoch 0 Batch 5000/5668] elapsed 64.94 s, avg loss 0.014598, throughput 25.71K wps
Begin Testing...
[Batch 1000/2494] elapsed 70.14 s
[Batch 2000/2494] elapsed 70.17 s
[Epoch 0] train avg loss 0.015724, test acc 0.97182, test avg loss 0.013366, test auc 0.95026
[Epoch 1 Batch 1000/5668] elapsed 65.81 s, avg loss 0.014073, throughput 24.81K wps
[Epoch 1 Batch 2000/5668] elapsed 66.53 s, avg loss 0.012863, throughput 26.44K wps
[Epoch 1 Batch 3000/5668] elapsed 65.43 s, avg loss 0.013637, throughput 25.96K wps
[Epoch 1 Batch 4000/5668] elapsed 65.38 s, avg loss 0.014046, throughput 25.90K wps
[Epoch 1 Batch 5000/5668] elapsed 64.90 s, avg loss 0.013654, thr