In [None]:
!pip install pytorch-lightning==1.1.8 --quiet
!pip install fasttext==0.9.2

In [2]:
import pandas as pd
import numpy as np
import csv
import urllib.request, zipfile, os
import time
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
import pickle, gc

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

%load_ext autoreload
%autoreload 2

In [3]:
# Create the download directory; unlike a bare `mkdir`, this does not fail when the cell is re-run.
os.makedirs('data', exist_ok=True)

## Load dataframes

In [4]:
# (figshare article id, version) of every archive that is downloaded and unpacked into ./data.
FIGSHARE_ARCHIVES = [('4054689', 6), ('4267550', 5), ('4563973', 2)]

# The presence of attack_annotations.tsv is used as the marker that the archives
# were already downloaded and extracted, so re-running the cell is cheap.
if not os.path.exists('./data/attack_annotations.tsv'):
  # Make the cell self-contained: the target directory may not exist yet.
  os.makedirs('data', exist_ok=True)

  for article_id, version in FIGSHARE_ARCHIVES:
    file_path = f'data/{article_id}.zip'
    urllib.request.urlretrieve(
        f'https://ndownloader.figshare.com/articles/{article_id}/versions/{version}',
        file_path)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
      zip_ref.extractall('data')

In [5]:
# Load the three tab-separated aggression tables extracted into ./data:
# the annotated comments, the individual annotations, and the annotator demographics.
aggression_data = pd.read_csv('./data/aggression_annotated_comments.tsv', sep='\t')
aggression_annotations = pd.read_csv('./data/aggression_annotations.tsv', sep='\t')
aggression_worker_demographics = pd.read_csv('./data/aggression_worker_demographics.tsv', sep='\t')

In [6]:
# Replace the literal NEWLINE_TOKEN marker with a space. regex=False states explicitly that
# the pattern is a plain string (the default of `regex` differs between pandas versions).
aggression_data['comment_clean'] = aggression_data['comment'].str.replace('NEWLINE_TOKEN', ' ', regex=False)

In [7]:
# Attach the demographic columns to every annotation. No join key is given, so pandas joins on
# the columns the two frames have in common, using its default inner join (annotations without
# a matching demographics row are dropped).
aggression_annotations = aggression_annotations.merge(aggression_worker_demographics)

## Worker and text feature vectors

In [8]:
# Comment-level metadata; missing values are turned into their own 'empty' category.
aggression_text_features = aggression_data[['year', 'logged_in', 'ns', 'sample']].fillna('empty')

# One one-hot matrix per metadata column (rows follow the order of aggression_data).
year_onehot, logged_in_onehot, ns_onehot, sample_onehot = (
    pd.get_dummies(aggression_text_features[column]).values
    for column in ('year', 'logged_in', 'ns', 'sample')
)

# Column-wise concatenation of all one-hot blocks: one feature row per comment.
text_features = np.concatenate(
    [year_onehot, logged_in_onehot, ns_onehot, sample_onehot], axis=1)

In [9]:
# Missing demographic values are turned into their own 'empty' category.
aggression_worker_demographics = aggression_worker_demographics.fillna('empty')

# One one-hot matrix per demographic column (rows follow the order of the demographics table).
(worker_id_onehot,
 gender_onehot,
 english_first_language_onehot,
 age_group_onehot,
 education_onehot) = (
    pd.get_dummies(aggression_worker_demographics[column]).values
    for column in ('worker_id', 'gender', 'english_first_language', 'age_group', 'education')
)

# Demographic feature row per annotator; the worker-id one-hot is deliberately kept separate.
annotator_features = np.concatenate(
    [gender_onehot, english_first_language_onehot, age_group_onehot, education_onehot], axis=1)

## Text tokenization

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from every cleaned comment in aggression_data; no vocabulary size limit
# (num_words=None), and '<OOV>' is registered as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=None, oov_token='<OOV>')
tokenizer.fit_on_texts(aggression_data.comment_clean.tolist())

In [11]:
# Convert every cleaned comment into a list of integer token ids.
text_tokenized = tokenizer.texts_to_sequences(aggression_data.comment_clean.tolist())
# Number of tokens per comment, measured before any padding/truncation.
text_lens = [len(t) for t in text_tokenized]

In [12]:
# Bring every sequence to exactly 256 ids: longer ones are cut at the end, shorter ones are
# filled at the end with the padding id 0.
text_tokenized = pad_sequences(text_tokenized, maxlen=256, dtype='int32', padding='post', truncating='post', value=0.0)

## Load fastText model

In [None]:
import fasttext.util
# Fetch the pretrained English fastText model; if_exists='ignore' is passed so an
# already-present model file is not downloaded again.
fasttext.util.download_model('en', if_exists='ignore')  # English

In [None]:
# Load the fastText model file from the working directory; `ft` is used below for both
# word vectors (ft[w]) and sentence vectors (ft.get_sentence_vector).
ft = fasttext.load_model('cc.en.300.bin')

## Word embeddings

In [15]:
# Embedding matrix indexed by tokenizer id, with one extra row for id 0 (the padding value).
# The loop only writes the rows of ids present in tokenizer.word_index, so the matrix is
# zero-initialised: torch.empty would leave the untouched padding row as uninitialised memory,
# and Net feeds that row to the LSTM for sequences that consist of padding only.
word_embeddings = torch.zeros((len(tokenizer.word_index) + 1, 300))
for w, i in tokenizer.word_index.items():
    word_embeddings[i] = torch.tensor(ft[w])

In [16]:
# One 300-dimensional fastText sentence vector per comment, stored in the row order of aggression_data.
texts = aggression_data.comment_clean.to_list()
all_embeddings = torch.empty((len(aggression_data.index), 300))
for i, text in enumerate(texts):
    all_embeddings[i] = torch.tensor(ft.get_sentence_vector(text))

## Net

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """LSTM text encoder whose last hidden state is joined with extra features and classified.

    The token ids are embedded with the pretrained module-level `word_embeddings` matrix,
    run through a single-layer unidirectional LSTM, and the final hidden state is concatenated
    with the per-sample `features` before one linear layer produces the class logits.

    Args:
        classes_num: number of output classes (size of the logit vector).
        feature_num: width of the extra feature vector given to `forward`; may be 0.
    """

    def __init__(self, classes_num=2, feature_num=300):
        super(Net, self).__init__()

        self.feature_num = feature_num

        # Pretrained vectors; id 0 is declared as the padding index.
        self.embedding = torch.nn.Embedding.from_pretrained(word_embeddings,
                                                            padding_idx=0)

        self.hidden_dim = 32
        # dropout is 0: nn.LSTM applies dropout only between stacked layers, so a non-zero
        # value with num_layers=1 has no effect and merely emits a warning.
        self.rnn = nn.LSTM(word_embeddings.shape[1],
                           self.hidden_dim,
                           num_layers=1,
                           bidirectional=False,
                           dropout=0.0,
                           batch_first=True)

        self.fc1 = nn.Linear(self.hidden_dim + feature_num, classes_num)

    def forward(self, tokens, features):
        """Return the class logits for a batch.

        Args:
            tokens: 2-D tensor of token ids (batch first) where 0 marks padding.
            features: tensor whose last two dimensions are (batch, feature_num).

        Returns:
            Tensor with one row of `classes_num` logits per sample.
        """
        x = self.embedding(tokens)

        # Number of non-padding tokens per sample; pack_padded_sequence needs lengths on the CPU.
        lens_X = (tokens != 0).sum(dim=1).to('cpu')
        # Packing rejects zero-length sequences, so all-padding samples are treated as length 1.
        lens_X[lens_X == 0] = 1

        # The packed sequence stays on the device of the embedded input.
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lens_X, batch_first=True, enforce_sorted=False)

        _, (hidden, _) = self.rnn(x)

        # Align the dtype of the extra features with the hidden state before concatenating them.
        features = features.view(features.size(-2), self.feature_num).to(hidden.dtype)
        x = torch.cat([hidden.view(-1, self.hidden_dim), features], dim=1)

        return self.fc1(x)

## Training

In [18]:
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence
from time import time

class BatchIndexedDataset(data.Dataset):
    """Dataset that is indexed with a whole batch of row positions at once.

    `X` holds one (comment row, annotator row) index pair per annotation and `y` the matching
    labels. The feature matrices are taken from the module-level `text_features`,
    `worker_id_onehot` and `annotator_features`; which of them end up in a batch is decided by
    the global `CFG['scenario']` at access time.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = torch.tensor(y).long()

        self.aggression_text_features = torch.tensor(text_features)
        self.worker_id_onehot = torch.tensor(worker_id_onehot)
        self.annotator_features = torch.tensor(annotator_features)

    def __getitem__(self, index):
        """Return (token ids, extra features, labels) for the given batch of positions, on `device`."""
        revs_X, workers_X = self.X[index, 0], self.X[index, 1]

        batch_X = torch.tensor(text_tokenized[revs_X]).long()
        # Default: no extra features (zero-width matrix); used when no scenario below matches.
        batch_features = torch.empty((len(index), 0))
        batch_y = self.y[index]

        scenario = CFG['scenario']
        feature_parts = []
        if scenario in ('s2', 's3', 's4'):
            # All feature scenarios start with annotator demographics and comment metadata.
            feature_parts = [self.annotator_features[workers_X],
                             self.aggression_text_features[revs_X]]
        if scenario == 's3':
            # s3 additionally identifies the annotator through a one-hot id.
            feature_parts.append(self.worker_id_onehot[workers_X])
        elif scenario == 's4':
            # s4 additionally uses the per-annotator negative/positive text embeddings.
            feature_parts.append(annotator_negative_embeddings[workers_X])
            feature_parts.append(annotator_positive_embeddings[workers_X])

        if feature_parts:
            batch_features = torch.cat(feature_parts, dim=1)

        return batch_X.to(device), batch_features.to(device), batch_y.to(device)

    def __len__(self):
        """Number of annotations in the dataset."""
        return len(self.y)

In [19]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, WeightedRandomSampler
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint

def prepare_dataloader(X, y):
    """Wrap (X, y) in a BatchIndexedDataset and return a loader yielding shuffled batches.

    Batches of CFG['batch_size'] randomly ordered positions are produced by a BatchSampler
    (the last, smaller batch is kept) and handed to the dataset in one piece.
    """
    dataset = BatchIndexedDataset(X, y)
    shuffled_positions = data.sampler.RandomSampler(dataset)
    batch_sampler = data.sampler.BatchSampler(
        shuffled_positions,
        batch_size=CFG['batch_size'],
        drop_last=False)

    # batch_size=None: the loader passes each list of positions to the dataset as-is and does
    # no batching of its own.
    return data.DataLoader(dataset, sampler=batch_sampler, batch_size=None)

def evaluate(train_X, dev_X, test_X, train_y, dev_y, test_y):
    """Train a HateClassifier and report its metrics on the test split.

    The model is trained for CFG['epochs'] epochs on the train split, the checkpoint with the
    lowest 'valid_loss' on the dev split is restored, and the restored model is scored on the
    test split. The classification report and the confusion matrix are printed.

    Args:
        train_X, dev_X, test_X: index-pair arrays accepted by `prepare_dataloader`.
        train_y, dev_y, test_y: the matching label arrays.

    Returns:
        The sklearn classification report of the test split as a dict.
    """
    train_loader = prepare_dataloader(train_X, train_y)
    val_loader = prepare_dataloader(dev_X, dev_y)
    test_loader = prepare_dataloader(test_X, test_y)

    # The width of the extra feature vector depends on the scenario; read it from a real batch.
    feature_num = next(iter(val_loader))[1].size(-1)
    model = HateClassifier(2, feature_num=feature_num).to(device)

    checkpoint_callback = ModelCheckpoint(
        save_top_k=1,
        monitor='valid_loss',
        mode='min'
    )

    # The checkpoint callback is registered through `callbacks`; passing a ModelCheckpoint
    # instance as `checkpoint_callback=` is deprecated in pytorch-lightning 1.1.
    trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                         max_epochs=CFG['epochs'],
                         progress_bar_refresh_rate=20,
                         profiler="simple",
                         callbacks=[checkpoint_callback])
    trainer.fit(model, train_loader, val_loader)

    # Restore the weights of the best epoch before scoring the test split.
    checkpoint = torch.load(checkpoint_callback.best_model_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    model = model.to(device)

    test_logits = []
    true_labels = []
    with torch.no_grad():
        for batch_text_X, batch_features, batch_text_y in test_loader:
            test_logits.append(model(batch_text_X, batch_features))
            # Labels are collected batch by batch because the loader shuffles the samples.
            true_labels.extend(batch_text_y.flatten().tolist())

    test_logits = torch.cat(test_logits, dim=0)
    test_predictions = test_logits.argmax(dim=1)

    y_true = np.array(true_labels).flatten()
    y_pred = test_predictions.tolist()

    print(classification_report(y_true, y_pred))
    result_dict = classification_report(y_true, y_pred, output_dict=True)

    print('Confusion matrix:')
    print(confusion_matrix(y_true, y_pred))

    return result_dict

class HateClassifier(pl.LightningModule):
    """Lightning wrapper around `Net` with class-weighted cross-entropy training.

    Hyper-parameters are read from the global `CFG`: 'class_weights' for the loss and 'lr'
    for the Adam optimizer.

    Args:
        classes_num: number of output classes.
        feature_num: width of the extra feature vector passed to the network.
    """

    def __init__(self, classes_num=2, feature_num=100):
        super().__init__()
        self.model = Net(classes_num=classes_num, feature_num=feature_num).to(device)
        self.train_acc = pl.metrics.Accuracy()
        self.valid_acc = pl.metrics.Accuracy()
        self.train_f1 = pl.metrics.F1(1,average=None)
        self.valid_f1 = pl.metrics.F1(1, average=None)
        self.valid_conf = pl.metrics.ConfusionMatrix(2)

    def forward(self, x, features):
        """Return the logits of the wrapped network."""
        return self.model(x, features)

    def _weighted_loss(self, output, y):
        """Cross-entropy of logits `output` against labels `y`, weighted by CFG['class_weights']."""
        # Create the weights on the device of the logits instead of relying on the global `device`.
        class_weights = torch.tensor(CFG['class_weights'], device=output.device)
        return nn.CrossEntropyLoss(class_weights)(output, y)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss, accuracy and F1 of one batch; return the loss."""
        x, features, y = batch
        y = y.flatten()
        output = self.forward(x, features)
        loss = self._weighted_loss(output, y)
        self.log('train_loss',  loss, on_epoch=True)
        self.log('train_acc', self.train_acc(output, y), prog_bar=True)
        self.log('train_f1', self.train_f1(output, y), prog_bar=True)

        return loss

    def training_epoch_end(self, outs):
        """Call compute() on the training accuracy metric; the returned value is not used."""
        self.train_acc.compute()

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss and metrics of one batch."""
        x, features, y = batch
        y = y.flatten()
        output = self.forward(x, features)
        loss = self._weighted_loss(output, y)

        # 'valid_loss' is the quantity monitored by the ModelCheckpoint created in `evaluate`.
        self.log('valid_loss', loss)
        self.log('valid_acc', self.valid_acc(output, y), prog_bar=True)
        self.log('valid_f1', self.valid_f1(output, y), prog_bar=True)
        # NOTE(review): this logs a whole confusion matrix rather than a scalar - confirm the
        # logger handles it as intended.
        self.log('valid_conf', self.valid_conf(output, y))

        # `y` holds the reference labels and `output` the model logits.
        return {'loss': loss, 'true_labels': y, 'predictions': output}

    def validation_epoch_end(self, outs):
        """Call compute() on the validation metrics; the returned values are not used."""
        self.valid_acc.compute()
        self.valid_f1.compute()
        self.valid_conf.compute()

    def configure_optimizers(self):
        """Adam over all parameters with learning rate CFG['lr']."""
        return torch.optim.Adam(self.parameters(), lr=CFG['lr'])

## Personal Embeddings

In [20]:
def _index_by_id(frame, id_column):
  """Map every value of `id_column` to the index label of its row in `frame`."""
  # presumably the frames still carry the default 0..n-1 index, so labels equal row positions - verify
  return frame.loc[:, [id_column]].reset_index().set_index(id_column).to_dict()['index']

# Lookup tables used to turn dataset ids into rows of the feature/embedding matrices.
rev_id_idx_dict = _index_by_id(aggression_data, 'rev_id')
worker_id_idx_dict = _index_by_id(aggression_worker_demographics, 'worker_id')

In [21]:
def _split_arrays(split_name):
  """Return (X, y) for all annotations of the comments that belong to `split_name`.

  X is a 2-column integer array of (comment row index, annotator row index) pairs looked up in
  `rev_id_idx_dict` / `worker_id_idx_dict`; y holds the matching `aggression` labels.
  A KeyError is raised if an id is missing from the lookup tables.
  """
  split_rev_ids = aggression_data[aggression_data.split == split_name].rev_id.values
  # The row mask is computed once and shared by X and y so both stay aligned.
  rows = aggression_annotations.loc[aggression_annotations.rev_id.isin(split_rev_ids)]

  # New Series are built instead of assigning columns on a slice of aggression_annotations.
  rev_idx = rows['rev_id'].map(lambda r_id: rev_id_idx_dict[r_id])
  worker_idx = rows['worker_id'].map(lambda w_id: worker_id_idx_dict[w_id])

  return np.column_stack([rev_idx.values, worker_idx.values]), rows['aggression'].values

train_X, train_y = _split_arrays('train')
dev_X, dev_y = _split_arrays('dev')
test_X, test_y = _split_arrays('test')

In [22]:
# rev_ids of the comments assigned to the training split.
train_rev_ids = aggression_data.loc[aggression_data.split == 'train', 'rev_id'].to_list()

In [23]:
# One 300-dimensional row per annotator; rows stay zero for annotators without matching annotations.
annotator_negative_embeddings = torch.zeros(len(worker_id_idx_dict), 300)
annotator_positive_embeddings = torch.zeros(len(worker_id_idx_dict), 300)

# {(worker_id, aggression label): [rev_id, ...]} built from annotations of training comments only.
worker_annotations = (
    aggression_annotations[aggression_annotations.rev_id.isin(train_rev_ids)]
    .groupby(['worker_id', 'aggression'])['rev_id']
    .apply(list)
    .to_dict()
)

In [24]:
# For every annotator, average the sentence embeddings of the training comments they labelled
# 0.0 (negative) and 1.0 (positive); label groups that do not exist keep their zero row.
for worker_id, worker_idx in worker_id_idx_dict.items():
  for label, target_embeddings in ((0.0, annotator_negative_embeddings),
                                   (1.0, annotator_positive_embeddings)):
    labelled_rev_ids = worker_annotations.get((worker_id, label))
    if labelled_rev_ids is None:
      continue
    text_idxs = [rev_id_idx_dict[r_idx] for r_idx in labelled_rev_ids]
    target_embeddings[worker_idx] = all_embeddings[text_idxs].mean(dim=0)

## S1

In [25]:
# Global run configuration read by the dataset, the data loader, the classifier and evaluate():
# 'lr' -> Adam learning rate, 'epochs' -> Trainer max_epochs, 'class_weights' -> cross-entropy
# class weights, 'batch_size' -> batch sampler size, 'scenario' -> extra-feature selection.
# Scenario 's1' matches none of the feature branches of the dataset, so only the text is used.
CFG = {
    'lr': 7*1e-4, 
    'epochs': 30,
    'class_weights': [1.0, 1.0],
    'batch_size': 3000,
    'scenario': 's1'
}

In [None]:
# Ten independent train/evaluate runs with the current CFG; one classification report per run.
results_s1 = dict()
for run_idx in range(10):
  results_s1[run_idx] = evaluate(train_X, dev_X, test_X, train_y, dev_y, test_y)

## S2

In [None]:
# Same hyper-parameters as the other scenarios; only 'scenario' changes.
# Scenario 's2': the dataset adds annotator demographic features and comment metadata features.
CFG = {
    'lr': 7*1e-4, 
    'epochs': 30,
    'class_weights': [1.0, 1.0],
    'batch_size': 3000,
    'scenario': 's2'
}

In [None]:
# Ten independent train/evaluate runs with the current CFG; one classification report per run.
results_s2 = dict()
for run_idx in range(10):
  results_s2[run_idx] = evaluate(train_X, dev_X, test_X, train_y, dev_y, test_y)

## S3

In [None]:
# Same hyper-parameters as the other scenarios; only 'scenario' changes.
# Scenario 's3': annotator demographics and comment metadata plus the one-hot annotator id.
CFG = {
    'lr': 7*1e-4, 
    'epochs': 30,
    'class_weights': [1.0, 1.0],
    'batch_size': 3000,
    'scenario': 's3'
}

In [None]:
# Ten independent train/evaluate runs with the current CFG; one classification report per run.
results_s3 = dict()
for run_idx in range(10):
  results_s3[run_idx] = evaluate(train_X, dev_X, test_X, train_y, dev_y, test_y)

## S4

In [27]:
# Same hyper-parameters as the other scenarios; only 'scenario' changes.
# Scenario 's4': annotator demographics and comment metadata plus the per-annotator
# negative and positive text embeddings.
CFG = {
    'lr': 7*1e-4, 
    'epochs': 30,
    'class_weights': [1.0, 1.0],
    'batch_size': 3000,
    'scenario': 's4'
}

In [None]:
# Ten independent train/evaluate runs with the current CFG; one classification report per run.
results_s4 = dict()
for run_idx in range(10):
  results_s4[run_idx] = evaluate(train_X, dev_X, test_X, train_y, dev_y, test_y)

In [None]:
def get_mean_results(results):
  """Average selected metrics over several classification reports.

  Args:
    results: dict whose values are classification reports in sklearn's `output_dict` layout
      (keys 'accuracy', 'macro avg' and the class label '1').

  Returns:
    Dict with the mean accuracy, the mean macro precision/recall/F1 and the mean
    precision/recall/F1 of class '1' (the `*_a` entries).
  """
  reports = list(results.values())

  def mean_of(*path):
    # Follow the key path into every report and average the values found there.
    collected = []
    for report in reports:
      node = report
      for key in path:
        node = node[key]
      collected.append(node)
    return np.mean(collected)

  return {
      'accuracy': mean_of('accuracy'),
      'precision_macro': mean_of('macro avg', 'precision'),
      'recall_macro': mean_of('macro avg', 'recall'),
      'f1_macro': mean_of('macro avg', 'f1-score'),
      'precision_a': mean_of('1', 'precision'),
      'recall_a': mean_of('1', 'recall'),
      'f1_a': mean_of('1', 'f1-score'),
  }

# Print the averaged metrics of every scenario, each preceded by its name.
for scenario_name, scenario_results in (('S1', results_s1),
                                        ('S2', results_s2),
                                        ('S3', results_s3),
                                        ('S4', results_s4)):
  print(scenario_name)
  print(get_mean_results(scenario_results))

S1
{'accuracy': 0.8758207873944442, 'precision_macro': 0.8344479242787705, 'recall_macro': 0.7282863345523454, 'f1_macro': 0.763803514367855, 'precision_a': 0.7799252955895319, 'recall_a': 0.4893098878212713, 'f1_a': 0.601147586042098}
S2
{'accuracy': 0.8778048946141134, 'precision_macro': 0.8372304028080834, 'recall_macro': 0.7333551082017212, 'f1_macro': 0.7687231011365694, 'precision_a': 0.7835378740841303, 'recall_a': 0.4993752070809864, 'f1_a': 0.6098911747838397}
S3
{'accuracy': 0.892175861990899, 'precision_macro': 0.8512249115066648, 'recall_macro': 0.7767988339160098, 'f1_macro': 0.8059660308716934, 'precision_a': 0.7938998592219725, 'recall_a': 0.5899110143418376, 'f1_a': 0.6766348597055765}
S4
{'accuracy': 0.8847873038871658, 'precision_macro': 0.8398945068894392, 'recall_macro': 0.7596638220651589, 'f1_macro': 0.7900044278595199, 'precision_a': 0.7779806523717221, 'recall_a': 0.5569886874615421, 'f1_a': 0.648927872698743}
