In [None]:
'''
This is the main file used to run EchoKG and KnowledJe experiments.
'''

In [None]:
import logging
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
logger = logging.getLogger(__name__)
!pip install transformers
import transformers
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
np.random.seed(0)
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, auc, roc_curve, balanced_accuracy_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [None]:
# Maximum number of token ids per encoded post; bert_preprocessing passes this
# value to the tokenizer as `max_length`.
max_seq_len = 128

In [None]:
import re
# REGEXES
# Matches http(s):// and bare www. URLs up to the next whitespace character.
URL_RE = re.compile(
    r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')
# Matches the retweet marker "RT" followed by one whitespace character.
RT_RE = re.compile(r'RT\s')
# Matches a single punctuation character or newline. Written as a raw string so
# that `\[` and `\]` reach the regex engine without being treated as (invalid)
# Python string escapes; the set of matched characters is unchanged.
PUNCT_RE = re.compile(r'[“",?;:!\-\[\]_.%/\n]')
# Matches an @-mention: "@" followed by one or more word characters.
MENTION_RE = re.compile(r'@\w+')

In [None]:
# tokenizer to preprocess posts before feeding into DistilBERT
# NOTE: bert_preprocessing(training=True) adds extra tokens to this tokenizer in
# place, and BERTClassifier resizes its embedding matrix to len(bert_tokenizer),
# so preprocessing has to run before the model is constructed.
bert_tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# all preprocessing from https://github.com/NasLabBgu/hate_speech_detection
def bert_preprocessing(X, training):
    """Encode a collection of texts into fixed-length DistilBERT inputs.

    Args:
        X: object exposing ``.tolist()`` that yields the raw text strings
            (the notebook passes a pandas Series).
        training: when true, the domain-specific tokens ``rt`` and the
            multi-parenthesis "echo" markers are registered with the shared
            ``bert_tokenizer`` before encoding.

    Returns:
        A two-element list ``[input_ids, attention_masks]`` of int32 numpy
        arrays; every row is padded or truncated to ``max_seq_len`` tokens.
    """
    if training:
        # Register the extra tokens on the module-level tokenizer (mutates it in place).
        bert_tokenizer.add_tokens(['rt', '(((', ')))', '((((', '))))'])

    # `truncation` / `padding` replace the deprecated `truncation_strategy` /
    # `pad_to_max_length` keywords, which only triggered a runtime warning and
    # then fell back to exactly this behaviour.
    input_dict = bert_tokenizer.batch_encode_plus(
        X.tolist(),
        add_special_tokens=True,
        max_length=max_seq_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
    )

    input_ids = input_dict['input_ids']
    attention_masks = input_dict['attention_mask']
    return [np.asarray(input_ids, dtype='int32'), np.asarray(attention_masks, dtype='int32')]

In [None]:
def initial_preprocessing(X):
    """Apply the text clean-up steps to every element of ``X``.

    For each text, in order: drop URLs, drop "RT" retweet markers, replace the
    ellipsis character with a space, and strip surrounding whitespace.
    Returns the result of ``X.apply`` with the cleaned texts.
    """
    def _clean(text):
        without_urls = URL_RE.sub('', text)  # Remove URLs
        without_rt = RT_RE.sub('', without_urls)  # Remove RT from tweets
        return without_rt.replace("…", " ").strip()

    return X.apply(_clean)

In [None]:
def full_preprocessing(X, y, mode):
    """Clean the texts and encode them for the model according to ``mode``.

    Args:
        X: texts to preprocess (cleaned with ``initial_preprocessing`` first).
        y: labels aligned with ``X``.
        mode: ``'split'`` for a stratified 80/20 train/test split,
            ``'train'`` to treat everything as training data, or
            ``'test'`` to treat everything as test data.

    Returns:
        ``(X_train, X_test, y_train, y_test, X_train_as_text, X_test_as_text)``.
        Entries that do not apply to the chosen mode are ``None``; in
        ``'test'`` mode both label entries are ``None``.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    logger.info(f"Preprocessing text in mode: {mode}")
    cleaned = initial_preprocessing(X)

    # split to train and test
    if mode == 'split':
        train_part, test_part, y_train, y_test = train_test_split(
            cleaned, y, stratify=y, test_size=0.2, random_state=42
        )
        train_text = train_part.copy()
        test_text = test_part.copy()
        train_encoded = bert_preprocessing(train_part, training=True)
        test_encoded = bert_preprocessing(test_part, training=False)
        return train_encoded, test_encoded, y_train, y_test, train_text, test_text

    # only train
    if mode == 'train':
        train_text = cleaned.copy()
        train_encoded = bert_preprocessing(cleaned, training=True)
        return train_encoded, None, y.copy(), None, train_text, None

    # only test
    if mode == 'test':
        test_text = cleaned.copy()
        test_encoded = bert_preprocessing(cleaned, training=False)
        return None, test_encoded, None, None, None, test_text

    raise ValueError(f"mode not supported: {mode}. try `split`, `train`, or `test`")

In [None]:
# change data paths to your local path
# NOTE(review): both configurations below point at the same TSV file, so the
# "baseline" and "kg-augmented" frames are identical unless one path is edited
# -- confirm this is intended.

# data parameters for kg-augmented data
kg_data_conf = {
    "data_path": "../../data/echo_kg_sep.tsv",
    "text_column": "text",
    "label_column": "label",
    "labels": [0, 1],
    "labels_interpretation": ["neutral-responsive", "hate speech"],
}
# data parameters for baseline data
base_data_conf = {
    "data_path": "../../data/echo_kg_sep.tsv",
    "text_column": "text",
    "label_column": "label",
    "labels": [0, 1],
    "labels_interpretation": ["neutral-responsive", "hate speech"],
}


def _read_text_and_labels(conf):
    """Load the TSV named in ``conf`` and return ``(frame, texts, labels)``.

    Only the configured text and label columns are kept; rows with a missing
    value in either column are dropped and the index is renumbered from 0.
    """
    text_col = conf["text_column"]
    label_col = conf["label_column"]
    frame = pd.read_csv(conf["data_path"], sep="\t")
    frame = frame[[text_col, label_col]]
    frame = frame.dropna().reset_index(drop=True)
    return frame, frame[text_col], frame[label_col]


# read and clean data
kg_df, kg_X, kg_y = _read_text_and_labels(kg_data_conf)
base_df, base_X, base_y = _read_text_and_labels(base_data_conf)

In [None]:
# generate training and testing data, preprocess it
# The training split (X_train, y_train, X_train_as_text) is taken from the
# baseline data and the test split (X_test, y_test, X_test_as_text) from the
# kg-augmented data. The `_g` names receive the halves that are not referenced
# anywhere else in this notebook.
# NOTE(review): presumably both files hold the same posts in the same order so
# that the fixed-seed splits line up and no test post appears in training --
# confirm against the data files.
X_train, X_test_g, y_train, y_test_g, X_train_as_text, X_test_as_text_g = full_preprocessing(base_X, base_y, mode='split')
X_train_g, X_test, y_train_g, y_test, X_train_as_text_g, X_test_as_text = full_preprocessing(kg_X, kg_y, mode='split')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Cast the label series to integers; they are later wrapped in tensors by
# TweetsDataset and used as class indices by the loss.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [None]:
import os
import importlib

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoModel

In [None]:
import torch
# Use the GPU when one is available and fall back to the CPU otherwise, so that
# code placing tensors on `device` also works on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed torch's default random number generator for reproducibility.
torch.manual_seed(4)

<torch._C.Generator at 0x7fc56be5d5b0>

In [None]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from typing import Tuple, List
from torch import Tensor

# create dataset of tweets
# credit for template: CS287, fall 2021 at Harvard University with Chris Tanner
class TweetsDataset(Dataset):
    """Map-style dataset pairing each encoded tweet with its label.

    ``X`` is indexed with ``[0]`` and only that element (the token-id matrix
    returned by ``bert_preprocessing``) is stored; any further elements, such
    as the attention masks, are not kept. ``y`` must support ``len`` and
    positional ``.iloc`` access.
    """

    def __init__(self, X, y):
        self.X = X[0]
        self.labels = y

    def __len__(self) -> int:
        """Number of examples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """Return ``(token_ids, label)`` for position ``idx`` as tensors."""
        row, target = self.X[idx], self.labels.iloc[idx]
        return torch.tensor(row), torch.tensor(target)

# Wrap the encoded splits: X_train / X_test are the [input_ids, attention_masks]
# lists produced by full_preprocessing, y_train / y_test the matching labels.
train_ds = TweetsDataset(X_train, y_train)
test_ds = TweetsDataset(X_test, y_test)
def pad_collate_classifier(batch):
    """Collate ``(token_ids, label)`` pairs into one padded batch.

    Returns a tuple ``(ids, labels)``: the token-id tensors padded to a common
    length with the tokenizer's pad id (batch dimension first), and the labels
    stacked along a new leading dimension.
    """
    token_seqs, targets = zip(*batch)
    padded_ids = pad_sequence(
        token_seqs,
        batch_first=True,
        padding_value=bert_tokenizer.pad_token_id,
    )
    stacked_targets = torch.stack(targets, dim=0)
    return padded_ids, stacked_targets

# Training batches: 32 examples, reshuffled every epoch, final partial batch dropped.
train_dl = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    collate_fn=pad_collate_classifier,
)
# Per-epoch evaluation batches: fixed order; the final partial batch is also
# dropped, so up to 31 test examples are excluded from the per-epoch numbers.
test_dl = DataLoader(
    test_ds,
    batch_size=32,
    shuffle=False,
    drop_last=True,
    collate_fn=pad_collate_classifier,
)
# Final evaluation: the whole test set delivered as one single batch.
final_dl = DataLoader(
    test_ds,
    batch_size=len(test_ds),
    shuffle=False,
    drop_last=False,
    collate_fn=pad_collate_classifier,
)

3704
926
3704
115
28
(tensor([[  101,   102,  1030,  ...,     0,     0,     0],
        [  101,   102, 30522,  ...,     0,     0,     0],
        [  101,  3058,  1024,  ...,  2582, 13941,   102],
        ...,
        [  101,   102, 30522,  ...,     0,     0,     0],
        [  101,   102,  1030,  ...,     0,     0,     0],
        [  101,   102,  3383,  ...,     0,     0,     0]], dtype=torch.int32), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]))


In [None]:
from typing import Optional
from torch import nn, Tensor
# create classifier
# template credits to CS287, fall 2021 at Harvard University, Chris Tanner
class BERTClassifier(torch.nn.Module):
    """Pretrained transformer encoder followed by two linear layers.

    The encoder's hidden state of every token is projected to ``output_size``
    values by ``linear_layer``; the trailing singleton dimension is removed and
    ``linear_layer_2`` maps the ``seq_len`` per-position values to
    ``num_classes`` logits. Inputs must therefore be exactly ``seq_len`` tokens
    long, and ``output_size`` is expected to be 1.

    Args:
        base_model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        hidden_size: width of the encoder's hidden states.
        output_size: number of values produced per token by the first linear layer.
        dropout: dropout probability applied to the hidden states; ``None``
            keeps the previous fixed value of 0.3.
        seq_len: number of tokens per input sequence (default 128).
        num_classes: number of output logits (default 2).
    """

    def __init__(self, base_model_name: str, hidden_size: int, output_size: int,
                 dropout: Optional[float] = None, seq_len: int = 128, num_classes: int = 2):
        super().__init__()
        self.base_model_name = base_model_name
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        self.seq_len = seq_len
        self.num_classes = num_classes
        # Remembered so forward() can tell padding apart from real tokens.
        self.pad_token_id = bert_tokenizer.pad_token_id
        self.bert_layer = AutoModel.from_pretrained(self.base_model_name).to(device)
        # The tokenizer may have been extended with extra tokens; grow the
        # embedding matrix so their ids are valid.
        self.bert_layer.resize_token_embeddings(len(bert_tokenizer))
        self.linear_layer = nn.Linear(in_features = self.hidden_size, out_features = self.output_size).to(device)
        self.linear_layer_2 = nn.Linear(in_features = self.seq_len, out_features = self.num_classes).to(device)
        # Honour the `dropout` argument instead of silently ignoring it.
        self.dropout_layer = nn.Dropout(0.3 if dropout is None else dropout).to(device)

    def forward(self, data: Tensor, attention_mask: Optional[Tensor] = None) -> Tensor:
        """Return class logits for a batch of token ids.

        Args:
            data: token ids; assumed to be shaped (batch, seq_len).
            attention_mask: optional mask with 1 for real tokens and 0 for
                padding. When omitted it is derived from the pad token id so
                that padding positions are not attended to.
        """
        if attention_mask is None and self.pad_token_id is not None:
            attention_mask = (data != self.pad_token_id).long()
        berted = self.bert_layer(input_ids=data, attention_mask=attention_mask)
        berted = self.dropout_layer(berted["last_hidden_state"])
        # squeeze(-1) removes only the per-token output dimension; a bare
        # squeeze() would also remove the batch dimension for a batch of one.
        lin = self.linear_layer(berted).squeeze(-1)
        lin2 = self.linear_layer_2(lin)
        return lin2

In [None]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import matplotlib.pyplot as plt

# create and train model
# template from CS287
model = BERTClassifier(base_model_name = "distilbert-base-uncased",
                       hidden_size = 768,
                       output_size = 1).to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr = 0.00002)
num_epochs = 20
# Per-epoch history; every entry is a plain Python float.
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []

# https://stackoverflow.com/questions/57590697/pytorch-log-softmax-lastdim-kernel-impl-not-implemented-for-torch-longtensor
# copied large chunks of training loop from tutorial given in HW1 of CS287
for epoch in range(num_epochs):
    numtrainbatches = 0
    numtestbatches = 0
    running_loss = 0.0
    running_accuracy = 0.0
    running_test_loss = 0.0
    running_test_accuracy = 0.0

    # ---- training pass ----
    model.train()
    for inputs, labels in train_dl:
        inputs = inputs.to(device)
        # CrossEntropyLoss expects class indices as int64.
        labels = labels.to(device).long()
        # zero the parameter gradients
        optimizer.zero_grad()
        # Logits are used as returned, (batch, classes); no squeeze, so a batch
        # of one keeps its batch dimension.
        h = model(inputs)
        loss = criterion(h, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pred = torch.argmax(h, dim = 1)
        # .item() turns the batch accuracy into a Python float, so the history
        # lists do not accumulate tensors that live on the GPU.
        running_accuracy += (pred == labels).float().mean().item()
        numtrainbatches += 1

    # ---- evaluation pass on test_dl (no gradient tracking) ----
    model.eval()
    with torch.no_grad():
        for inputs, labels in test_dl:
            inputs = inputs.to(device)
            labels = labels.to(device).long()
            h = model(inputs)
            prediction = torch.argmax(h, dim = 1)
            loss = criterion(h, labels)
            running_test_loss += loss.item()
            running_test_accuracy += (prediction == labels).float().mean().item()
            numtestbatches += 1

    # Averages over batches (not over individual examples).
    trainacc = running_accuracy / numtrainbatches
    testacc = running_test_accuracy / numtestbatches
    trainloss = running_loss / numtrainbatches
    testloss = running_test_loss / numtestbatches
    print('Epoch %d/%d:' % (epoch + 1, num_epochs))
    print('\ttraining loss\t%.3f' % (trainloss))
    print('\ttraining accuracy\t%.3f' % (trainacc))
    print('\ttesting loss\t%.3f' % (testloss))
    print('\ttesting accuracy\t%.3f' % (testacc))
    train_losses.append(trainloss)
    test_losses.append(testloss)
    train_accuracies.append(trainacc)
    test_accuracies.append(testacc)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/20:
	training loss	0.295
	training accuracy	0.904
	testing loss	0.225
	testing accuracy	0.920
Epoch 2/20:
	training loss	0.209
	training accuracy	0.922
	testing loss	0.204
	testing accuracy	0.922
Epoch 3/20:
	training loss	0.157
	training accuracy	0.947
	testing loss	0.161
	testing accuracy	0.952
Epoch 4/20:
	training loss	0.100
	training accuracy	0.967
	testing loss	0.163
	testing accuracy	0.935
Epoch 5/20:
	training loss	0.072
	training accuracy	0.974
	testing loss	0.186
	testing accuracy	0.954
Epoch 6/20:
	training loss	0.039
	training accuracy	0.986
	testing loss	0.196
	testing accuracy	0.952
Epoch 7/20:
	training loss	0.022
	training accuracy	0.993
	testing loss	0.188
	testing accuracy	0.951
Epoch 8/20:
	training loss	0.019
	training accuracy	0.992
	testing loss	0.191
	testing accuracy	0.950
Epoch 9/20:
	training loss	0.016
	training accuracy	0.993
	testing loss	0.272
	testing accuracy	0.951
Epoch 10/20:
	training loss	0.010
	training accuracy	0.996
	testing loss	0.279
	te

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, auc, roc_curve, balanced_accuracy_score
import csv
from torch.nn.functional import softmax
# evaluate model, template from CS287
# Predictions are collected over every batch of final_dl and the metrics are
# computed once on the full set, so the results and the row alignment in the
# output file do not depend on final_dl delivering exactly one batch.
model.eval()
collected_labels, collected_preds, collected_scores = [], [], []
with torch.no_grad():
    for inputs, labels in final_dl:
        h = model(inputs.to(device))
        collected_preds.append(torch.argmax(h, dim = 1).cpu())
        # Probability assigned to class 1, used as the ranking score for ROC.
        collected_scores.append(softmax(h, dim = 1)[:, 1].cpu())
        collected_labels.append(labels.cpu())

labels = torch.cat(collected_labels).numpy()
prediction = torch.cat(collected_preds).numpy()
y_score = torch.cat(collected_scores).numpy()

# Rows are matched to the raw texts by position, so the counts must agree.
assert len(labels) == len(X_test_as_text), "predictions and test texts are misaligned"

accuracy = accuracy_score(labels, prediction)
precision = precision_score(labels, prediction)
recall = recall_score(labels, prediction)
f1 = f1_score(labels, prediction)
balanced_accuracy = balanced_accuracy_score(labels, prediction)
fp, tp, thresholds = roc_curve(labels, y_score)
aucroc = auc(fp, tp)
print("accuracy:", accuracy)
print("precision:", precision)
print("recall:", recall)
print("f1 score:", f1)
print("balanced accuracy:", balanced_accuracy)
print("aucroc:", aucroc)
# Number of evaluated examples and number of test texts (equal by the check above).
print(len(labels))
print(len(X_test_as_text))

# One row per test example: raw text, true label, predicted label.
errors = [["X_test", "y_true", "y_pred"]]
for j in range(len(labels)):
    errors.append([X_test_as_text.iloc[j], labels[j].item(), prediction[j].item()])

# newline="" is what the csv module requires of files it writes to; utf-8
# keeps non-ASCII characters in the posts intact regardless of platform default.
with open("echo_2_20_errors.tsv", "w", newline="", encoding="utf-8") as o: # change outfile as desired
    writer = csv.writer(o, delimiter = "\t")
    writer.writerows(errors)
          

torch.Size([926, 128])
torch.Size([926, 2])
torch.Size([926])
accuracy: 0.949244060475162
precision: 0.7301587301587301
recall: 0.6052631578947368
f1 score: 0.6618705035971223
balanced accuracy: 0.7926315789473684
aucroc: 0.903297213622291
926
2
