In [1]:
import spacy.cli
import spacy.util

# Fetch the large English pipeline only when it is not already installed, so
# re-running the notebook does not download the package again every time.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
!pip install transformers[sentencepiece]

# Custom Data Loader

In [2]:
from typing import List

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizer

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import numpy as np

import spacy


class EncodedDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per lookup.

    Indexing returns ``(input_ids, attention_mask, modifiers, label)``: the first
    two entries are tensors built from the tokenizer output, the last two are
    taken unchanged from the lists supplied at construction time.
    """

    def __init__(self, input_sents: List[str],
                input_labels: List[int],
                input_modifers:List[List],
                tokenizer: PreTrainedTokenizer,
                max_sequence_length: int = None,
                max_targets: int = 5):
        """Store the parallel input lists and the tokenization settings.

        Args:
            input_sents: Sentences to encode, one per example.
            input_labels: Label for each sentence.
            input_modifers: Per-example modifier entries, returned as-is.
            tokenizer: Callable tokenizer applied to each sentence.
            max_sequence_length: Forwarded to the tokenizer as ``max_length``.
            max_targets: Stored on the instance; not read by this class.
        """
        # Per-example data (indexed by position).
        self.input_sents = input_sents
        self.input_labels = input_labels
        self.input_modifers = input_modifers

        # Tokenization configuration.
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length
        self.max_targets = max_targets

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.input_sents)

    def __getitem__(self, index):
        """Encode the sentence at ``index`` and return it with its modifiers and label."""
        sentence, modifiers, target = (
            self.input_sents[index],
            self.input_modifers[index],
            self.input_labels[index],
        )

        # Pad/truncate every example to the configured maximum length.
        encoded = self.tokenizer(
            sentence,
            padding='max_length',
            max_length=self.max_sequence_length,
            truncation=True,
        )

        return (
            torch.tensor(encoded['input_ids']),
            torch.tensor(encoded['attention_mask']),
            modifiers,
            target,
        )

[nltk_data] Downloading package punkt to
[nltk_data]     /home/local/ASUAD/abhatt43/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/local/ASUAD/abhatt43/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Custom RoBERTa Model

In [3]:
import torch.nn as nn
from torch.nn import Softmax, CrossEntropyLoss, MSELoss

from transformers import RobertaForSequenceClassification, RobertaTokenizer

from transformers.modeling_outputs import SequenceClassifierOutput

from collections import namedtuple

class FeatureSwitchHead(nn.Module):
    """Two-layer head that maps a flattened feature vector to one value per position.

    The input width is ``config.max_sequence_length * config.num_features`` and
    the output width is ``config.max_sequence_length``.
    """

    def __init__(self, config):
        """Build the layers from ``max_sequence_length``, ``num_features`` and ``classifier_dropout``."""
        super().__init__()
        flat_width = config.max_sequence_length * config.num_features
        self.dense = nn.Linear(flat_width, flat_width)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.out_proj = nn.Linear(flat_width, config.max_sequence_length)

    def forward(self, features, **kwargs):
        """Apply dropout -> dense -> tanh -> dropout -> output projection to ``features``."""
        hidden = self.dense(self.dropout(features))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)

class RobertaHateClassificationHead(torch.nn.Module):
    """Classification head that reads only position 0 along the second axis of its input."""

    def __init__(self, config):
        """Build the layers from ``hidden_size``, ``num_labels`` and ``classifier_dropout``."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return ``num_labels`` scores per row computed from ``features[:, 0, :]``."""
        first_token = features[:, 0, :]
        projected = torch.tanh(self.dense(self.dropout(first_token)))
        return self.out_proj(self.dropout(projected))


class RobertaForHateClassification(RobertaForSequenceClassification):
    """RoBERTa classifier whose encoder output is rescaled by auxiliary attention features.

    ``aux_attention`` is passed through ``switch_layer`` to obtain one value per
    position; the encoder's first output is multiplied by those values and the
    result is classified by ``cls_layer``.
    """
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config, feature_config):
        """Create the base model from ``config`` and both custom heads from ``feature_config``."""
        super().__init__(config)

        self.soft_max = Softmax(dim=1)
        self.switch_layer = FeatureSwitchHead(feature_config)
        self.cls_layer = RobertaHateClassificationHead(feature_config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        aux_attention=None,
        class_weights=None
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        aux_attention (:obj:`torch.Tensor`):
            Input to ``switch_layer``. Required; a :obj:`ValueError` is raised when it is :obj:`None`.
        class_weights (:obj:`torch.Tensor`, `optional`):
            Per-class weights handed to the cross-entropy loss. They are moved to the device of the logits.

        Returns the loss (when ``labels`` is given) and the *softmax* of the classifier scores; note that the
        ``logits`` field of the returned output therefore holds probabilities, while the loss is computed on the
        raw scores.
        """
        if aux_attention is None:
            # Fail early with a clear message instead of an opaque error inside the switch head.
            raise ValueError("aux_attention is required by RobertaForHateClassification.forward")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # One scaling value per position, broadcast over the last dimension.
        switch_attn = self.switch_layer(aux_attention)

        sequence_output = outputs[0]*switch_attn.unsqueeze(2)

        logits = self.cls_layer(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # Put the class weights on the same device as the logits rather than
                # relying on a notebook-level `device` global.
                weight = class_weights.to(logits.device) if class_weights is not None else None
                loss_fct = CrossEntropyLoss(weight=weight)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        softmax_logits = self.soft_max(logits)

        if not return_dict:
            output = (softmax_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=softmax_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [4]:
from torch.optim import Adam, lr_scheduler
from tqdm import tqdm
import os
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

from torch.utils.data import DataLoader, RandomSampler

def _aux_attention_features(aggre_model, senti_model, input_ids, attention_mask):
    """Concatenate attention summaries from the two auxiliary models.

    For each auxiliary model this takes the last entry of ``attentions``, selects
    index ``-1`` along its second dimension and sums the result over ``axis=1``.
    The two summaries are concatenated along dimension 1.
    """
    # The auxiliary models are never optimized here, so no autograd graph is needed.
    with torch.no_grad():
        aggr_output = aggre_model(input_ids=input_ids, attention_mask=attention_mask,
                                  output_attentions=True)
        senti_output = senti_model(input_ids=input_ids, attention_mask=attention_mask,
                                   output_attentions=True)

    # Potential Update: Consider other layer heads (Sum/mean or specific) instead last one (-1)
    aggr_att_scores = torch.sum(aggr_output.attentions[-1][0:, -1], axis=1)
    senti_att_scores = torch.sum(senti_output.attentions[-1][0:, -1], axis=1)

    return torch.cat((aggr_att_scores, senti_att_scores), 1)


def train(model, train_data, train_labels, train_modifers, val_data, val_labels, val_modifers,
          senti_model, aggre_model, tokenizer, params):
    """Fine-tune ``model`` with gradient accumulation and F1-based early stopping.

    Args:
        model: Classifier to train; called with ``input_ids``, ``attention_mask``,
            ``labels``, ``aux_attention`` and (during training) ``class_weights``.
        train_data, train_labels, train_modifers: Parallel training inputs.
        val_data, val_labels, val_modifers: Parallel validation inputs.
        senti_model, aggre_model: Auxiliary models whose attentions provide the
            ``aux_attention`` features. Assumed to live on the same device as ``model``.
        tokenizer: Tokenizer handed to ``EncodedDataset``.
        params: Object exposing ``max_sequence_length``, ``train_batch_size``,
            ``val_batch_size``, ``learning_rate``, ``epochs``, ``class_weights``
            and ``dataset_name``.

    Returns:
        None. Per-epoch metrics and the classification report are printed.

    Training stops early after 3 consecutive epochs in which neither the F1 of
    label ``'1'`` nor the macro-average F1 reached its best value so far.
    """
    accumulation_steps = 4
    earlystop_epochs = 3  # 3 consecutive epochs without validation improvement
    l2_lambda = 0.001     # weight of the parameter-norm penalty added to the loss

    # Use the device the classifier already lives on instead of a notebook global.
    device = next(model.parameters()).device

    train_dataset = EncodedDataset(input_sents=train_data,
                                   input_labels=train_labels,
                                   input_modifers=train_modifers,
                                   tokenizer=tokenizer,
                                   max_sequence_length=params.max_sequence_length)

    val_dataset = EncodedDataset(input_sents=val_data,
                                 input_labels=val_labels,
                                 input_modifers=val_modifers,
                                 tokenizer=tokenizer,
                                 max_sequence_length=params.max_sequence_length)

    train_dataloader = DataLoader(train_dataset, batch_size=params.train_batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=params.val_batch_size)

    optimizer = Adam(model.parameters(), lr=params.learning_rate)

    best_positive_f1 = 1e-5
    best_macro_f1 = 1e-5
    without_progress = 0
    fname = None  # name of the most recent "best" checkpoint, if any epoch improved
    num_train_batches = len(train_dataloader)

    model.zero_grad()
    for epoch_num in range(params.epochs):

        total_acc_train = 0
        total_loss_train = 0
        predictions = []
        y_true = []

        model.train()
        with tqdm(train_dataloader, desc="Training") as loop:

            for step, (train_input, train_mask, _, train_label) in enumerate(loop, start=1):
                train_input = train_input.to(device)
                train_mask = train_mask.to(device)
                train_label = train_label.to(device)

                attn_mat = _aux_attention_features(aggre_model, senti_model, train_input, train_mask)

                output = model(input_ids=train_input,
                              attention_mask=train_mask,
                              labels=train_label,
                              aux_attention=attn_mat,class_weights=params.class_weights)

                loss, logits = output["loss"], output["logits"]

                # Penalize the sum of the L2 norms of all parameters.
                l2_reg = sum(torch.norm(param, 2) for param in model.parameters())
                loss = loss + (l2_reg * l2_lambda)
                loss.backward()

                # Step once per full accumulation window, and also on the final batch
                # so that a trailing partial window is not carried into the next epoch.
                if step % accumulation_steps == 0 or step == num_train_batches:
                    optimizer.step()
                    optimizer.zero_grad()

                total_loss_train += loss.item()
                acc = (logits.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                loop.set_postfix(loss=loss.item(), acc=acc/len(train_input))

        total_acc_val = 0
        total_loss_val = 0

        model.eval()
        with torch.no_grad():
            for val_input, val_mask, _, val_label in val_dataloader:

                val_input = val_input.to(device)
                val_mask = val_mask.to(device)
                val_label = val_label.to(device)

                attn_mat = _aux_attention_features(aggre_model, senti_model, val_input, val_mask)

                output = model(input_ids=val_input,
                              attention_mask=val_mask,
                              labels=val_label,
                              aux_attention=attn_mat)

                loss, logits = output["loss"], output["logits"]
                total_loss_val += loss.item()

                batch_predictions = logits.argmax(dim=1)
                total_acc_val += (batch_predictions == val_label).sum().item()

                predictions.extend(batch_predictions.detach().cpu().numpy())
                y_true.extend(val_label.detach().cpu().numpy())

        # Losses are per-batch means, so average them over batches; accuracies are
        # counts of correct examples, so average them over examples.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_dataloader): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')
        print("CLassification Report: ", classification_report(y_true,predictions))

        ## Early Stopping Criteria
        report = classification_report(y_true, predictions, output_dict=True)
        positive_f1 = report['1']['f1-score']
        macro_f1 = report['macro avg']['f1-score']

        if positive_f1 >= best_positive_f1 or macro_f1 >= best_macro_f1:

            without_progress = 0
            best_positive_f1 = max(best_positive_f1, positive_f1)
            best_macro_f1 = max(best_macro_f1, macro_f1)

            # TODO: checkpoint saving is currently disabled; only the file name is tracked.
            fname = "best-model_" + params.dataset_name+"_"+str(epoch_num+1)+".pt"

        else:

            without_progress +=1

        if without_progress >= earlystop_epochs:

            print("Early stopping.....")

            if fname is not None:
                print("Saving model: ", fname)

            break

In [5]:
import pandas as pd
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from types import SimpleNamespace
from sklearn.utils.class_weight import compute_class_weight

In [6]:
# Prefer the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Seed PyTorch's default generator; the returned generator is the cell's displayed value.
torch.manual_seed(42)

<torch._C.Generator at 0x7fa7a68c4f48>

In [7]:
# Machine-specific absolute path of the input CSV.
file_path = '/home/local/ASUAD/abhatt43/Projects/HateSpeech/data/Rationales_file_REDDIT_dataset.csv'
df = pd.read_csv(file_path)

# Partition the rows by the value stored in the 'exp_split' column.
train_df = df.loc[df['exp_split'].eq('train')]
test_df = df.loc[df['exp_split'].eq('test')]

In [8]:
# Sanity check: report how many rows ended up in each split.
print("Train df: ", len(train_df))
print("Test_df: ", len(test_df))

Train df:  29731
Test_df:  7433


In [9]:
import gc
# Force a garbage-collection pass. Large objects that are no longer needed
# should be removed with `del` first; the value returned by collect() is the
# cell's displayed output.
gc.collect()

22

In [10]:
# Auxiliary models: only their attention maps are consumed downstream and they
# are never optimized, so each one is put in eval mode and its parameters are
# frozen to avoid building autograd graphs through it.
aggr_task='offensive'
aggr_MODEL = f"cardiffnlp/twitter-roberta-base-{aggr_task}"

aggr_model = (
    AutoModelForSequenceClassification.from_pretrained(aggr_MODEL)
    .to(device)
    .eval()
    .requires_grad_(False)
)

latest_task='sentiment-latest'
# NOTE(review): the original trailing comment here was cut off ("This is a roBERTa-base
# model trained on ~58M"); confirm the training-data details against the model card.
sentiment_MODEL = f"cardiffnlp/twitter-roberta-base-{latest_task}"
senti_model = (
    AutoModelForSequenceClassification.from_pretrained(sentiment_MODEL)
    .to(device)
    .eval()
    .requires_grad_(False)
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of train_df for validation; the fixed random_state keeps the split repeatable.
train_text, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].tolist(),
    train_df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

In [15]:
# train_frame = train_df  (disabled: train_df is split into train/validation lists instead)
# Alias under which the held-out test split is referenced by later cells.
test_frame = test_df

In [None]:
# Balanced class weights computed from the training labels, converted to a float
# tensor for the weighted cross-entropy loss.
class_weights = torch.FloatTensor(
    compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
)

# Run configuration; the same namespace is also passed as the model's `feature_config`.
params = SimpleNamespace(
    max_sequence_length=512,
    learning_rate=2e-5,
    train_batch_size=2,
    val_batch_size=2,
    epochs=3,
    device=device,
    dataset_name="Reddit",
    class_weights=class_weights,
    hidden_size=768,
    num_features=2,
    num_labels=2,
    classifier_dropout=0.2,
)

model_name = "roberta-base"

base_model = RobertaForHateClassification.from_pretrained(model_name, feature_config=params).to(device)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

train(
    model=base_model,
    train_data=train_text,
    train_labels=train_labels,
    # No modifier information is used: pass one placeholder zero per example.
    train_modifers=np.zeros(len(train_text)).tolist(),
    val_data=val_texts,
    val_labels=val_labels,
    # Sized from the validation texts (previously sized from the test frame).
    val_modifers=np.zeros(len(val_texts)).tolist(),
    tokenizer=tokenizer,
    senti_model=senti_model,
    aggre_model=aggr_model,
    params=params,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForHateClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForHateClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForHateClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForHateClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

In [None]:
def evaluate(model, test_data, test_labels,
          senti_tokenizer, aggre_tokenizer, test_modifiers, max_sequence_length, learning_rate, test_batch_size, device,
          tokenizer=None):
    """Run ``model`` over the test set and collect predictions.

    Args:
        model: Classifier called with ``input_ids``, ``attention_mask``, ``labels``
            and ``aux_attention``.
        test_data, test_labels, test_modifiers: Parallel test inputs.
        senti_tokenizer: Despite the name, the sentiment *model* whose attentions
            are read (the notebook passes ``senti_model`` here).
        aggre_tokenizer: Despite the name, the aggression/offensive *model* whose
            attentions are read (the notebook passes ``aggr_model`` here).
        max_sequence_length: Forwarded to ``EncodedDataset``.
        learning_rate: Unused; kept so existing calls keep working.
        test_batch_size: Batch size of the evaluation loader.
        device: Device the input tensors are moved to.
        tokenizer: Tokenizer for ``EncodedDataset``. When omitted, the notebook-level
            ``tokenizer`` variable is used for backward compatibility.

    Returns:
        ``(predictions, y_true)``: two lists with the predicted class index and the
        true label of every test example.

    Raises:
        ValueError: If no tokenizer is given and none is defined at notebook level.
    """
    if tokenizer is None:
        # Earlier versions read the tokenizer from the notebook namespace.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise ValueError("evaluate() needs a tokenizer: pass tokenizer=... explicitly")

    # Use the models handed in by the caller instead of notebook globals.
    senti_model, aggre_model = senti_tokenizer, aggre_tokenizer

    test_dataset = EncodedDataset(input_sents=test_data,
                                  input_labels=test_labels,
                                  input_modifers=test_modifiers,
                                  tokenizer=tokenizer,
                                  max_sequence_length=max_sequence_length)

    test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size)

    predictions = []
    y_true = []
    model.eval()
    with torch.no_grad():
        for test_input, test_mask, _, test_label in test_dataloader:
            test_input = test_input.to(device)
            test_mask = test_mask.to(device)
            test_label = test_label.to(device)

            aggr_output = aggre_model(input_ids=test_input, attention_mask=test_mask, output_hidden_states = True, output_attentions=True)

            senti_output = senti_model(input_ids=test_input, attention_mask=test_mask, output_hidden_states = True, output_attentions=True)

            # Summarize the last attention entry of each auxiliary model and
            # concatenate both summaries along dimension 1.
            aggr_att_scores = torch.sum(aggr_output.attentions[-1][0:,-1],axis=1)

            senti_att_scores = torch.sum(senti_output.attentions[-1][0:,-1],axis=1)

            attn_mat = torch.cat((aggr_att_scores,senti_att_scores), 1)

            output = model(input_ids=test_input,
                          attention_mask=test_mask,
                          labels=test_label,
                          aux_attention=attn_mat)

            logits = output["logits"]

            predictions.extend(logits.argmax(dim=1).detach().cpu().numpy())

            y_true.extend(test_label.detach().cpu().numpy())
    return predictions,y_true

In [None]:
# Score the held-out test split and print the per-class metrics.
preds, trues = evaluate(
    model=base_model,
    test_data=test_frame['text'].values.tolist(),
    test_labels=test_frame['label'].values.tolist(),
    senti_tokenizer=senti_model,
    aggre_tokenizer=aggr_model,
    test_modifiers=np.zeros(len(test_frame)),
    max_sequence_length=512,
    learning_rate=1e-5,
    test_batch_size=2,
    device=device,
)
print("*"*100)
print(classification_report(trues, preds))