<a href="https://colab.research.google.com/github/Dutta-SD/NLP/blob/master/Aggression_Detection/Experiments/Aggression_English_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aggression Detection in English language model
Here, we develop a BERT based model for aggression detection
in English Language

# Install Dependencies

In [1]:
%%bash
pip install -qq torch
pip install -qq pytorch-lightning
pip install -qq transformers

# Exploration

In [13]:
import pandas as pd
import numpy as np
from sklearn import metrics
import torch
import transformers
import torch.nn as nn
from transformers import (
    AdamW, 
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
import re
import nltk
from imblearn.over_sampling import RandomOverSampler
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
def clean_one_text(text):
    # Cleans one text and returns it
    
    # Clean Punctuation
    # Might remove emojis too
    # 
    res = re.sub(r'[^\w\s]', '', text)
    tk = nltk.TweetTokenizer()

    stopwords = set(nltk.corpus.stopwords.words('english'))

    # stmr = nltk.stem.snowball.SnowballStemmer("english")
    stmr = nltk.stem.porter.PorterStemmer()
    tokens = [token for token in tk.tokenize(res) if token.lower() not in stopwords]
    clean_tokens = [stmr.stem(token) for token in tokens]
    return ' '.join(clean_tokens)


# Returning Clean Dataset
def get_clean_dataset(
    df_raw,
    target_mapping,
    train = True,
    task_name='A', 
    string_cleaner=clean_one_text,
    seed = 0):
    '''
    ===============================================================
    get_clean_dataset - cleans the dataset, returns text and labels
    ===============================================================

    :df_raw - pandas dataframe for cleaning
    :target_mapping - map for the targets
    :train - flag to see if training data sent or not
    :task_name - the target to predict
    :string_cleaner - useful for removing punctuation, etc(function)
    '''

    seed_all()
    #   Shuffle
    df_raw = df_raw.sample(frac=1).reset_index()

    col_str = f'Sub-task {task_name}'

    if 'ID' in df_raw.columns:
        df_raw = df_raw.drop(['ID'], axis = 1)

    targets = df_raw[col_str].map(target_mapping).values
    text = df_raw['Text'].values.astype('str')

    if string_cleaner is not None:
        v_cleaner = np.vectorize(string_cleaner)
        text = v_cleaner(text)

    # Random Oversampling
    ovs = RandomOverSampler(sampling_strategy='minority')
    text, targets = ovs.fit_resample(text.reshape(-1, 1), targets) 

    return text.reshape(-1), targets

In [15]:
# URLS
TRAIN_URL_TASK_1 = 'https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/Aug_Data_Aggression/TASK_A_train_aug_english.csv'
TRAIN_URL_TASK_2 = 'https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/Aug_Data_Aggression/TASK_B_train_aug_english.csv'

VAL_URL = 'https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/trac2_eng_dev.csv'

In [16]:
train = pd.read_csv(TRAIN_URL_TASK_1)
val = pd.read_csv(VAL_URL)
train.head()

Unnamed: 0,Text,Sub-task A
0,Next part,NAG
1,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG
2,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG
3,What the fuck was this? I respect shwetabh and...,NAG
4,Concerned authorities should bring arundathi R...,NAG


In [17]:
def seed_all():
  np.random.seed(0)
  torch.manual_seed(0)

seed_all()

In [18]:
class BERTDataset(Dataset):
    def __init__(self, texts, targets, max_len = 128):
        self.texts = texts
        self.targets = targets
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-multilingual-uncased",
            do_lower_case = True
        )
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = "max_length",
            truncation = True                    
        )
        resp = {
            "ids" : torch.tensor(inputs['input_ids'], dtype=torch.long),
            "mask" : torch.tensor(inputs['attention_mask'], dtype=torch.long),
            "token_type_ids" : torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            "targets" : torch.tensor(self.targets[idx], dtype=torch.long)
        }
        return resp

In [21]:
class TextModel(pl.LightningModule):
    def __init__(self, 
                 num_classes, 
                 num_train_steps, 
                 target_mapping,
                 task_name,
        ):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(
            "bert-base-multilingual-uncased", return_dict=False
        )        
        self.bert_norm = nn.BatchNorm1d(num_features = 768)
        self.out = nn.Linear(768, num_classes)
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"
        self._target_mapping = target_mapping
        self._task_name = task_name
        
    def forward(self, ids, mask, token_type_ids, targets=None):
        _, x = self.bert(ids, attention_mask = mask, token_type_ids=token_type_ids)
        x = self.bert_norm(x)
        x = self.out(x)
#       Calculate the loss
        if targets is not None:
            loss = self.loss(x, targets)
            met = self.monitor_metrics(x, targets)
            return x, loss, met
        return x, None, {}        
    
    def configure_optimizers(self):
        opt = AdamW(self.parameters(), lr=3e-5)
        sch = get_cosine_schedule_with_warmup(
            opt,
            num_warmup_steps=0,
            num_training_steps=self.num_train_steps
        )
        return [opt], [sch]        
    
    def loss(self, outputs, targets):
      # Multiclass
        l = nn.CrossEntropyLoss()
        l = l(outputs, targets)
        return l

    def monitor_metrics(self, outputs, targets):
        outputs = torch.argmax(outputs, axis = 1).cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()
        return {
            "f1" : metrics.f1_score(targets, outputs, average='macro'),
            "accuracy" : metrics.accuracy_score(targets, outputs)
        }

    def training_step(self, batch, batch_idx):
      targets = batch["targets"]
      input_ids = batch["ids"]
      attention_mask = batch["mask"]
      token_type_ids = batch["token_type_ids"]

      x, loss, m = self.forward(
              input_ids,
              token_type_ids=token_type_ids,
              mask=attention_mask,
              targets=targets
              )
      
      if not batch_idx:
          print("-" * 40)
          print(f"TRAINING : {self.current_epoch}")
          print(m)
      
      return loss
    
    def train_dataloader(self):
      train_text, train_target = get_clean_dataset(
          df_raw = train,
          target_mapping = self._target_mapping,
          train=True,
          task_name=self._task_name,
      )

      x = BERTDataset(
          train_text,
          train_target,
        #   max_len = 1024
      )
      return DataLoader(
          x,
          batch_size = 16,
          shuffle = True,
          pin_memory = True,
          num_workers = 2
      )

    def validation_step(self, batch, batch_idx):
      targets = batch["targets"]
      input_ids = batch["ids"]
      attention_mask = batch["mask"]
      token_type_ids = batch["token_type_ids"]

      x, loss, m = self.forward(
              input_ids,
              token_type_ids=token_type_ids,
              mask=attention_mask,
              targets=targets
              )
      
      if not batch_idx:
        print("-" * 40)
        print(f"VALIDATION : {self.current_epoch}")
        print(m)
      
      return {'val_loss' : loss.detach()}
    
    def val_dataloader(self):
      val_text, val_target = get_clean_dataset(
          df_raw = val,
          target_mapping = self._target_mapping,
          train=True,
          task_name=self._task_name,
      )
      x = BERTDataset(
          val_text,
          val_target,
        #   max_len = 1024
      )
      return DataLoader(
          x,
          batch_size = 16,
        #   shuffle = True,
          pin_memory = True,
          num_workers = 2
      )

In [20]:
## Map the output
task_1_map ={
    'NAG' : 0,
    'CAG' : 1,
    'OAG' : 2
}

## Define some training parameters
n_train_steps = int(len(train) / 32 * 10)

## Define model, callbacks
model = TextModel(
    num_classes=3, 
    num_train_steps=n_train_steps, 
    target_mapping = task_1_map, 
    task_name = 'A')

trainer = pl.Trainer(
    gpus=[0], 
    max_epochs=3,
    precision=16,
    accumulate_grad_batches = 8  
    )

# Fit
trainer.fit(model)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOC

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



----------------------------------------
VALIDATION : 0
{'f1': 0.3043478260869565, 'accuracy': 0.4375}




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

----------------------------------------
TRAINING : 0
{'f1': 0.49300699300699297, 'accuracy': 0.5}


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

----------------------------------------
VALIDATION : 0
{'f1': 0.3055555555555556, 'accuracy': 0.5}
----------------------------------------
TRAINING : 1
{'f1': 0.7186147186147186, 'accuracy': 0.75}


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

----------------------------------------
VALIDATION : 1
{'f1': 0.4876543209876543, 'accuracy': 0.875}
----------------------------------------
TRAINING : 2
{'f1': 0.9440559440559441, 'accuracy': 0.9375}


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

----------------------------------------
VALIDATION : 2
{'f1': 0.44102564102564107, 'accuracy': 0.8125}

