<a href="https://colab.research.google.com/github/Dutta-SD/NLP/blob/master/Aggression_English_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aggression Detection in English language model

In [1]:
!pip install tez -qq
!pip install transformers -qq

[K     |████████████████████████████████| 1.9MB 18.2MB/s 
[K     |████████████████████████████████| 3.2MB 47.6MB/s 
[K     |████████████████████████████████| 890kB 46.3MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [40]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import confusion_matrix, cohen_kappa_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
import torch
import tez
import transformers
import torch.nn as nn
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn import metrics
import pandas as pd

In [3]:
class BERTDataset():
    def __init__(self, texts, targets, max_len  = 64):
        self.texts = texts
        self.targets = targets
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased",
            do_lower_case = False
        )
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = "max_length",
            truncation = True                    
        )
        resp = {
            "ids" : torch.tensor(inputs['input_ids'], dtype=torch.long),
            "mask" : torch.tensor(inputs['attention_mask'], dtype=torch.long),
            "token_type_ids" : torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            "targets" : torch.tensor(self.targets[idx], dtype=torch.long)
        }
        return resp


class TextModel(tez.Model):
    def __init__(self, num_classes, num_train_steps):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(
            "bert-base-uncased", return_dict=False
        )        
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, num_classes)
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"
        
    def forward(self, ids, mask, token_type_ids, targets=None):
        _, x = self.bert(ids, attention_mask = mask, token_type_ids=token_type_ids)
        x = self.bert_drop(x)
        x = self.out(x)
#       Calculate the loss
        if targets is not None:
            loss = self.loss(x, targets)
            met = self.monitor_metrics(x, targets)
            return x, loss, met
        return x, None, {}        
    
    def fetch_optimizer(self):
        opt = AdamW(self.parameters(), lr=1e-4)
        return opt
    
    def fetch_scheduler(self):
        sch = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=self.num_train_steps
        )
        return sch
    
    def loss(self, outputs, targets):
      # Multiclass
        l = nn.CrossEntropyLoss()
        l = l(outputs, targets)
        return l

    def monitor_metrics(self, outputs, targets):
        outputs = torch.argmax(outputs, axis = 1).cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()
        return {
            "accuracy" : metrics.accuracy_score(targets, outputs)
        }

In [42]:
def get_clean_dataset(df_raw, task_name='A', target_mapping = None):
  # task_name - A or B
  assert target_mapping is not None, "NO TARGET MAPPING FOUND"

  col_str = f'Sub-task {task_name}'
  df_raw.drop(['ID'], axis = 1, inplace = True) if 'ID' in df_raw.columns else None #Drop the id column

  targets = df_raw[col_str].map(target_mapping).values
  text = df_raw['Text'].values

  return text, targets

In [5]:
train = pd.read_csv('https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/trac2_eng_train.csv')
val = pd.read_csv('https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/trac2_eng_dev.csv')
train.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C45.451,Next part,NAG,NGEN
1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN
2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN
3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN
4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN


In [6]:
task_1_map ={
    'NAG' : 0,
    'CAG' : 1,
    'OAG' : 2
}

train_dataset = BERTDataset(*get_clean_dataset(train, 'A', task_1_map))
valid_dataset = BERTDataset(*get_clean_dataset(val, 'A', task_1_map))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [8]:
n_train_steps = int(len(train) / 32 * 10)
model = TextModel(num_classes=3, num_train_steps=n_train_steps)
es = tez.callbacks.EarlyStopping(monitor="valid_loss", patience=3, model_path="model.bin")

model.fit(train_dataset,
          valid_dataset=valid_dataset,
          device="cuda",
          epochs = 10,
          train_bs = 32,
         callbacks=[es]
         )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




100%|██████████| 134/134 [00:45<00:00,  2.91it/s, accuracy=0.786, loss=0.588, stage=train]
100%|██████████| 67/67 [00:04<00:00, 14.60it/s, accuracy=0.788, loss=0.596, stage=valid]


Validation score improved (inf --> 0.595623293474539). Saving model!


100%|██████████| 134/134 [00:48<00:00,  2.75it/s, accuracy=0.808, loss=0.474, stage=train]
100%|██████████| 67/67 [00:04<00:00, 13.48it/s, accuracy=0.725, loss=0.595, stage=valid]
  0%|          | 0/134 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 3


100%|██████████| 134/134 [00:49<00:00,  2.71it/s, accuracy=0.863, loss=0.358, stage=train]
100%|██████████| 67/67 [00:04<00:00, 13.98it/s, accuracy=0.769, loss=0.589, stage=valid]


Validation score improved (0.595623293474539 --> 0.5891136315776341). Saving model!


100%|██████████| 134/134 [00:49<00:00,  2.69it/s, accuracy=0.929, loss=0.212, stage=train]
100%|██████████| 67/67 [00:04<00:00, 13.98it/s, accuracy=0.766, loss=0.756, stage=valid]
  0%|          | 0/134 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 3


100%|██████████| 134/134 [00:49<00:00,  2.71it/s, accuracy=0.962, loss=0.118, stage=train]
100%|██████████| 67/67 [00:04<00:00, 13.74it/s, accuracy=0.782, loss=0.844, stage=valid]
  0%|          | 0/134 [00:00<?, ?it/s]

EarlyStopping counter: 2 out of 3


100%|██████████| 134/134 [00:49<00:00,  2.71it/s, accuracy=0.979, loss=0.0696, stage=train]
100%|██████████| 67/67 [00:04<00:00, 13.86it/s, accuracy=0.777, loss=0.965, stage=valid]

EarlyStopping counter: 3 out of 3





In [24]:
model.save('model.bin')

In [41]:
y_true = valid_dataset.targets
preds = model.predict(valid_dataset, batch_size=len(y_true))
for i in preds:
  p = np.argmax(i, axis = 1)
  print()
  print(
      confusion_matrix(y_true, p)
  )
  print(
      classification_report(y_true, p)      
  )
  break








  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A


[[739  54  43]
 [ 62  39  16]
 [ 45  18  50]]
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       836
           1       0.35      0.33      0.34       117
           2       0.46      0.44      0.45       113

    accuracy                           0.78      1066
   macro avg       0.56      0.55      0.56      1066
weighted avg       0.77      0.78      0.77      1066

