# For Fake, Hate, Offensive Datasets

# Code Objective:

*   mBERT model for fine-grained evaluation
*   Reformulating the multilabel classification problem as independent binary classifications (one-vs-rest)





# Importing Libraries

In [1]:
# Install the third-party packages this notebook needs on a fresh Colab runtime.
# NOTE(review): the installs are unpinned; the recorded output of this cell shows
# transformers 4.1.1 was resolved when the notebook was last run. Later cells use
# version-sensitive arguments/helpers (e.g. `pad_to_max_length`,
# `glue_compute_metrics`) - consider pinning when reproducing. TODO confirm.
!pip install transformers
!pip install sentencepiece

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 12.8MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 58.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=95f221

In [2]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from transformers import glue_compute_metrics
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification
from transformers import XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaConfig, XLMRobertaForSequenceClassification

# Report whether a CUDA device is visible and which torch build is running,
# so the recorded output documents the execution environment.
gpu_available = torch.cuda.is_available()
torch_version = torch.__version__
print("GPU Torch Available = {}".format(gpu_available))
print("Torch Version = {}".format(torch_version))

GPU Torch Available = True
Torch Version = 1.7.0+cu101


# Data Loading

In [15]:
# Validation Data Loading (run this cell only when evaluating on the validation data)
#
# The fine-grained multilabel evaluation is treated as several independent
# binary classifications (one-vs-rest); `dataset` picks which of them this
# run targets and, further down, which fine-tuned weights are loaded.

dataset = 'hate'                            # one of: fake, offensive, hate

# Evaluation data
file = '/content/Hostile_Validate.xlsx'
test_df = pd.read_excel(io=file)
train_df = test_df                          # placeholder train frame (unused)

In [31]:
# Test Data Loading (run this cell only when evaluating on the test data)
#
# The fine-grained multilabel evaluation is treated as several independent
# binary classifications (one-vs-rest); `dataset` picks which of them this
# run targets and, further down, which fine-tuned weights are loaded.

dataset = 'offensive'                            # one of: fake, offensive, hate

# Evaluation data; the columns are named explicitly on read
file = '/content/Hostile_Hindi_Test.xlsx'
test_columns = ['Unique ID', 'Post']
test_df = pd.read_excel(io=file, names=test_columns)
train_df = test_df                          # placeholder train frame (unused)

In [22]:
# Data Preparation into Pandas Dataframe for Model Input

def get_data(a):
  """Build the (UID, sentence, label) frame consumed by the rest of the notebook.

  Args:
    a: DataFrame with a 'Unique ID' column and a 'Post' column.

  Returns:
    A new DataFrame with columns 'UID', 'sentence' and 'label', in that order.
    'label' is a dummy column holding 0 for every row. The result carries a
    fresh default index because the columns are copied out as plain lists.
  """
  uids = list(a['Unique ID'])
  posts = list(a['Post'])

  # One placeholder label per row; real labels are not needed for prediction.
  dummy_labels = [0] * len(uids)

  return pd.DataFrame(
      {'UID': uids, 'sentence': posts, 'label': dummy_labels},
      columns = ['UID', 'sentence', 'label'],
  )

# Convert the raw sheet into the model-input frame; the "train" frame is just
# an alias of it because this notebook only runs evaluation/prediction.
test_data = get_data(test_df)
train_data = test_data                      # placeholder train frame (unused)

# Preview the first three rows
print(test_data.head(3))

   UID                                           sentence  label
0    1  कीस की को रोजगार चाहिए फिर नहीं कहना रोजगार नह...      0
1    3  कोई भी कांग्रेसी  ऊंची छत पर  रेलवे लाइन पर  ऊ...      0
2    4  अंडरवर्ल्ड डॉन छोटा राजन के भाई को बीजेपी द्वा...      0


# Model Parameters

In [7]:
model_name = 'Bert'

if (model_name == 'Bert'):
  # Bert Parameters: multilingual cased BERT configured for binary classification
  config = BertConfig.from_pretrained('bert-base-multilingual-cased',num_labels=2)
  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
  # NOTE(review): the model is built from the config only; the fine-tuned
  # weights are supplied later by the `load_state_dict` cell. If the training
  # cell is run instead, it presumably starts from non-pretrained weights -
  # confirm whether `from_pretrained` was intended for that path.
  model = BertForSequenceClassification(config)
  print('BERT Model Loaded')
else:
  # Fail fast: continuing would leave `config`, `tokenizer` and `model`
  # undefined and surface later as a NameError far from the real cause.
  raise ValueError("Choose correct Model: unsupported model_name {!r} (expected 'Bert')".format(model_name))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…


BERT Model Loaded


# Data Preparation for Model Input

In [23]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one sentence per item for sequence classification.

    Args:
        dataframe: DataFrame exposing a `sentence` column (text) and a `label`
            column (integer class id).
        tokenizer: object providing `encode_plus(text, ...)` that returns a
            mapping with 'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: sequence length every item is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.sentence = dataframe.sentence
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentence)

    def __getitem__(self, index):
        """Return the tensors for the item at *position* `index`.

        Returns:
            dict with 'input_ids', 'attention_mask', 'token_type_ids' and
            'labels', each a `torch.long` tensor.
        """
        # Positional lookup: `index` counts from 0 to len-1, which only matches
        # label-based lookup when the frame has a default RangeIndex. `.iloc`
        # keeps the dataset working on filtered/shuffled frames too.
        sentence1 = str(self.sentence.iloc[index])

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        inputs = self.tokenizer.encode_plus(sentence1,
                                            truncation=True,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            padding='max_length',
                                            return_token_type_ids=True)
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {'input_ids': torch.tensor(ids, dtype=torch.long),
                'attention_mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'labels': torch.tensor(self.targets.iloc[index], dtype=torch.long)
               }

In [24]:
# Wrap the prepared frames as datasets the Trainer can consume
MAX_LEN = 128                                                 # Max Sequence Length
training_set = CustomDataset(dataframe=train_data,
                             tokenizer=tokenizer,
                             max_len=MAX_LEN)                 # Training Set
testing_set = CustomDataset(dataframe=test_data,
                            tokenizer=tokenizer,
                            max_len=MAX_LEN)                  # Validation Set

# Training and Evaluation Phase

In [25]:
# Device Mapping Select (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the device chosen above. The previous unconditional
# `model.cuda()` ignored `device` and raised on CPU-only runtimes.
model = model.to(device)

# Training Arguments
# NOTE(review): "model_name" in `output_dir` is a literal path segment, not the
# `model_name` variable - confirm this is intended.
training_args = TrainingArguments(output_dir="./models/model_name",
                                  overwrite_output_dir=True,
                                  do_train=True,
                                  do_eval=True,
                                  per_device_train_batch_size=28,
                                  per_device_eval_batch_size=28,
                                  num_train_epochs=20,
                                  logging_steps=100,
                                  logging_first_step=True,
                                  save_steps=0,
                                  evaluation_strategy ='epoch')

# Metric for Performance Evaluation
def compute_metrics(p):
  """Score a batch of model outputs for the Trainer.

  Args:
    p: object exposing `predictions` (per-class scores, argmax taken along
       axis 1) and `label_ids` (reference labels).

  Returns:
    Whatever `glue_compute_metrics` returns for the "mnli" task name
    (presumably an accuracy dict - TODO confirm against the transformers docs).
  """
  predicted_classes = np.argmax(p.predictions, axis=1)
  reference_labels = p.label_ids
  return glue_compute_metrics("mnli", predicted_classes, reference_labels)

# Trainer wiring: model + arguments + datasets + metric callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set,
    eval_dataset=testing_set,
    compute_metrics=compute_metrics,
)

In [11]:
# Mount Google Drive at /content/drive (Colab-only). The fine-tuned weights
# loaded in the next cell are read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
# Load the fine-tuned weights that match the selected `dataset`
model_path = ('/content/drive/MyDrive/CONSTRAINT 2021 Projects (AAAI)/Hindi_Task/Weights/BERT_state_dict_'
              + dataset
              + '.pth')
# `map_location` remaps the stored tensors onto the device selected earlier
fine_tuned_state = torch.load(model_path, map_location=device)
model.load_state_dict(fine_tuned_state)

<All keys matched successfully>

In [None]:
# Skip this cell when the fine-tuned weights were loaded in the previous cell.
# NOTE(review): in this notebook `training_set` is built from the dummy train
# frame, which aliases the evaluation data with all-zero labels.

# Fine-tune the model
trainer.train()

# Score the model on the evaluation dataset
trainer.evaluate(eval_dataset=testing_set)

# Prediction

In [33]:
# Load the model, predict on the validation or test data and collect the
# labels for the selected dataset. Running this once per dataset
# (Fake, Hate, Offensive) yields three output numpy arrays of labels.

Label_Name = 'Offensive'   # Choose from (Fake, Offensive, Hate)

# `dataset` selects which fine-tuned weights were loaded, while `Label_Name`
# names the saved prediction file. Fail loudly if the two disagree instead of
# silently saving one classifier's predictions under another's name.
if Label_Name.lower() != dataset.lower():
    raise ValueError(
        "Label_Name {!r} does not match the loaded dataset {!r}".format(Label_Name, dataset))


# Prediction
def prepare_features(seq_1, max_seq_length = 128, zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    """Turn one text into model-ready token ids using the notebook-level `tokenizer`.

    Args:
        seq_1: text to tokenize.
        max_seq_length: length limit; the tokenized text is cut to
            `max_seq_length - 2` pieces regardless of the two include flags.
        zero_pad: when True, ids and mask are right-padded with 0 up to
            `max_seq_length`.
        include_CLS_token: prepend `tokenizer.cls_token` when True.
        include_SEP_token: append `tokenizer.sep_token` when True.

    Returns:
        `(input_ids, input_mask)`: a tensor of ids with a leading batch
        dimension of 1, and a plain list mask (1 for real tokens, 0 for padding).
    """
    # Tokenize the input
    word_pieces = tokenizer.tokenize(seq_1)

    # Truncate, leaving two positions for the special tokens
    piece_budget = max_seq_length - 2
    if len(word_pieces) > piece_budget:
        word_pieces = word_pieces[:piece_budget]

    # Assemble: optional CLS, the word pieces, optional SEP
    tokens = [tokenizer.cls_token] if include_CLS_token else []
    tokens.extend(word_pieces)
    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Every position produced so far holds a real token
    input_mask = [1] * len(input_ids)

    # Zero-pad ids and mask up to the sequence length
    if zero_pad:
        missing = max(max_seq_length - len(input_ids), 0)
        input_ids.extend([0] * missing)
        input_mask.extend([0] * missing)

    batched_ids = torch.tensor(input_ids).unsqueeze(0)
    return batched_ids, input_mask


def predict(text):
  """Classify one text with the notebook-level `model`.

  Args:
    text: the text to classify; it is tokenized by `prepare_features`.

  Returns:
    `(name, class_id)`: `('non_offensive', 0)` when class 0 scores highest,
    otherwise `('offensive', 1)`.
    NOTE(review): the names are fixed to the offensive task even when the
    hate or fake weights are loaded; only the numeric id is saved downstream.
  """
  model.eval()
  input_feature, _ = prepare_features(text)
  # Run on whichever device the model's weights live on, so the input always
  # matches the model (GPU or CPU).
  input_feature = input_feature.to(next(model.parameters()).device)
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    output = model(input_feature)[0]
  _, pred_label = torch.max(output, 1)
  prediction = pred_label[0].item()
  if (prediction == 0):
    return 'non_offensive',0
  else:
    return 'offensive',1

data = test_data

# Predict every post and collect both the textual and the numeric label
pred = []
pred_lab = []
# Iterate over the values directly: `data['sentence'][i]` is a label-based
# lookup and breaks when the frame does not have a default 0..n-1 index.
for text in data['sentence']:
  # Coerce to str, as CustomDataset does, so a non-string cell (e.g. NaN)
  # does not crash the tokenizer.
  pred_text , pred_label = predict(str(text))
  pred.append(pred_text)
  pred_lab.append(pred_label)

# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24; the builtin yields the same float64 array.
pred_lab = np.array(pred_lab, dtype=float)
np.save('Pred_' + Label_Name + '_Label.npy',pred_lab)