In [1]:
# Mount Google Drive (Colab only) so the data and model paths under
# /content/drive used by the rest of the notebook can be resolved.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from tqdm import tqdm

!pip install transformers

import transformers

In [None]:
def deEmojify(text):
    """Return ``text`` with every run of emoji characters removed.

    Only the four Unicode ranges listed below are stripped; any other
    character (including symbols outside these ranges) is left untouched.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_run = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_run.sub("", text)

In [3]:
def dataAnalyzer(targets, outputs):
  """Print per-label agreement counts between targets and predictions.

  Both arguments are indexed as ``x[sample][label]`` over the five labels of
  ``label_dict``. For every label the function prints, in order:
  the number of positives (target == 1) predicted correctly, the number of
  positives, the number of non-positives predicted correctly, and the number
  of non-positives. Nothing is returned.
  """
  label_dict = {'offensive': 0, 'fake': 1, 'defamation': 2, 'hate': 3, 'non-hostile': 4}
  num_labels = len(label_dict)
  true_pos = [0] * num_labels
  total_pos = [0] * num_labels
  true_neg = [0] * num_labels
  total_neg = [0] * num_labels

  for row in range(len(targets)):
    for col in range(num_labels):
      expected = targets[row][col]
      # Pick the pair of counters that matches the class of this target.
      if expected == 1:
        total, correct = total_pos, true_pos
      else:
        total, correct = total_neg, true_neg
      total[col] += 1
      if expected == outputs[row][col]:
        correct[col] += 1

  report = (
      ("True +ve", true_pos),
      ("Total +ve", total_pos),
      ("True -ve", true_neg),
      ("Total -ve", total_neg),
  )
  for heading, counts in report:
    print(heading)
    print(counts)

In [4]:
def multi_hot_encoder(labels):
  """Encode comma-separated label strings as a multi-hot matrix.

  Args:
    labels: iterable of strings such as ``"fake,hate"``. Whitespace around
      each label name is ignored, so ``"fake, hate"`` is accepted as well.

  Returns:
    A float ``numpy`` array of shape ``(len(labels), 5)`` whose column order
    follows ``label_dict``; entry ``[i][k]`` is 1 when sample ``i`` carries
    label ``k`` and 0 otherwise.

  Raises:
    KeyError: if a label name is not one of the five known labels.
  """
  label_dict = {'offensive': 0, 'fake': 1, 'defamation': 2, 'hate': 3, 'non-hostile': 4}
  label_sets = [entry.split(',') for entry in labels]
  encoded = np.zeros(shape=(len(label_sets), len(label_dict)))
  for row, names in enumerate(label_sets):
    for name in names:
      # Strip so that "fake, hate" does not fail on the key " hate".
      encoded[row][label_dict[name.strip()]] = 1

  return encoded


def binary_encoder(labels):
  """Encode comma-separated label strings as hostile (1) / non-hostile (0).

  Only the first label of each entry is inspected: the sample is marked 1
  when that label is anything other than ``'non-hostile'``. Whitespace around
  the label name is ignored.

  Args:
    labels: iterable of strings such as ``"fake,hate"`` or ``"non-hostile"``.

  Returns:
    A 1-D ``numpy`` array of dtype ``int32`` with one 0/1 entry per sample.

  Raises:
    KeyError: if the first label of an entry is not a known label name.
  """
  label_dict = {'offensive': 0, 'fake': 1, 'defamation': 2, 'hate': 3, 'non-hostile': 4}
  non_hostile_index = label_dict['non-hostile']
  label_sets = [entry.split(',') for entry in labels]
  encoded = np.zeros(shape=len(label_sets), dtype=np.int32)
  for row, names in enumerate(label_sets):
    # Strip so that a padded first label (" fake") still resolves.
    if label_dict[names[0].strip()] < non_hostile_index:
      encoded[row] = 1

  return encoded


In [5]:
# Constants
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Token length every encoded post is truncated/padded to.
MAX_LEN = 128
# Batch sizes of the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of training epochs performed by run().
EPOCHS = 5
# Name passed to from_pretrained() for both the tokenizer and the BertModel.
BERT_PATH = "bert-base-multilingual-uncased"
# Checkpoint files for the multi-label and the binary classifier weights.
MODEL_PATH = "/content/drive//My Drive/IR_Hindi/Models_Multi/model9.bin"
MODEL_PATH_BINARY = "/content/drive//My Drive/IR_Hindi/Models_Binary/model3.bin"
# Input CSVs; the notebook reads their 'Post' column (and 'Labels Set' for
# the training and validation files).
TRAINING_FILE = "/content/drive//My Drive/IR_Hindi/train.csv"
VALIDATION_FILE = "/content/drive//My Drive/IR_Hindi/Constraint_Hindi_Valid - Sheet1.csv"
TEST_FILE = "/content/drive//My Drive/IR_Hindi/Test Set - test.csv"
# Destination CSV for the test-set predictions.
FILE_SAVE_PATH = "/content/drive//My Drive/IR_Hindi/Quark_test_1.csv"
# Tokenizer instance shared by Dataset and sentence_prediction; it is created
# once here when the cell runs.
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




In [6]:
class Dataset:
    """Indexable collection that tokenizes one post per lookup.

    ``review`` and ``target`` are parallel, index-addressable containers.
    Each item is a dict of tensors: ``ids``, ``mask`` and ``token_type_ids``
    (long) produced by the shared tokenizer with padding/truncation to
    ``MAX_LEN``, plus ``targets`` (float) taken from ``target``.
    """

    def __init__(self, review, target):
        self.review, self.target = review, target
        self.tokenizer, self.max_len = TOKENIZER, MAX_LEN

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        # Collapse every whitespace run to a single space before tokenizing.
        text = " ".join(str(self.review[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        # Output key -> key in the tokenizer's result.
        field_map = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        sample = {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in field_map
        }
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample


In [7]:
class BERT(nn.Module):
    """BERT encoder followed by a two-layer head emitting five logits.

    The pooled encoder output goes through dropout, a 768->256 linear layer
    with ReLU, and a 256->5 linear layer. The returned values are raw logits
    (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.2)
        self.lin1 = nn.Linear(768, 256)
        self.lin2 = nn.Linear(256, 5)

    def forward(self, ids, mask, token_type_ids):
        encoded = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled = self.bert_drop(encoded.pooler_output)
        hidden = self.lin1(pooled)
        return self.lin2(nn.functional.relu(hidden))

class BERT_Binary(nn.Module):
    """BERT encoder followed by a two-layer head emitting a single logit.

    The pooled encoder output goes through dropout, a 768->256 linear layer
    with ReLU, and a 256->1 linear layer. The returned value is a raw logit
    (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.2)
        self.lin1 = nn.Linear(768, 256)
        self.lin2 = nn.Linear(256, 1)

    def forward(self, ids, mask, token_type_ids):
        encoded = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled = self.bert_drop(encoded.pooler_output)
        hidden = self.lin1(pooled)
        return self.lin2(nn.functional.relu(hidden))

In [8]:
# Change loss function for multi-label
def loss_fn(outputs, targets):
    """Return the binary cross-entropy between raw logits and same-shaped targets.

    Uses ``nn.BCEWithLogitsLoss`` with its default settings.
    """
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

def loss_fn_binary(outputs, targets):
  """Return the binary cross-entropy between logits and a flat target vector.

  ``targets`` is viewed as a single column (``view(-1, 1)``) before the
  loss is computed with ``nn.BCEWithLogitsLoss`` default settings.
  """
  column_targets = targets.view(-1, 1)
  criterion = nn.BCEWithLogitsLoss()
  return criterion(outputs, column_targets)


def train_fn(data_loader, model, optimizer, device, scheduler, isBinary):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Each batch dict must provide ``ids``, ``mask``, ``token_type_ids`` and
    ``targets``. The loss is ``loss_fn_binary`` when ``isBinary`` is truthy and
    ``loss_fn`` otherwise. The optimizer and the scheduler are both stepped
    once per batch. Nothing is returned.
    """
    model.train()

    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
        # Move the whole batch to the target device with explicit dtypes.
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn_binary(logits, targets) if isBinary else loss_fn(logits, targets)

        loss.backward()
        optimizer.step()
        scheduler.step()


def eval_fn(data_loader, model, device):
    """Score every batch of ``data_loader`` without tracking gradients.

    Returns a pair ``(outputs, targets)`` of Python lists with one entry per
    sample: ``outputs`` holds the sigmoid of the model's logits and ``targets``
    the corresponding values from the batch.
    """
    model.eval()
    collected_outputs, collected_targets = [], []

    with torch.no_grad():
        for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            probabilities = torch.sigmoid(logits)

            collected_targets.extend(targets.cpu().detach().numpy().tolist())
            collected_outputs.extend(probabilities.cpu().detach().numpy().tolist())

    return collected_outputs, collected_targets

In [9]:
def sentence_prediction(MODEL, sentence):
    """Return the sigmoid scores ``MODEL`` assigns to a single sentence.

    The sentence is whitespace-normalized and tokenized with the shared
    ``TOKENIZER`` using the same settings as ``Dataset`` (truncate and pad to
    ``MAX_LEN``), then scored as a batch of one on ``DEVICE``.

    Args:
        MODEL: module called as ``MODEL(ids=..., mask=..., token_type_ids=...)``
            and returning logits. This function does not change its
            train/eval mode; the caller is expected to have set it.
        sentence: text to classify; it is converted with ``str()`` first.

    Returns:
        A 1-D ``numpy`` array with one probability per model output.
    """
    review = " ".join(str(sentence).split())

    # Let the tokenizer pad (as Dataset does) instead of appending a
    # hard-coded 0, so the pad id always comes from the tokenizer itself.
    inputs = TOKENIZER.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
    )

    def _as_batch(values):
        # Shape (1, MAX_LEN) long tensor placed on the inference device.
        return torch.tensor(values, dtype=torch.long).unsqueeze(0).to(DEVICE)

    ids = _as_batch(inputs["input_ids"])
    mask = _as_batch(inputs["attention_mask"])
    token_type_ids = _as_batch(inputs["token_type_ids"])

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = MODEL(ids=ids, mask=mask, token_type_ids=token_type_ids)

    return torch.sigmoid(outputs).cpu().numpy()[0]

In [None]:
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Matches scheme://host[/path...] style URLs inside a post.
_URL_PATTERN = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'


def _strip_urls(posts):
    """Return a list of ``posts`` where URLs in string entries become a space."""
    # Non-string entries (e.g. NaN for an empty cell) are passed through
    # unchanged so that one missing post does not stop the cleaning.
    return [
        re.sub(_URL_PATTERN, ' ', post, flags=re.MULTILINE) if isinstance(post, str) else post
        for post in posts
    ]


def run(binaryClassification=False):
    """Fine-tune a classifier and keep the checkpoint with the best validation F1.

    Reads ``TRAINING_FILE`` and ``VALIDATION_FILE``, removes URLs from the
    'Post' column, encodes 'Labels Set', trains for ``EPOCHS`` epochs and,
    after each epoch, saves the model's ``state_dict`` whenever the weighted
    F1 on the validation set improves.

    Args:
        binaryClassification: when True train ``BERT_Binary`` on hostile vs.
            non-hostile targets and save to ``MODEL_PATH_BINARY``; otherwise
            train the five-label ``BERT`` model and save to ``MODEL_PATH``.
    """
    df_train = pd.read_csv(TRAINING_FILE)
    df_valid = pd.read_csv(VALIDATION_FILE)

    # Url Removal
    df_train['Post'] = _strip_urls(df_train['Post'])
    df_valid['Post'] = _strip_urls(df_valid['Post'])

    encode = binary_encoder if binaryClassification else multi_hot_encoder
    train_targets = encode(df_train['Labels Set'])
    valid_targets = encode(df_valid['Labels Set'])

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = Dataset(review=df_train['Post'], target=train_targets)
    # Shuffle so that batches are not presented in file order every epoch.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4
    )
    valid_dataset = Dataset(review=df_valid['Post'], target=valid_targets)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device(DEVICE)
    model = BERT_Binary() if binaryClassification else BERT()
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    # train_fn steps the scheduler once per batch, so count batches (the last
    # partial batch included) rather than flooring rows / batch size.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_f1_score = 0
    save_path = MODEL_PATH_BINARY if binaryClassification else MODEL_PATH
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler, binaryClassification)
        outputs, targets = eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        targets = np.array(targets)
        if binaryClassification:
            # Binary targets are 1-D and outputs are (N, 1); flatten both
            # instead of slicing label columns that do not exist.
            f1_score = metrics.f1_score(
                targets.ravel().astype(int), outputs.ravel().astype(int), average="weighted"
            )
        else:
            # Score only the four hostile labels (columns 0-3).
            f1_score = metrics.f1_score(targets[:, :4], outputs[:, :4], average="weighted")
        print(f"F1 Score = {f1_score}")
        if f1_score > best_f1_score:
            torch.save(model.state_dict(), save_path)
            best_f1_score = f1_score

# Start training with run()'s default argument (the five-label BERT model)
# when this cell is executed.
if __name__ == "__main__":
    run()


In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Reload the saved multi-label checkpoint and score it on the validation set.
MODEL = BERT()
MODEL.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device(DEVICE)))
MODEL.to(DEVICE)
MODEL.eval()

# Just for reference
# label_dict = {'offensive': 0, 'fake': 1, 'defamation': 2, 'hate': 3, 'non-hostile': 4}

df_valid = pd.read_csv(VALIDATION_FILE)
# Apply the same URL removal that run() applies to the validation posts, so
# the model is scored on text preprocessed the way it was during training.
df_valid['Post'] = [
    re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', post, flags=re.MULTILINE)
    if isinstance(post, str) else post
    for post in df_valid['Post']
]
valid_targets = multi_hot_encoder(df_valid['Labels Set'])
valid_dataset = Dataset(review=df_valid['Post'], target=valid_targets)
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)
outputs, targets = eval_fn(valid_data_loader, MODEL, DEVICE)
# Threshold the sigmoid scores at 0.5 to get per-label decisions.
preds = np.array(outputs) >= 0.5
# Weighted F1 over all five labels (including 'non-hostile').
f1 = metrics.f1_score(targets, preds, average="weighted")
print(f"F1 Score = {f1}")

dataAnalyzer(targets, preds)


In [None]:
# Weighted F1 restricted to the first four label columns (index 4 is
# 'non-hostile' in the label order used by this notebook).
hostile_targets = np.array(targets)[:, :4]
hostile_preds = np.array(preds)[:, :4]
f1 = metrics.f1_score(hostile_targets, hostile_preds, average="weighted")
print(f"F1 Score = {f1}")

In [None]:
from sklearn import metrics

# Load both trained classifiers in eval mode.
MODEL = BERT()
binary = BERT_Binary()
MODEL.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device(DEVICE)))
binary.load_state_dict(torch.load(MODEL_PATH_BINARY, map_location=torch.device(DEVICE)))
MODEL.to(DEVICE)
binary.to(DEVICE)
MODEL.eval()
binary.eval()

df_test = pd.read_csv(TEST_FILE)

# Remove URLs from the TEST posts themselves (the previous loop iterated the
# training frame) and only then take the column used for prediction.
# Non-string entries such as NaN are left as they are.
df_test['Post'] = [
    re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', post, flags=re.MULTILINE)
    if isinstance(post, str) else post
    for post in df_test['Post']
]
test_data = df_test['Post']

preds = []
binary_preds = []
# Inference only, so no gradients need to be tracked.
with torch.no_grad():
  for post in test_data:
    preds.append(sentence_prediction(MODEL, post))
    binary_preds.append(sentence_prediction(binary, post)[0])

preds = np.array(preds) >= 0.5
binary_preds = np.array(binary_preds) >= 0.5
outs = []
label_dict = {0: 'offensive', 1: 'fake', 2: 'defamation', 3: 'hate', 4: 'non-hostile'}
for pred, binary_pred in zip(preds, binary_preds):
  if not binary_pred:
    # The binary model decides hostile vs. non-hostile first.
    outs.append("non-hostile")
  else:
    # NOTE(review): when the binary model says hostile but no label passes
    # the 0.5 threshold, the label set written for that row is empty.
    outs.append(", ".join(label_dict[j] for j, val in enumerate(pred) if val))

to_save = list(zip(range(1, len(outs) + 1), outs))
df = pd.DataFrame(to_save,
               columns =['Unique ID', 'Labels Set'])
df.to_csv(FILE_SAVE_PATH, index=False)

In [None]:
# Tally, per sample, how many targets equal 1 ("hostile") and how many do not,
# and in each group how many targets compare equal to the matching output.
# NOTE(review): this cell reads `targets` and `outputs` from earlier kernel
# state. The comparisons below only act as per-sample checks if both hold one
# 0/1 value per sample (i.e. binary targets and already-thresholded outputs);
# confirm which evaluation produced them before trusting the counts.
hCount, nhCount = 0, 0
cH, incH = 0, 0
for i in range(len(targets)):
  if targets[i] == 1:
    # Target is 1: count it, and count it as correct when the output agrees.
    hCount += 1
    if targets[i] == outputs[i]:
      cH += 1
  else:
    # Target is not 1: same bookkeeping for the other group.
    nhCount += 1
    if targets[i] == outputs[i]:
      incH += 1

print("Hostile Count: ", hCount)
print("Correct Hostile: ", cH)
print("Non-hostile Count: ", nhCount)
print("Correct Non-hostile: ", incH)
# accuracy = metrics.accuracy_score(targets, outputs)
# print(f"Accuracy Score = {accuracy}")

F1 Score = 0.9963337988826816
Hostile Count:  2678
Correct Hostile:  2675
Non-hostile Count:  3050
Correct Non-hostile:  3032
