In [None]:
from google.colab import drive
drive.mount("/content/drive")

import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from tqdm import tqdm
!pip install transformers
import transformers

def deEmojify(text):
    """Return ``text`` with every run of emoji characters removed.

    Only code points inside the four Unicode ranges listed below are
    stripped; all other characters are returned unchanged.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    # One character class covering all ranges; "+" removes a whole run at once.
    emoji_run = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_run.sub("", text)



def multi_hot_encoder(labels):
  """Encode comma-separated label strings as multi-hot rows.

  Args:
    labels: iterable of strings such as ``"hate, offensive"``. ``None`` and
      NaN entries (what pandas yields for a blank CSV cell) are treated as
      "no labels" and produce an all-zero row.

  Returns:
    Float ``numpy`` array of shape ``(len(labels), 5)`` whose columns follow
    the order offensive, fake, defamation, hate, non-hostile.

  Raises:
    KeyError: if a non-empty label name is not one of the five known labels.
  """
  label_dict = {'offensive': 0, 'fake': 1, 'defamation': 2, 'hate': 3, 'non-hostile': 4}
  label_list = list(labels)
  encoded = np.zeros(shape=(len(label_list), len(label_dict)))
  for i, label in enumerate(label_list):
    # Blank cells come back from pandas as NaN (a float); they carry no labels.
    if label is None or (isinstance(label, float) and label != label):
      continue
    for name in label.split(','):
      # Drop every kind of whitespace, not only plain spaces.
      name = "".join(name.split())
      # Empty tokens appear for "" and for trailing commas such as "fake,".
      if name:
        encoded[i][label_dict[name]] = 1

  return encoded


def binary_encoder(labels):
  """Encode label strings as hostile (1) / non-hostile (0) flags.

  Only the first comma-separated label of each entry is inspected: the entry
  is marked hostile when that label is anything other than ``non-hostile``.
  Whitespace is removed before the lookup, matching ``multi_hot_encoder``, so
  values such as ``" non-hostile"`` no longer raise ``KeyError``.

  Args:
    labels: iterable of comma-separated label strings.

  Returns:
    1-D ``numpy`` array of dtype ``int32`` with one flag per entry.

  Raises:
    KeyError: if the first label of an entry is not a known label name.
  """
  label_dict = {'offensive': 0, 'fake': 1, 'defamation': 2, 'hate': 3, 'non-hostile': 4}
  first_labels = ["".join(label.split(',')[0].split()) for label in labels]
  encoded = np.zeros(shape=len(first_labels), dtype=np.int32)
  for i, name in enumerate(first_labels):
    # Indices 0-3 are the hostile classes; index 4 is non-hostile.
    if label_dict[name] < 4:
      encoded[i] = 1

  return encoded



In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# checkpoint loading and sentence_prediction also work on CPU-only runtimes.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Every post is padded/truncated to this many tokens.
MAX_LEN = 128
# Batch sizes handed to the DataLoaders (train vs. validation/test).
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# NOTE(review): EPOCHS is not referenced by the cells visible in this notebook.
EPOCHS = 5
# Pretrained checkpoint name shared by the tokenizer and both BERT models.
BERT_PATH = "bert-base-multilingual-uncased"
# Saved weights for the multi-label model and the hostile/non-hostile model.
MODEL_PATH = "/content/drive//My Drive/IR_Hindi/Models_Multi/model7.bin"
MODEL_PATH_BINARY = "/content/drive//My Drive/IR_Hindi/Models_Binary/model3.bin"
# Input CSV files.
TRAINING_FILE = "/content/drive//My Drive/IR_Hindi/train.csv"
VALIDATION_FILE = "/content/drive//My Drive/IR_Hindi/Constraint_Hindi_Valid - Sheet1.csv"
TEST_FILE = "/content/drive//My Drive/IR_Hindi/Test Set - test.csv"
# Destination of the predictions CSV written by the test cell.
FILE_SAVE_PATH = "/content/drive//My Drive/IR_Hindi/Techaos_Valid_7.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




In [None]:
class Dataset:
    """Indexable collection that tokenizes one post per lookup.

    ``review`` holds the texts and ``target`` the matching label rows; both
    are indexed with the same ``item`` key. The tokenizer and the maximum
    sequence length are taken from the module-level ``TOKENIZER`` and
    ``MAX_LEN`` settings.
    """

    def __init__(self, review, target):
        self.review, self.target = review, target
        self.tokenizer, self.max_len = TOKENIZER, MAX_LEN

    def __len__(self):
        """Number of posts in the dataset."""
        return len(self.review)

    def __getitem__(self, item):
        """Return the tokenized post and its target as a dict of tensors.

        Keys: ``ids``, ``mask``, ``token_type_ids`` (``torch.long``) and
        ``targets`` (``torch.float``).
        """
        # Cast to str and collapse all runs of whitespace to single spaces.
        text = " ".join(str(self.review[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        # Output key -> key of the tokenizer result it is built from.
        field_sources = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        sample = {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in field_sources
        }
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample

In [None]:
class BERT(nn.Module):
    """BERT encoder used here as a sentence-feature extractor.

    ``forward`` returns the encoder's pooled output after dropout. The two
    linear layers are constructed but never applied in ``forward``.
    NOTE(review): they are presumably kept so that checkpoints saved with a
    5-way classification head still load - confirm before removing them.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.2)
        self.lin1 = nn.Linear(768, 256)
        self.lin2 = nn.Linear(256, 5)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and return its pooled output passed through dropout."""
        encoder_out = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # The lin1 -> relu -> lin2 head is intentionally bypassed here.
        return self.bert_drop(encoder_out.pooler_output)

class BERT_Binary(nn.Module):
    """BERT encoder followed by a two-layer head that emits one logit per post.

    ``forward`` returns raw, un-squashed scores; callers in this notebook
    apply ``torch.sigmoid`` to them afterwards.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.2)
        self.lin1 = nn.Linear(768, 256)
        self.lin2 = nn.Linear(256, 1)

    def forward(self, ids, mask, token_type_ids):
        """Encode the batch and map the pooled output to a single score."""
        encoder_out = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled = self.bert_drop(encoder_out.pooler_output)
        hidden = nn.functional.relu(self.lin1(pooled))
        return self.lin2(hidden)

def sentence_prediction(MODEL, sentence):
    """Score a single sentence with ``MODEL`` and return sigmoid outputs.

    The sentence is cast to ``str``, whitespace-normalised, tokenized and
    padded to ``MAX_LEN`` exactly as ``Dataset.__getitem__`` does, then run
    through the model as a batch of one.

    Args:
        MODEL: callable accepting ``ids``, ``mask`` and ``token_type_ids``
            keyword tensors. The caller is responsible for putting it in
            eval mode; this function does not change the model's mode.
        sentence: text to score (any object; ``str()`` is applied).

    Returns:
        1-D ``numpy`` array holding the sigmoid of the model output for the
        sentence.
    """
    review = " ".join(str(sentence).split())

    # Let the tokenizer pad to MAX_LEN, as Dataset does, instead of padding by hand.
    inputs = TOKENIZER.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
    )

    # Send the inputs to wherever the model lives; fall back to the global
    # DEVICE for callables that expose no parameters.
    try:
        target_device = next(MODEL.parameters()).device
    except (AttributeError, StopIteration):
        target_device = DEVICE

    def as_batch(values):
        # (seq_len,) list -> (1, seq_len) long tensor on the target device.
        return torch.tensor(values, dtype=torch.long).unsqueeze(0).to(target_device, dtype=torch.long)

    ids = as_batch(inputs["input_ids"])
    mask = as_batch(inputs["attention_mask"])
    token_type_ids = as_batch(inputs["token_type_ids"])

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = MODEL(ids=ids, mask=mask, token_type_ids=token_type_ids)

    outputs = torch.sigmoid(outputs).cpu().detach().numpy()
    return outputs[0]

In [None]:
# Resolve the device first so the checkpoint is mapped onto hardware that exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert = BERT()
bert.load_state_dict(torch.load(MODEL_PATH, map_location=device))
bert.to(device)
bert.eval()

df_train = pd.read_csv(TRAINING_FILE)
df_valid = pd.read_csv(VALIDATION_FILE)
binaryClassification = False

# Matches scheme://host[/path...] style URLs.
URL_PATTERN = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', flags=re.MULTILINE)


def strip_urls(posts):
  """Return ``posts`` with every URL in each string replaced by one space.

  Non-string entries (e.g. NaN from blank CSV cells) are passed through
  unchanged and do not stop the remaining rows from being cleaned.
  """
  return posts.map(lambda post: URL_PATTERN.sub(' ', post) if isinstance(post, str) else post)


df_train['Post'] = strip_urls(df_train['Post'])
df_valid['Post'] = strip_urls(df_valid['Post'])

if binaryClassification:
  train_targets = binary_encoder(df_train['Labels Set'])
  valid_targets = binary_encoder(df_valid['Labels Set'])
else:
  train_targets = multi_hot_encoder(df_train['Labels Set'])
  valid_targets = multi_hot_encoder(df_valid['Labels Set'])

# Dataset indexes posts by position, so make sure the index is 0..n-1.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

# No shuffling: extracted features must stay aligned with the target rows.
train_dataset = Dataset(review=df_train['Post'], target=train_targets)
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4)

valid_dataset = Dataset(review=df_valid['Post'], target=valid_targets)

valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1)

In [None]:
def getSentenceEmbeddings(data_loader, model, device):
  """Run ``model`` over every batch and collect sigmoid-squashed outputs.

  The model is switched to eval mode and called without gradient tracking.
  Each batch must provide ``ids``, ``token_type_ids``, ``mask`` and
  ``targets`` tensors.

  Returns:
    ``(outputs, targets)`` as two ``numpy`` arrays with one row per sample,
    in the order the loader produced them.
  """
  model.eval()
  collected_outputs, collected_targets = [], []
  with torch.no_grad():
    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
      # Move the three integer inputs to the target device in one go.
      on_device = {
          key: batch[key].to(device, dtype=torch.long)
          for key in ("ids", "token_type_ids", "mask")
      }
      labels = batch["targets"].to(device, dtype=torch.float)

      activations = torch.sigmoid(
          model(
              on_device["ids"],
              mask=on_device["mask"],
              token_type_ids=on_device["token_type_ids"],
          )
      )
      collected_targets += labels.cpu().detach().numpy().tolist()
      collected_outputs += activations.cpu().detach().numpy().tolist()
  return np.array(collected_outputs), np.array(collected_targets)

def eval_fn(data_loader, model, device):
  """Evaluate ``model`` on ``data_loader`` and gather sigmoid scores.

  The model is put in eval mode and run under ``torch.no_grad()``. Each batch
  must provide ``ids``, ``token_type_ids``, ``mask`` and ``targets`` tensors.

  Returns:
    ``(outputs, targets)`` as two plain Python lists of per-sample rows, in
    loader order.
  """
  model.eval()
  score_rows = []
  target_rows = []

  def to_rows(tensor):
    # Tensor on any device -> list of per-sample Python lists.
    return tensor.cpu().detach().numpy().tolist()

  with torch.no_grad():
    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
      input_ids = batch["ids"].to(device, dtype=torch.long)
      segment_ids = batch["token_type_ids"].to(device, dtype=torch.long)
      attention = batch["mask"].to(device, dtype=torch.long)
      labels = batch["targets"].to(device, dtype=torch.float)

      logits = model(ids=input_ids, mask=attention, token_type_ids=segment_ids)
      scores = torch.sigmoid(logits)
      target_rows.extend(to_rows(labels))
      score_rows.extend(to_rows(scores))
  return score_rows, target_rows



In [None]:
# Push both splits through the BERT feature extractor. Each call returns the
# sigmoid-squashed model outputs plus the targets that came out of the loader;
# train_targets / valid_targets are rebound here to those returned arrays.
train_data, train_targets = getSentenceEmbeddings(train_data_loader, bert, device)
valid_data, valid_targets = getSentenceEmbeddings(valid_data_loader, bert, device)

# inds = []
# for i, target in enumerate(train_targets):
#   if target[4] == 0:
#     inds.append(i)

# train_data = train_data[inds]
# train_targets = (train_targets[inds])

100%|██████████| 716/716 [00:49<00:00, 14.42it/s]
100%|██████████| 203/203 [00:07<00:00, 26.87it/s]


In [None]:
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier


def make_label_classifier():
  """Build one binary XGBoost classifier with the settings shared by all labels."""
  return XGBClassifier(n_jobs=-1, objective="binary:logistic", max_depth=4)


# One independent binary classifier per label.
offensive = make_label_classifier()
fake = make_label_classifier()
defamation = make_label_classifier()
hate = make_label_classifier()
non_hostile = make_label_classifier()

# Tuple order matches the target columns produced by multi_hot_encoder
# (offensive, fake, defamation, hate, non-hostile).
for column, classifier in enumerate((offensive, fake, defamation, hate, non_hostile)):
  classifier.fit(train_data, train_targets[:, column])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
from sklearn import metrics

# Hostile / non-hostile gate model.
binaryModel = BERT_Binary()
binaryModel.load_state_dict(torch.load(MODEL_PATH_BINARY, map_location=device))
binaryModel.to(device)

binary_scores, _ = eval_fn(valid_data_loader, binaryModel, device)
binary_targets = binary_encoder(df_valid['Labels Set'])

# eval_fn yields one single-element row per post; flatten to a 1-D boolean
# vector so it can be used as a mask and passed to f1_score as-is.
binary_preds = np.array(binary_scores).reshape(-1) >= 0.5

# Fine-grained predictions, one column per label classifier.
preds = np.zeros((binary_preds.shape[0], 5))
for column, classifier in enumerate((offensive, fake, defamation, hate, non_hostile)):
  preds[:, column] = classifier.predict(valid_data)

# Posts the gate model calls non-hostile get no fine-grained labels at all.
preds[~binary_preds] = 0

# Only the four hostile classes (columns 0-3) are scored.
f1_score = metrics.f1_score(valid_targets[:,:4], preds[:, :4], average=None)
print("Classwise F1: ",f1_score)
f1_score = metrics.f1_score(valid_targets[:,:4], preds[:, :4], average="weighted")
print("Fine grained F1: ", f1_score)
f1 = metrics.f1_score(binary_targets, binary_preds, average="weighted")
print("Binary Classification: ", f1)
# NOTE(review): dataAnalyzer is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be run before this line, otherwise this raises
# NameError.
dataAnalyzer(valid_targets[:,:4], preds[:,:4])

In [None]:
df_test = pd.read_csv(TEST_FILE)

# Apply the same URL removal that the train/validation posts receive, so the
# classifiers see test features built from identically preprocessed text.
url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', flags=re.MULTILINE)
df_test['Post'] = df_test['Post'].map(
    lambda post: url_pattern.sub(' ', post) if isinstance(post, str) else post
)

# The test set has no labels; Dataset still needs a target per row.
dummy_targets = np.zeros((len(df_test['Post']), 5))
test_dataset = Dataset(review=df_test['Post'], target=dummy_targets)

test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1)
test_data, _ = getSentenceEmbeddings(test_data_loader, bert, device)

# Score the hostile/non-hostile gate in batches, as the validation cell does,
# instead of one forward pass per post.
binary_scores, _ = eval_fn(test_data_loader, binaryModel, device)
binary_preds = np.array(binary_scores).reshape(-1) >= 0.5

# Fine-grained predictions for the four hostile classes.
preds = np.zeros((binary_preds.shape[0], 4))
for column, classifier in enumerate((offensive, fake, defamation, hate)):
  preds[:, column] = classifier.predict(test_data)

# Posts the gate model calls non-hostile get no fine-grained labels.
preds[~binary_preds] = 0

label_dict = {0: 'offensive', 1: 'fake', 2: 'defamation', 3: 'hate', 4: 'non-hostile'}
outs = []
for pred, binary_pred in zip(preds, binary_preds):
  if binary_pred == 0:
    outs.append("non-hostile")
  else:
    # NOTE(review): a post flagged hostile for which no fine-grained
    # classifier fires ends up with an empty label string here.
    outs.append(', '.join(label_dict[j] for j, val in enumerate(pred) if val == 1))

df = pd.DataFrame({'Unique ID': range(1, len(outs) + 1), 'Labels Set': outs})
df.to_csv(FILE_SAVE_PATH, index=False)

100%|██████████| 203/203 [00:13<00:00, 15.06it/s]


In [None]:
def dataAnalyzer(targets, outputs):
  """Print per-class confusion counts for the four hostile labels.

  For each of the first four columns (offensive, fake, defamation, hate) the
  function counts how many rows are positive / negative in ``targets`` and
  how many of those are matched by ``outputs``, then prints four lists:
  true positives, total positives, true negatives and total negatives.

  Args:
    targets: 2-D indexable of ground-truth 0/1 values (rows x classes).
    outputs: 2-D indexable of predicted values with the same layout.
  """
  class_names = ('offensive', 'fake', 'defamation', 'hate')
  n_classes = len(class_names)
  true_pos = [0] * n_classes
  total_pos = [0] * n_classes
  true_neg = [0] * n_classes
  total_neg = [0] * n_classes

  for row in range(len(targets)):
    for col in range(n_classes):
      expected = targets[row][col]
      hit = expected == outputs[row][col]
      # Route the row into the positive or negative tallies for this class.
      if expected == 1:
        totals, matches = total_pos, true_pos
      else:
        totals, matches = total_neg, true_neg
      totals[col] += 1
      if hit:
        matches[col] += 1

  report = (
      ("True +ve", true_pos),
      ("Total +ve", total_pos),
      ("True -ve", true_neg),
      ("Total -ve", total_neg),
  )
  for heading, counts in report:
    print(heading)
    print(counts)

In [None]:
# The final weighted average of the four submissions to create the fifth submission.

dir_path = "/content/drive//My Drive/IR_Hindi/"
submission_files = ("Quark_1.csv", "Quark_2.csv", "Quark_3.csv", "Quark_4.csv")
df_1, df_2, df_3, df_4 = (pd.read_csv(dir_path + name) for name in submission_files)

# One voting weight per submission, in the same order as submission_files.
weights = np.array([0.58, 0.6088, 0.5832, 0.6149])
total_weight = weights.sum()

preds_1, preds_2, preds_3, preds_4 = (
    multi_hot_encoder(frame['Labels Set']) for frame in (df_1, df_2, df_3, df_4)
)

# Weighted share of submissions voting for each label, per post.
weighted_votes = sum(
    weight * votes
    for weight, votes in zip(weights, (preds_1, preds_2, preds_3, preds_4))
) / total_weight
preds_5 = weighted_votes >= 0.5

# When the submissions disagree so much that no label reaches the threshold,
# the row would be written with a blank label. Fall back to the label with
# the highest weighted vote so that every post that received at least one
# vote keeps a label.
unlabeled_rows = np.flatnonzero(~preds_5.any(axis=1) & (weighted_votes.max(axis=1) > 0))
preds_5[unlabeled_rows, weighted_votes[unlabeled_rows].argmax(axis=1)] = True

label_dict = {0: 'offensive', 1: 'fake', 2: 'defamation', 3: 'hate', 4: 'non-hostile'}
outs = [
    ', '.join(label_dict[j] for j, val in enumerate(pred) if val == 1)
    for pred in preds_5
]

df = pd.DataFrame({'Unique ID': range(1, len(outs) + 1), 'Labels Set': outs})
df.to_csv(dir_path + "Quark_5.csv", index=False)