## Installs & Imports

In [None]:
!pip install --quiet transformers
!pip install --quiet pytorch-metric-learning

In [None]:
import json

import numpy as np
import pandas as pd

import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from torch.optim import AdamW
import torch.nn as nn
from tqdm.notebook import tqdm

from transformers import DistilBertModel, DistilBertTokenizer

from pytorch_metric_learning import losses

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    r2_score,
    classification_report
)

## Config

In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset JSON files
# read in the "Data Processing" section are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class Config:
    """Run-wide settings read by the dataset, warm-up and fine-tuning cells."""

    def __init__(self):
        # Hardware: use the GPU when torch can see one, otherwise the CPU.
        cuda_visible = torch.cuda.is_available()
        self.DEVICE = torch.device("cuda" if cuda_visible else "cpu")

        # Tokenisation: multilingual cased DistilBERT vocabulary, fixed length.
        self.TOKENIZER = DistilBertTokenizer.from_pretrained(
            'distilbert-base-multilingual-cased'
        )
        self.MAX_LEN = 128

        # Optimisation schedule.
        self.BATCH_SIZE = 4
        self.EPOCHS = 2

        # File the contrastive warm-up checkpoint is written to.
        self.TRAINED_MODEL_PATH = "distilbert_cl_baseline.pt"

In [None]:
# Single shared settings object; later cells (e.g. CustomDataset) read it as a global.
config = Config()

## Data Processing

In [None]:
# Folder on the mounted Drive holding one "<name>_dataset" directory per corpus.
DATA_ROOT = '/content/drive/MyDrive/NLP_proj_data'
# Corpora, in the order they are stacked into the combined frames.
DATASET_NAMES = ('HS-Brexit', 'MD-Agreement', 'ConvAbuse', 'ArMIS')


def load_split(dataset_name, split, data_root=DATA_ROOT):
    """Read one split of one corpus into a DataFrame with one row per item.

    Args:
        dataset_name: Corpus name, e.g. ``'HS-Brexit'``; used for both the
            ``<name>_dataset`` folder and the ``<name>_<split>.json`` file.
        split: ``'train'``, ``'dev'`` or ``'test'``.
        data_root: Folder containing the per-corpus directories.

    Returns:
        The frame produced by ``pd.read_json`` transposed, so that each item
        becomes a row.
    """
    path = f'{data_root}/{dataset_name}_dataset/{dataset_name}_{split}.json'
    return pd.read_json(path).transpose()


train_hs, train_md, train_ca, train_ar = (load_split(name, 'train') for name in DATASET_NAMES)
test_hs, test_md, test_ca, test_ar = (load_split(name, 'test') for name in DATASET_NAMES)
dev_hs, dev_md, dev_ca, dev_ar = (load_split(name, 'dev') for name in DATASET_NAMES)

# Stack the four corpora row-wise into one frame per split.
# NOTE(review): the name `train` is rebound by the `train()` function defined in
# the "CL Engine" cell, so copy this frame before running those cells.
train = pd.concat([train_hs, train_md, train_ca, train_ar], axis=0)
test = pd.concat([test_hs, test_md, test_ca, test_ar], axis=0)
dev = pd.concat([dev_hs, dev_md, dev_ca, dev_ar], axis=0)


train.head()

Unnamed: 0,text,annotation task,number of annotations,annotations,annotators,lang,hard_label,soft_label,split,other_info
1,<user> <user> I'm so glad about #Brexit.. My a...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
2,RT <user>: There was more to #Brexit than immi...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
3,"At the end of the day, the leave campaign won ...",hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
4,So the reducing migration thing wasn't quite w...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
5,A Brit Immigrant Asks Britain to Become India’...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...


In [None]:
# How often each distinct soft-label dictionary occurs in the combined training frame.
train['soft_label'].value_counts()

{'0': 1.0, '1': 0.0}                    4964
{'0': 0.8, '1': 0.2}                    1307
{'0': 0.6000000000000001, '1': 0.4}     1038
{'0': 0.0, '1': 1.0}                     862
{'0': 0.4, '1': 0.6000000000000001}      856
{'0': 0.2, '1': 0.8}                     639
{'0': 0.67, '1': 0.33}                   184
{'0': 0.33, '1': 0.67}                   136
{'0': 0.5, '1': 0.5}                     135
{'0': 0.667, '1': 0.333}                 105
{'0': 0.333, '1': 0.667}                  88
{'0': 0.8300000000000001, '1': 0.17}      74
{'0': 0.75, '1': 0.25}                    14
{'0': 0.17, '1': 0.8300000000000001}      13
{'0': 0.25, '1': 0.75}                    10
{'0': 0.167, '1': 0.833}                   4
{'0': 0.833, '1': 0.167}                   2
Name: soft_label, dtype: int64

In [None]:
# Work on copies so the concatenated frames are not modified by the cleaning below.
# (Single-corpus alternative: df = pd.read_json('/content/HS-Brexit_train.json').transpose())
df = train.copy()
df_test = test.copy()
df_dev = dev.copy()

In [None]:
def normalize_hard_labels(label):
    """Coerce a single raw hard label to a Python ``int``."""
    return int(label)


# Apply the same coercion to the hard-label column of every split.
df['hard_label'] = df['hard_label'].map(normalize_hard_labels)
df_test['hard_label'] = df_test['hard_label'].map(normalize_hard_labels)
df_dev['hard_label'] = df_dev['hard_label'].map(normalize_hard_labels)

# Class balance of the training split after coercion.
df['hard_label'].value_counts()

0    7738
1    2693
Name: hard_label, dtype: int64

In [None]:
# Discard every row that has a missing value in any column, in all three splits.
df, df_test, df_dev = (frame.dropna() for frame in (df, df_test, df_dev))

## Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenises one text per item and exposes its labels.

    Args:
        df: DataFrame with ``text``, ``hard_label`` and ``soft_label`` columns.
            Every ``soft_label`` entry must be a mapping with a ``'0'`` key.
        tokenizer: Callable tokenizer applied to each text. Defaults to
            ``config.TOKENIZER``.
        max_len: Length every sequence is padded/truncated to. Defaults to
            ``config.MAX_LEN``.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        # Fall back to the notebook-wide settings when nothing is passed, so
        # existing ``CustomDataset(df)`` calls keep working unchanged.
        self.tokenizer = config.TOKENIZER if tokenizer is None else tokenizer
        self.max_len = config.MAX_LEN if max_len is None else max_len
        self.text = df["text"].tolist()
        self.targets = [int(t) for t in df["hard_label"].tolist()]
        self.soft_labels = df['soft_label'].tolist()

    def __len__(self):
        """Number of items (rows of the source frame)."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenise item ``index`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``input_mask`` (long tensors of length
            ``max_len``), ``targets`` (hard label as a float scalar) and
            ``soft_labels`` (the soft-label value stored under key ``'0'``).
        """
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoded = self.tokenizer(
            str(self.text[index]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'input_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
            # NOTE(review): this is the value under key '0', whereas `targets`
            # is 1 for the positive class — confirm the intended polarity
            # before training on soft labels.
            'soft_labels': torch.tensor(self.soft_labels[index]['0'], dtype=torch.float),
        }

## Data Loader

In [None]:
# Training batches are reshuffled every epoch.
train_dataset = CustomDataset(df)
train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)

# Evaluation loaders keep the frame order: shuffling adds nothing at eval time
# and makes per-example outputs impossible to line up between runs.
test_dataset = CustomDataset(df_test)
test_dataloader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE, shuffle=False)

dev_dataset = CustomDataset(df_dev)
dev_dataloader = DataLoader(dev_dataset, batch_size=config.BATCH_SIZE, shuffle=False)

In [None]:
# Sanity check: print the token ids of the first training batch only, then stop.
for batch in train_dataloader:
    print(batch['input_ids'])
    break

tensor([[   101,    195,    107,  12229,  10477,    168,  18980,    107,    131,
            107, 100393,  13028,  12229,  14854,  12277,    146,  10309,  10472,
            169,  12483,    136,    107,    117,    107,  12229,  10477,    168,
          29115,    107,    131,    107,    146,  10134,  10135,  15127,  21997,
          14039,    119,    119,    119,    119,    119,    107,    117,    107,
          18980,    107,    131,    107,  24781,  10149,  13028,  41549,  10911,
          13028,  10309,  10135,  20442,  21997,  14039,  12820,  11858,    136,
            107,    117,    107,  29115,    107,    131,    107,    146,    112,
            181,  19369,  10157,  10111,  10529,  87202,  10114,  10149,    117,
          10473,    177,    112,    181,  10472,  30918,  10271,    119,    107,
            197,    102,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,    

In [None]:
# Sanity check: print the soft-label values of the first training batch only, then stop.
for batch in train_dataloader:
    print(batch['soft_labels'])
    break

tensor([1.0000, 1.0000, 0.8300, 0.6000])


## Model

In [None]:
class TransformerModel(torch.nn.Module):
    """DistilBERT encoder returning a dropout-regularised first-token embedding."""

    def __init__(self):
        super().__init__()
        self.model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
        self.drop = torch.nn.Dropout(p=0.3)

    def forward(self, ids, mask):
        """Encode a batch and return the hidden state at sequence position 0.

        Args:
            ids: Token ids passed to the encoder.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The first element of the encoder output, sliced at position 0 of
            its second axis, after dropout.
        """
        last_hidden = self.model(ids, attention_mask=mask)[0]
        first_token = last_hidden[:, 0]
        return self.drop(first_token)

## CL Engine

In [None]:
def loss_fn(outputs, targets):
    """Triplet margin loss (library defaults) over a batch of embeddings.

    Args:
        outputs: Embeddings produced by the model for the batch.
        targets: Labels used by the loss to form triplets.
    """
    triplet_loss = losses.TripletMarginLoss()
    return triplet_loss(outputs, targets)


def train(epoch, model, training_loader, device, optimizer):
    """Run one contrastive warm-up epoch and return the mean batch loss.

    Args:
        epoch: Zero-based epoch index; only used for the progress-bar label.
        model: Module called as ``model(ids, mask)`` to produce embeddings.
        training_loader: Sized iterable of batches with ``input_ids``,
            ``input_mask`` and ``targets`` tensors.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by the number of batches, or
        ``nan`` when the loader yields no batches (previously a
        ``ZeroDivisionError``).
    """
    num_batches = len(training_loader)
    progress_bar = tqdm(
        training_loader,
        total=num_batches,
        desc='Epoch ' + str(epoch + 1)
    )
    model.train()
    train_loss = 0.0
    for data in progress_bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['input_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients once per step (the original cleared them twice).
        optimizer.zero_grad()
        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        progress_bar.set_postfix({"batch_loss": batch_loss})

    if num_batches == 0:
        # A mean over zero batches is undefined; nan also never compares as
        # "better" than a previous best loss.
        return float('nan')
    return train_loss / num_batches

## Warming up DistilBERT with Contrastive Learning

In [None]:
def run(df):
    """Warm up DistilBERT with the triplet objective and checkpoint the best epoch.

    Builds a shuffled loader over ``df``, trains a fresh ``TransformerModel``
    for ``config.EPOCHS`` epochs and writes the state dict to
    ``config.TRAINED_MODEL_PATH`` whenever the epoch's mean loss improves.

    Args:
        df: DataFrame accepted by ``CustomDataset``.
    """
    # Creating dataloader
    train_dataset = CustomDataset(df)
    train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)

    # Pick the device; only query the GPU name when a GPU actually exists,
    # otherwise get_device_name(0) raises on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu")

    # Object of model and setting to device
    model = TransformerModel()
    model.to(device)

    # Split parameters into weight-decayed and non-decayed groups. DistilBERT
    # names its block norms `sa_layer_norm` / `output_layer_norm`, which the
    # bare "LayerNorm.*" patterns do not match, so match both spellings.
    no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight")
    decayed, not_decayed = [], []
    for name, param in model.named_parameters():
        if any(pattern in name for pattern in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)
    optimizer_parameters = [
        {"params": decayed, "weight_decay": 0.001},
        {"params": not_decayed, "weight_decay": 0.0},
    ]

    # Optimizer
    optimizer = torch.optim.AdamW(optimizer_parameters, lr=3e-5)

    # Training loop: keep the checkpoint with the lowest mean training loss.
    best_loss = float('inf')
    for epoch in range(config.EPOCHS):
        loss = train(epoch, model, train_dataloader, device, optimizer)
        print(f"Triplet Loss: {loss}")
        if loss < best_loss:
            torch.save(model.state_dict(), config.TRAINED_MODEL_PATH)
            best_loss = loss

In [None]:
# Contrastive warm-up on the training frame; saves the checkpoint to config.TRAINED_MODEL_PATH.
run(df)

Tesla T4


Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1:   0%|          | 0/2608 [00:00<?, ?it/s]

Triplet Loss: 0.04455796299727942


Epoch 2:   0%|          | 0/2608 [00:00<?, ?it/s]

Triplet Loss: 0.038563121056099894


## DistilBERT Model for Classification

In [None]:
class DistilBertClassifier(torch.nn.Module):
    """Frozen warmed-up encoder followed by a single-output linear head.

    Args:
        warmed_up_model: Module called as ``model(ids, mask)`` that returns a
            768-dimensional feature vector per example. All of its parameters
            are frozen, so only the linear head is trained.
    """

    def __init__(self, warmed_up_model):
        super().__init__()
        self.model = warmed_up_model
        # Freeze the encoder: fine-tuning updates the head only.
        for param in self.model.parameters():
            param.requires_grad = False
        self.drop = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one raw (un-squashed) score per example, shape ``(batch, 1)``."""
        features = self.model(ids, mask)
        # Dropout belongs on the features, not on the prediction: applied after
        # the linear layer it randomly zeroes/rescales the single output value
        # during training.
        return self.linear(self.drop(features))

In [None]:
# Rebuild the encoder and restore the warm-up weights from the same path run()
# saved them to; map_location lets a GPU-saved checkpoint load on CPU too.
warmed_up_model = TransformerModel()
warmed_up_model.load_state_dict(
    torch.load(config.TRAINED_MODEL_PATH, map_location=config.DEVICE)
)
warmed_up_model.to(config.DEVICE)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TransformerModel(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Line

## Finetuning Engine

In [None]:
# Regression objective. For a classification head, the alternative is
# torch.nn.BCEWithLogitsLoss()(outputs, targets).
def loss_fn(outputs, targets):
    """Mean squared error between ``outputs`` and ``targets`` (mean reduction)."""
    return F.mse_loss(outputs, targets)

def train(epoch, model, training_loader, device, optimizer):
    """Run one fine-tuning epoch and return the mean batch loss.

    Args:
        epoch: Zero-based epoch index; only used for the progress-bar label.
        model: Module called as ``model(ids, mask)``; its output is compared
            against the targets reshaped to ``(batch, 1)``.
        training_loader: Sized iterable of batches with ``input_ids``,
            ``input_mask`` and ``targets`` tensors.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Mean of the per-batch losses (``nan`` for an empty loader). The
        original returned ``None`` even though the caller stores the result.
    """
    num_batches = len(training_loader)
    progress_bar = tqdm(
        training_loader,
        total=num_batches,
        desc='Epoch ' + str(epoch + 1)
    )
    model.train()
    running_loss = 0.0
    for data in progress_bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['input_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients once per step (the original cleared them twice).
        optimizer.zero_grad()
        outputs = model(ids, mask)
        # Targets come as (batch,); reshape to match the (batch, 1) outputs.
        loss = loss_fn(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress_bar.set_postfix({"batch_loss": batch_loss})

    if num_batches == 0:
        return float('nan')
    return running_loss / num_batches


def validation(epoch, model, testing_loader, device, apply_sigmoid=False):
    """Run the model over a loader without gradients and collect predictions.

    Args:
        epoch: Unused; kept for signature compatibility with existing callers.
        model: Module called as ``model(ids, mask)``.
        testing_loader: Iterable of batches with ``input_ids``, ``input_mask``
            and ``targets`` tensors.
        device: Device the batch tensors are moved to.
        apply_sigmoid: Squash the outputs with a sigmoid before returning
            them. Leave ``False`` for the MSE regression objective, where the
            model is trained on raw outputs; set ``True`` only when the model
            was trained with a logits-based loss such as BCEWithLogitsLoss.

    Returns:
        ``(outputs, targets)`` as nested Python lists: one entry per example,
        in loader order, with each output row keeping the model's per-example
        shape.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['input_mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask)
            if apply_sigmoid:
                outputs = torch.sigmoid(outputs)

            fin_targets.extend(targets.cpu().tolist())
            fin_outputs.extend(outputs.cpu().tolist())
    return fin_outputs, fin_targets

## Finetune model for classification

In [None]:
def finetune(df, warmed_up_model, eval_df=None):
    """Train the linear head on top of the frozen warmed-up encoder.

    After every epoch the model is scored with ``loss_fn`` on the evaluation
    frame, and the state dict is written to
    ``finetuned_distil_bert_regress.pt`` whenever that score improves.

    Args:
        df: Training DataFrame accepted by ``CustomDataset``.
        warmed_up_model: Encoder wrapped by ``DistilBertClassifier``.
        eval_df: Frame used for the per-epoch evaluation. Defaults to the
            global ``df_test``, as in the original code.
            NOTE(review): selecting checkpoints on the test split leaks test
            information; consider passing the dev split here.
    """
    if eval_df is None:
        eval_df = df_test
    test_dataset = eval_df.reset_index(drop=True)
    train_dataset = df.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(test_dataset.shape))

    # One loader per split (the original built the training loader twice and
    # used only the second). Evaluation order is kept fixed.
    training_loader = DataLoader(
        CustomDataset(train_dataset),
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=0,
    )
    testing_loader = DataLoader(
        CustomDataset(test_dataset),
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=0,
    )

    # Pick the device; only query the GPU name when a GPU actually exists,
    # otherwise get_device_name(0) raises on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu")

    # Object of model and setting to device
    model = DistilBertClassifier(warmed_up_model)
    model.to(device)

    # Only trainable parameters go to the optimizer (the encoder is frozen);
    # biases and LayerNorm weights are exempt from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decayed, not_decayed = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if any(pattern in name for pattern in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)
    optimizer_parameters = [
        {"params": decayed, "weight_decay": 0.001},
        {"params": not_decayed, "weight_decay": 0.0},
    ]

    # Optimizer
    optimizer = torch.optim.AdamW(optimizer_parameters, lr=3e-5)

    best_val_loss = float('inf')
    for epoch in range(config.EPOCHS):
        train(epoch, model, training_loader, device, optimizer)
        outputs, targets = validation(epoch, model, testing_loader, device)

        # Flatten both sides: outputs arrive as (N, 1) and targets as (N,),
        # which would otherwise broadcast to an (N, N) difference and yield a
        # meaningless loss (PyTorch warns about exactly this).
        predictions = torch.tensor(outputs, dtype=torch.float64).reshape(-1)
        references = torch.tensor(targets, dtype=torch.float64).reshape(-1)
        val_loss = loss_fn(predictions, references).item()
        print(f"MSE Loss = {val_loss}")
        if val_loss < best_val_loss:
            torch.save(model.state_dict(), "finetuned_distil_bert_regress.pt")
            best_val_loss = val_loss

In [None]:
# Train the head on the training frame; evaluation uses the global df_test inside finetune().
finetune(df, warmed_up_model)

FULL Dataset: (10431, 10)
TRAIN Dataset: (10431, 10)
TEST Dataset: (4210, 10)
Tesla T4


Epoch 1:   0%|          | 0/2608 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


MSE Loss = 0.2759084137661302


Epoch 2:   0%|          | 0/2608 [00:00<?, ?it/s]

MSE Loss = 0.28227244712181904
