## Installs & Imports

In [None]:
!pip install --quiet transformers
!pip install --quiet pytorch-metric-learning
!pip install --q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.8/198.8 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.6 MB/s[0m e

In [None]:
import json

import numpy as np
import pandas as pd

import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from torch.optim import AdamW
import torch.nn as nn
from tqdm.notebook import tqdm

from transformers import DistilBertModel, DistilBertTokenizer, AutoTokenizer, AutoModelForMaskedLM

from pytorch_metric_learning import losses
import wandb
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    r2_score,
    classification_report
)
# Authenticate with Weights & Biases (prompts for an API key when none is stored).
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Config

In [None]:
# Mount Google Drive at /content/drive; the dataset folder is copied from there below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class Config:
    """Hyperparameters and shared resources used by both training stages.

    Attributes are set per instance:
        MAX_LEN: maximum token length passed to the tokenizer.
        TOKENIZER: pretrained multilingual DistilBERT tokenizer.
        BATCH_SIZE: batch size for every DataLoader.
        CL_EPOCHS / FT_EPOCHS: epoch counts for the contrastive and
            fine-tuning stages.
        TRAINED_MODEL_PATH: checkpoint file written by the contrastive stage.
        DEVICE: CUDA device when available, otherwise CPU.
    """

    def __init__(self):
        # --- tokenisation ---
        self.MAX_LEN = 128
        self.TOKENIZER = DistilBertTokenizer.from_pretrained(
            'distilbert-base-multilingual-cased'
        )
        # self.TOKENIZER = AutoTokenizer.from_pretrained('xlm-roberta-base')

        # --- training schedule ---
        self.BATCH_SIZE = 4
        self.CL_EPOCHS = 25
        self.FT_EPOCHS = 25

        # --- artefacts and hardware ---
        self.TRAINED_MODEL_PATH = "distilbert_cl_baseline.pt"
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.DEVICE = torch.device(device_name)

In [None]:
# Single shared configuration instance, read as a global by the dataset and training code.
config = Config()

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [None]:
# Hyperparameters recorded with the W&B run.
# The tokenizer and device are logged by name / string rather than as live
# Python objects, so the run config stays short, readable and serialisable.
wandb_config = {
    "MAX_LEN": config.MAX_LEN,
    "TOKENIZER": config.TOKENIZER.name_or_path,
    "BATCH_SIZE": config.BATCH_SIZE,
    "CL_EPOCHS": config.CL_EPOCHS,
    "FT_EPOCHS": config.FT_EPOCHS,
    "TRAINED_MODEL_PATH": config.TRAINED_MODEL_PATH,
    "DEVICE": str(config.DEVICE),
}
wandb_config

{'MAX_LEN': 128,
 'TOKENIZER': DistilBertTokenizer(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),
 'BATCH_SIZE': 4,
 'CL_EPOCHS': 25,
 'FT_EPOCHS': 25,
 'TRAINED_MODEL_PATH': 'distilbert_cl_baseline.pt',
 'DEVICE': device(type='cuda')}

In [None]:
# Start the W&B run that receives the metrics logged by `run` and `finetune`.
wandb.init(project="nlp-project", name = 'distilbert-md-agreement-final', config = wandb_config)


[34m[1mwandb[0m: Currently logged in as: [33mtanmayja[0m ([33m544nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Copy the dataset folder from Drive into /content so the JSON files can be read from /content/Project.
# NOTE(review): for `cp`, `-d` is a link-handling option, not a "destination" flag; the
# destination is simply the last argument. Confirm `-d` is intended here.
!cp -r '/content/drive/MyDrive/Project' -d '/content/'

## Data Processing

In [None]:
# Load the train / test / dev splits of the MD-Agreement dataset.
# Each JSON file is transposed after reading so that rows are examples and
# columns are fields (see the `train.head()` preview).
# The commented-out lines are alternative datasets (HS-Brexit, ConvAbuse, ArMIS)
# and a multi-dataset concatenation that this run does not use.

# train = pd.read_json('/content/Project/HS-Brexit_train.json').transpose()
train = pd.read_json('/content/Project/MD-Agreement_train.json').transpose()
# train_ca = pd.read_json('/content/Project/ConvAbuse_train.json').transpose()
# train_ar = pd.read_json('/content/Project/ArMIS_train.json').transpose()

# test = pd.read_json('/content/Project/HS-Brexit_test.json').transpose()
test = pd.read_json('/content/Project/MD-Agreement_test.json').transpose()
# test_ca = pd.read_json('/content/drive/MyDrive/NLP_proj_data/ConvAbuse_dataset/ConvAbuse_test.json').transpose()
# test_ar = pd.read_json('/content/drive/MyDrive/NLP_proj_data/ArMIS_dataset/ArMIS_test.json').transpose()

# dev = pd.read_json('/content/Project/HS-Brexit_dev.json').transpose()
dev = pd.read_json('/content/Project/MD-Agreement_dev.json').transpose()
# dev_ca = pd.read_json('/content/drive/MyDrive/NLP_proj_data/ConvAbuse_dataset/ConvAbuse_dev.json').transpose()
# dev_ar = pd.read_json('/content/drive/MyDrive/NLP_proj_data/ArMIS_dataset/ArMIS_dev.json').transpose()

# train = pd.concat([train_hs, train_md, train_ca, train_ar], axis=0)
# test = pd.concat([test_hs, test_md, test_ca, test_ar], axis=0)
# dev = pd.concat([dev_hs, dev_md, dev_ca, dev_ar], axis=0)


train.head()

Unnamed: 0,text,annotation task,number of annotations,annotations,annotators,lang,hard_label,soft_label,split,other_info
1,<user> <user> No way Jose!!,offensiveness detection,5,0,"Ann418,Ann266,Ann149,Ann730,Ann345",en,0,"{'0': 1.0, '1': 0.0}",train,{'domain': 'Elections2020'}
2,"Good god, what is the matter with people ?",offensiveness detection,5,0,"Ann733,Ann422,Ann779,Ann514,Ann777",en,0,"{'0': 1.0, '1': 0.0}",train,{'domain': 'Covid-19'}
3,<user> <user> <user> <user> Um the Kurds are h...,offensiveness detection,5,1,"Ann425,Ann511,Ann779,Ann420,Ann721",en,0,"{'0': 0.8, '1': 0.2}",train,{'domain': 'Elections2020'}
4,What is WRONG with these people?,offensiveness detection,5,0,"Ann632,Ann179,Ann701,Ann201,Ann661",en,0,"{'0': 1.0, '1': 0.0}",train,{'domain': 'BLM'}
5,<user> This earpiece too plus a wire on his sl...,offensiveness detection,5,10000,"Ann266,Ann168,Ann149,Ann381,Ann774",en,0,"{'0': 0.8, '1': 0.2}",train,{'domain': 'Elections2020'}


In [None]:
# Frequency of each distinct soft-label dictionary in the training split.
train['soft_label'].value_counts()

{'0': 1.0, '1': 0.0}                   2303
{'0': 0.8, '1': 0.2}                   1295
{'0': 0.6000000000000001, '1': 0.4}    1032
{'0': 0.4, '1': 0.6000000000000001}     852
{'0': 0.2, '1': 0.8}                    635
{'0': 0.0, '1': 1.0}                    475
Name: soft_label, dtype: int64

In [None]:
# df = pd.read_json('/content/HS-Brexit_train.json').transpose()
# Work on copies so the frames loaded above are not modified by the preprocessing below.
df = train.copy()
df_test = test.copy()
df_dev = dev.copy()

In [None]:
def normalize_hard_labels(label):
    """Coerce a raw ``hard_label`` value to a built-in ``int``."""
    as_int = int(label)
    return as_int

# Cast the hard labels of every split to int.
df['hard_label'] = df['hard_label'].apply(normalize_hard_labels)
df_test['hard_label'] = df_test['hard_label'].apply(normalize_hard_labels)
df_dev['hard_label'] = df_dev['hard_label'].apply(normalize_hard_labels)

# Class balance of the training split.
df['hard_label'].value_counts()

0    4630
1    1962
Name: hard_label, dtype: int64

In [None]:
# Drop rows containing missing values from each split.
# NOTE(review): this runs after the int cast above, which would already fail on a
# missing hard_label; confirm whether the drop should happen first.
df = df.dropna()
df_test = df_test.dropna()
df_dev = df_dev.dropna()

## Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenises text on access and exposes both label kinds.

    Subclasses ``torch.utils.data.Dataset`` so it follows the standard PyTorch
    dataset protocol expected by ``DataLoader``.

    Args:
        df: DataFrame with ``text``, ``hard_label`` and ``soft_label`` columns.
            Each ``soft_label`` entry must be indexable with the key ``'0'``.

    Reads the module-level ``config`` for the tokenizer and ``MAX_LEN``.
    """

    def __init__(self, df):
        self.tokenizer = config.TOKENIZER
        self.text = df["text"].tolist()
        self.targets = [int(t) for t in df["hard_label"].tolist()]
        self.soft_labels = df['soft_label'].tolist()

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenise example ``index`` and return its tensors.

        Returns:
            dict with
              * ``input_ids``   - long tensor of token ids, padded/truncated to ``config.MAX_LEN``
              * ``input_mask``  - long tensor holding the tokenizer's attention mask
              * ``targets``     - float scalar tensor holding the hard label
              * ``soft_labels`` - float scalar tensor holding ``soft_label['0']``
        """
        # The second positional argument (text pair) defaulted to None in the
        # original call, so it is simply omitted here.
        inputs = self.tokenizer.encode_plus(
            str(self.text[index]),
            add_special_tokens=True,
            max_length=config.MAX_LEN,
            padding='max_length',
            truncation=True,
        )

        input_ids = inputs['input_ids']
        input_mask = inputs['attention_mask']

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'input_mask': torch.tensor(input_mask, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
            # Value stored under key '0' of the soft-label dict.
            'soft_labels': torch.tensor(self.soft_labels[index]['0'], dtype=torch.float),
        }

## Data Loader

In [None]:
# One dataset + DataLoader per split; only the training loader shuffles.
train_dataset = CustomDataset(df)
train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)

test_dataset = CustomDataset(df_test)
test_dataloader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE, shuffle=False)

dev_dataset = CustomDataset(df_dev)
dev_dataloader = DataLoader(dev_dataset, batch_size=config.BATCH_SIZE, shuffle=False)

In [None]:
# Sanity check: print the token ids of the first training batch only.
for batch in train_dataloader:
    print(batch['input_ids'])
    break

tensor([[   101,  10313,  45015,  25785,    111,  10392,  10410,    132,  10119,
          12922,  30511,    133,  29115,    135,  11023,  22881,  10114,    169,
          19774,  65042,  19826,    117,  11170,  25905,  10107,  10708,  10226,
          21766,  15230,    117,  31886,  18123,  10108,  14726,  10891,  10479,
          16938,    112,    188,  13145,  10957,    107, 109628,  38868,    107,
            119,    138,  48333,  10114,  72734,    112,    187,    187,  10343,
          31444,    107,  10104,  46128,  51818,    107,    111,  10392,  10410,
            132,    107,  10478,  18777,  18089,  11203,    107,  29956,  13820,
            119,  11696,  12647,  27874,  10380,  16745,  10108,  10380,  11299,
          25069,    102,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,    

In [None]:
# Sanity check: print the soft labels of the first training batch only.
for batch in train_dataloader:
    print(batch['soft_labels'])
    break

tensor([1.0000, 1.0000, 0.6000, 1.0000])


## Model

In [None]:
class TransformerModel(torch.nn.Module):
    """Pretrained multilingual DistilBERT encoder followed by dropout.

    ``forward`` returns the dropout-regularised hidden state at sequence
    position 0 of the encoder's first output.
    """

    def __init__(self):
        super().__init__()
        self.model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
        # self.model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
        self.drop = torch.nn.Dropout(0.3)

    def forward(self, ids, mask):
        """Encode a batch and return the position-0 embedding after dropout.

        Args:
            ids: token-id tensor passed to the encoder as its first argument.
            mask: attention-mask tensor passed as its second argument.
        """
        # First element of the encoder output; presumably (batch, seq_len, hidden) - TODO confirm.
        last_hidden = self.model(ids, mask)[0]
        first_token = last_hidden[:, 0]
        return self.drop(first_token)

## CL Engine

In [None]:
def loss_fn(outputs, targets):
    """Triplet-margin loss between embeddings ``outputs`` and their labels ``targets``.

    NOTE: a later cell redefines ``loss_fn`` for fine-tuning; re-run this cell
    before repeating the contrastive stage.
    """
    criterion = losses.TripletMarginLoss()
    return criterion(outputs, targets)


def train(epoch, model, training_loader, device, optimizer):
    """Run one epoch of contrastive (triplet-loss) training.

    NOTE: a later cell redefines ``train`` (and ``loss_fn``) for fine-tuning;
    re-run this cell before repeating the contrastive stage.

    Args:
        epoch: zero-based epoch index, used only for the progress-bar label.
        model: module called as ``model(ids, mask)`` that returns embeddings.
        training_loader: iterable of batches with ``input_ids``,
            ``input_mask`` and ``targets`` tensors.
        device: device the batch tensors are moved to.
        optimizer: optimizer stepped once per batch.

    Returns:
        Mean batch loss over the epoch.
    """
    progress_bar = tqdm(
        training_loader,
        total=len(training_loader),
        desc='Epoch ' + str(epoch + 1)
    )
    model.train()
    train_loss = 0
    for data in progress_bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['input_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients once per step (the original called zero_grad twice).
        optimizer.zero_grad()
        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        train_loss += batch_loss
        progress_bar.set_postfix({"batch_loss": batch_loss})
    return train_loss / len(training_loader)

## Warming up DistilBERT with Contrastive Learning

In [None]:
def run(df, lr=3e-3):
    """Warm up the DistilBERT encoder with triplet-loss contrastive learning.

    Trains a fresh ``TransformerModel`` for ``config.CL_EPOCHS`` epochs on
    ``df``, logs the mean epoch loss to W&B, and saves the weights to
    ``config.TRAINED_MODEL_PATH`` whenever the epoch loss improves.

    Args:
        df: DataFrame accepted by ``CustomDataset``.
        lr: AdamW learning rate. The default keeps the value this notebook
            was run with. NOTE(review): 3e-3 is far above the 3e-5 used by
            ``finetune``; confirm it is intended.
    """
    # Creating dataloader
    train_dataset = CustomDataset(df)
    train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)

    # Select the device; only query the GPU name when CUDA is actually present
    # (torch.cuda.get_device_name raises on CPU-only machines).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu")

    # Object of model and setting to device
    model = TransformerModel()
    model.to(device)

    # Parameter groups: biases and layer-norm weights are excluded from weight
    # decay. DistilBERT names its block norms in lower case (e.g.
    # `sa_layer_norm`), which the BERT-style "LayerNorm.*" patterns alone do
    # not match, so the lower-case patterns are listed as well.
    param_optimizer = list(model.named_parameters())
    no_decay = [
        "bias",
        "LayerNorm.bias",
        "LayerNorm.weight",
        "layer_norm.bias",
        "layer_norm.weight",
    ]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = torch.optim.AdamW(optimizer_parameters, lr=lr)

    # Training loop: keep the checkpoint with the lowest mean epoch loss.
    best_loss = float('inf')
    for epoch in range(config.CL_EPOCHS):
        loss = train(epoch, model, train_dataloader, device, optimizer)
        print(f"Triplet Loss: {loss}")
        wandb.log({
            'train_triplet_loss' : loss
        })
        if loss < best_loss:
            torch.save(model.state_dict(), config.TRAINED_MODEL_PATH)
            best_loss = loss

In [None]:
# Stage 1: contrastive warm-up on the training split; saves the best encoder to config.TRAINED_MODEL_PATH.
run(df)

Tesla T4


Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.0511946042276833


Epoch 2:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04625456657733099


Epoch 3:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04440344214434871


Epoch 4:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04268646662296429


Epoch 5:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04353125467633679


Epoch 6:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.041508215299278534


Epoch 7:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04400170212515485


Epoch 8:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04434917509460876


Epoch 9:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04231969004234715


Epoch 10:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.040988294412246315


Epoch 11:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04194275158241581


Epoch 12:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04117718509379187


Epoch 13:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04159520982789527


Epoch 14:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04091088503691847


Epoch 15:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.040475522029808114


Epoch 16:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04167605152080125


Epoch 17:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04098158910897678


Epoch 18:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.041416217692510575


Epoch 19:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04038334618440097


Epoch 20:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.03975761937965341


Epoch 21:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.041021993279782606


Epoch 22:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04144534798456555


Epoch 23:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.04030219904142567


Epoch 24:   0%|          | 0/1648 [00:00<?, ?it/s]

Triplet Loss: 0.040698491014852144


Epoch 25:   0%|          | 0/1648 [00:00<?, ?it/s]

## DistilBERT Classification Model

In [None]:
class DistilBertClassifier(torch.nn.Module):
    """Single-logit head on top of a (warmed-up) encoder.

    Args:
        warmed_up_model: module called as ``warmed_up_model(ids, mask)`` that
            returns a ``(batch, 768)`` feature tensor. All of its parameters
            are marked trainable so the encoder is fine-tuned end to end.
    """

    def __init__(self, warmed_up_model):
        super(DistilBertClassifier, self).__init__()
        self.model = warmed_up_model
        for param in self.model.parameters():
            param.requires_grad = True
        self.drop = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one raw (pre-sigmoid) score per example, shape ``(batch, 1)``."""
        features = self.model(ids, mask)
        # Dropout regularises the features going INTO the linear layer.
        # Applying it to the single output logit (as before) zeroed roughly
        # 30% of the training predictions and rescaled the rest.
        # In eval mode dropout is the identity, so inference is unchanged.
        return self.linear(self.drop(features))

In [None]:
# Rebuild the encoder and restore the best contrastive warm-up checkpoint.
# The path comes from config so it always matches where `run` saved the file,
# and map_location lets the checkpoint load on whichever device is available.
warmed_up_model = TransformerModel()
warmed_up_model.load_state_dict(
    torch.load(config.TRAINED_MODEL_PATH, map_location=config.DEVICE)
)
warmed_up_model.to(config.DEVICE)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TransformerModel(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Line

## Finetuning Engine

In [None]:
# def loss_fn(outputs, targets):
#     return torch.nn.BCEWithLogitsLoss()(outputs, targets)

# Regression
def loss_fn(outputs, targets):
    """Mean-squared-error between ``outputs`` and ``targets`` (regression objective)."""
    criterion = torch.nn.MSELoss()
    return criterion(outputs, targets)

def ce_loss_fn(outputs, targets):
    """Binary cross-entropy between probabilities ``outputs`` and ``targets``."""
    criterion = torch.nn.BCELoss()
    return criterion(outputs, targets)

def train(epoch, model, training_loader, device, optimizer):
    """Run one epoch of soft-label regression fine-tuning.

    The model's raw output is passed through a sigmoid before the MSE loss so
    that the training objective matches ``validation`` and ``eval_test``,
    which both score ``torch.sigmoid(outputs)``. Previously the loss was
    computed on the raw output, so the evaluated quantity was not the one
    being optimised.

    Args:
        epoch: zero-based epoch index, used only for the progress-bar label.
        model: module called as ``model(ids, mask)`` that returns one raw
            score per example.
        training_loader: iterable of batches with ``input_ids``,
            ``input_mask`` and ``soft_labels`` tensors.
        device: device the batch tensors are moved to.
        optimizer: optimizer stepped once per batch.

    Returns:
        Mean batch loss over the epoch (0.0 if the loader yields no batches).
    """
    progress_bar = tqdm(
        training_loader,
        total=len(training_loader),
        desc='Epoch ' + str(epoch + 1)
    )
    model.train()
    running_loss = 0.0
    for data in progress_bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['input_mask'].to(device, dtype=torch.long)
        targets = data['soft_labels'].to(device, dtype=torch.float)

        # Clear gradients once per step (the original called zero_grad twice).
        optimizer.zero_grad()
        outputs = model(ids, mask)
        # Targets are reshaped to a column to line up with the model output.
        loss = loss_fn(torch.sigmoid(outputs), targets.view(-1, 1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress_bar.set_postfix({"batch_loss": batch_loss})
    # max(..., 1) avoids a ZeroDivisionError on an empty loader.
    return running_loss / max(len(training_loader), 1)


def validation(epoch, model, testing_loader, device):
    """Collect sigmoid-squashed predictions and soft-label targets for a loader.

    Args:
        epoch: unused; kept for call-site compatibility.
        model: module called as ``model(ids, mask)``; switched to eval mode.
        testing_loader: iterable of batches with ``input_ids``,
            ``input_mask`` and ``soft_labels`` tensors.
        device: device the batch tensors are moved to.

    Returns:
        ``(outputs, targets)`` as plain Python lists, in loader order.
    """
    model.eval()
    collected_outputs, collected_targets = [], []
    with torch.no_grad():
        for batch in testing_loader:
            token_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention = batch['input_mask'].to(device, dtype=torch.long)
            soft = batch['soft_labels'].to(device, dtype=torch.float)

            probs = torch.sigmoid(model(token_ids, attention))
            collected_targets += soft.detach().cpu().numpy().tolist()
            collected_outputs += probs.detach().cpu().numpy().tolist()
    return collected_outputs, collected_targets

## Finetune model for classification

In [None]:
def finetune(df_train, df_dev, warmed_up_model, lr=3e-5):
    """Fine-tune the warmed-up encoder to regress soft labels.

    Wraps ``warmed_up_model`` in a ``DistilBertClassifier``, trains it for
    ``config.FT_EPOCHS`` epochs on ``df_train`` and, after every epoch,
    evaluates on ``df_dev``. Validation MSE and binary cross-entropy are
    printed and logged to W&B; the weights with the lowest validation
    cross-entropy are saved to ``finetuned_distil_bert_regress.pt``.

    Args:
        df_train: training DataFrame accepted by ``CustomDataset``.
        df_dev: validation DataFrame accepted by ``CustomDataset``.
        warmed_up_model: encoder produced by the contrastive stage. It is
            trained in place.
        lr: AdamW learning rate (default keeps the original 3e-5).
    """
    train_frame = df_train.reset_index(drop=True)
    val_frame = df_dev.reset_index(drop=True)

    print("TRAIN Dataset: {}".format(train_frame.shape))
    print("VAL Dataset: {}".format(val_frame.shape))

    # One loader per split. The original built the training dataset and loader
    # twice and used only one of them.
    training_loader = DataLoader(
        CustomDataset(train_frame),
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=0,
    )
    val_loader = DataLoader(
        CustomDataset(val_frame),
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=0,
    )

    # Select the device; only query the GPU name when CUDA is actually present
    # (torch.cuda.get_device_name raises on CPU-only machines).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu")

    # Object of model and setting to device
    model = DistilBertClassifier(warmed_up_model)
    model.to(device)

    # Parameter groups: biases and layer-norm weights are excluded from weight
    # decay. DistilBERT names its block norms in lower case (e.g.
    # `sa_layer_norm`), which the BERT-style "LayerNorm.*" patterns alone do
    # not match, so the lower-case patterns are listed as well.
    param_optimizer = list(model.named_parameters())
    no_decay = [
        "bias",
        "LayerNorm.bias",
        "LayerNorm.weight",
        "layer_norm.bias",
        "layer_norm.weight",
    ]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = torch.optim.AdamW(optimizer_parameters, lr=lr)

    best_ce_loss = float('inf')
    for epoch in range(config.FT_EPOCHS):
        # The return value of train() is not needed here.
        train(epoch, model, training_loader, device, optimizer)
        outputs, targets = validation(epoch, model, val_loader, device)

        # Build the tensors once and reduce to Python floats so printing,
        # W&B logging and the best-checkpoint comparison all use scalars.
        preds = torch.tensor(outputs, dtype=torch.float64)
        golds = torch.tensor(targets, dtype=torch.float64).view(-1, 1)
        val_loss = loss_fn(preds, golds).item()
        ce_loss = ce_loss_fn(preds, golds).item()

        print(f"MSE Loss = {val_loss}")
        print(f"CE Loss = {ce_loss}")
        wandb.log({
            'Val MSE Loss' : val_loss,
            'Val CE Loss' : ce_loss
        })
        if ce_loss < best_ce_loss:
            torch.save(model.state_dict(), "finetuned_distil_bert_regress.pt")
            best_ce_loss = ce_loss

In [None]:
# Stage 2: fine-tune on the training split, selecting the checkpoint by validation loss on the dev split.
finetune(df, df_dev, warmed_up_model)

TRAIN Dataset: (6592, 10)
VAL Dataset: (1104, 10)
Tesla T4


Epoch 1:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.11848814993239534
CE Loss = 0.6617051196334577


Epoch 2:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.11881454022520227
CE Loss = 0.6623860431984685


Epoch 3:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.11802258063071022
CE Loss = 0.6607244288479314


Epoch 4:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.11803297189162229
CE Loss = 0.6607465870832183


Epoch 5:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.11633805930405162
CE Loss = 0.6571412062552437


Epoch 6:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.10680574936336379
CE Loss = 0.636873963606989


Epoch 7:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.103281747527076
CE Loss = 0.6295567840203287


Epoch 8:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.10365579949535343
CE Loss = 0.6304490115222396


Epoch 9:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.10373067957431309
CE Loss = 0.6301914410302145


Epoch 10:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.11399879826138155
CE Loss = 0.65208402760363


Epoch 11:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.10190651518326382
CE Loss = 0.6266453769252799


Epoch 12:   0%|          | 0/1648 [00:00<?, ?it/s]

MSE Loss = 0.10316221013051229
CE Loss = 0.629606017321509


Epoch 13:   0%|          | 0/1648 [00:00<?, ?it/s]

In [None]:
# Use the device already chosen in Config (CUDA when available, else CPU)
# instead of hard-coding 'cuda', which fails on CPU-only runtimes.
device = config.DEVICE

In [None]:
# Wrap the encoder in a fresh classifier and restore the best fine-tuned
# checkpoint. The relative path is the one `finetune` saves to, and
# map_location lets the checkpoint load on whichever device is in use.
final_model = DistilBertClassifier(warmed_up_model)
final_model.load_state_dict(
    torch.load("finetuned_distil_bert_regress.pt", map_location=device)
)
final_model.to(device)

In [None]:
def eval_test(df_test, model):
    """Print the binary cross-entropy of ``model`` on the test split.

    Predictions are ``torch.sigmoid(model(ids, mask))`` and targets are the
    ``soft_labels`` tensor of each batch.

    Args:
        df_test: DataFrame accepted by ``CustomDataset``.
        model: trained module called as ``model(ids, mask)``.
    """
    # Run on whatever device the model lives on instead of relying on a
    # module-level `device` variable defined in another cell.
    device = next(model.parameters()).device

    test_loader = DataLoader(
        CustomDataset(df_test),
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=0,
    )

    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in test_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['input_mask'].to(device, dtype=torch.long)
            targets = data['soft_labels'].to(device, dtype=torch.float)

            outputs = model(ids, mask)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    # Targets are reshaped to a column to line up with the (N, 1) predictions.
    ce_loss = ce_loss_fn(
        torch.tensor(fin_outputs, dtype=float),
        torch.tensor(fin_targets, dtype=float).view(-1, 1),
    )
    print('Final CE Loss on Test Set: ', ce_loss.item())

In [None]:
# Final evaluation of the restored best checkpoint on the held-out test split.
eval_test(df_test, final_model)