In [1]:
import gc
import os
import sys
import time
import pickle
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
import math
import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import warnings
warnings.simplefilter('ignore')

In [2]:
! nvidia-smi

Wed Feb  1 12:56:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
EPOCHS = 12
lr = 2e-5
SEED = 42
MAX_LEN = 128
BATCH_SIZE = 24
accumulation_steps = 4
seed_everything(SEED)

In [6]:
model_path = '../input/huggingfacedebertav3variants/deberta-v3-small'
data_path = '../input/data-for-distilation' 
train = pd.read_csv('../input/data-for-distilation/Clinc_Train.csv')
valid = pd.read_csv('../input/data-for-distilation/Clinc_valid.csv')
n_classes = np.unique(train.Target).shape[0]
train.head(2)

Unnamed: 0,Text,Target,intent
0,what expression would i use to say i love you ...,61,translate
1,can you tell me how to say 'i do not speak muc...,61,translate


In [7]:
class ArcModule(nn.Module):
    def __init__(self, in_features=768, out_features=151, s = 10, m =0.5):
        super().__init__()
        self.in_features = 768
        self.out_features = 151
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_normal_(self.weight)

        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = torch.tensor(math.cos(math.pi - m))
        self.mm = torch.tensor(math.sin(math.pi - m) * m)

    def forward(self, inputs, labels):
        cos_th = F.linear(inputs, F.normalize(self.weight))
        cos_th = cos_th.clamp(-1, 1)
        sin_th = torch.sqrt(1.0 - torch.pow(cos_th, 2))
        cos_th_m = cos_th * self.cos_m - sin_th * self.sin_m
        # print(type(cos_th), type(self.th), type(cos_th_m), type(self.mm))
        cos_th_m = torch.where(cos_th > self.th, cos_th_m, cos_th - self.mm)

        cond_v = cos_th - self.th
        cond = cond_v <= 0
        cos_th_m[cond] = (cos_th - self.mm)[cond]

        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        onehot = torch.zeros(cos_th.size()).cuda()
        labels = labels.type(torch.LongTensor).cuda()
        onehot.scatter_(1, labels, 1.0)
        outputs = onehot * cos_th_m + (1.0 - onehot) * cos_th
        outputs = outputs * self.s
        return outputs

In [8]:
class callback:
    def __init__(self):
        self.loss = list()
        self.model = list()
    
    def put(self, model, loss):
        self.loss.append(loss)
        self.model.append(model)

    def get_model(self):
        ind = np.argmin(self.loss)
        return self.model[ind]

def freeze(module):
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
    return freezed_parameters

class ClinicModel(nn.Module):
    def __init__(self, model_path):
        super(ClinicModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
#         freeze((self.model).embeddings)
        self.output = nn.Linear(768, n_classes)
        self.margin = ArcModule(in_features=768, out_features = n_classes)

    def forward(self, ids, mask):
        sequence_output = self.model(ids, mask)[0][:, 0, :]
        logits = self.output(sequence_output)#self.margin(sequence_output,151)
        return logits


class ClinicDataset(Dataset):
    def __init__(self, data,is_test=False):
        self.X = data['Text'].values
        self.Y = data['Target'].values
        self.is_test = is_test
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.MAX_LEN = 128
        
    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(self.X[idx],
            add_special_tokens=True,
            truncation=True,
            max_length=self.MAX_LEN
        )['input_ids'] 

        if not self.is_test:
            target_value = self.Y[idx]
      
        mask = [1]*len(inputs) + [0] * (self.MAX_LEN - len(inputs)) 
        mask = torch.tensor(mask, dtype=torch.long)
        
        if len(inputs) != self.MAX_LEN:
            inputs = inputs + [self.tokenizer.pad_token_id] * (self.MAX_LEN - len(inputs)) 
        ids = torch.tensor(inputs, dtype=torch.long)
        
        
        
        
        if self.is_test:
            return {
                'ids': ids,
                'mask': mask,
            }
        
        else:
            targets = torch.FloatTensor(target_value)
            return {
                'ids': ids,
                'mask': mask,
                'targets': targets
            }
        
    def __len__(self):
        return len(self.Y)

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
train_texts = train['Text'].values.tolist()
val_texts = valid['Text'].values.tolist()
train_labels = train['Target'].values.tolist()
val_labels = valid['Target'].values.tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

class ClinicDatasetV2(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return {'ids':item.get('input_ids'),'mask':item.get('attention_mask'),'labels':item.get('labels')}

    def __len__(self):
        return len(self.labels)

train_loader = torch.utils.data.DataLoader(ClinicDatasetV2(train_encodings, train_labels),batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(ClinicDatasetV2(val_encodings, val_labels),batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
# for data in train_loader:
#     print(data)
#     break


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# train_loader = torch.utils.data.DataLoader(ClinicDataset(train), batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
# val_loader = torch.utils.data.DataLoader(ClinicDataset(valid), batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [11]:
# for data in train_loader:
#     print(data)
#     break

In [12]:
def valid_func(model,valid_loader):
    model.eval()
    bar = tqdm(valid_loader,file=sys.stdout)
    loss_fn = torch.nn.CrossEntropyLoss()
    PROB = []
    TARGETS = []
    losses = []
    PREDS = []

    with torch.no_grad():
        for batch_idx, (data) in enumerate(bar):
            input_ids = data['ids'].cuda()
            input_masks = data['mask'].cuda()
            targets = data['labels'].long().view(-1).cuda()

            logits = model(input_ids,input_masks)

            PREDS += [torch.argmax(logits, 1).detach().cpu()]
            TARGETS += [targets.detach().cpu()]

            loss = loss_fn (logits, targets)
            losses.append(loss.item())
           
            bar.set_description(f'loss: {loss.item():.5f}')

    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    accuracy = (PREDS==TARGETS).mean()
   
    loss_valid = np.mean(losses)
    return loss_valid, accuracy


In [13]:
use_amp = True
debug = False
gc.collect()
best_epoch_loss = np.inf
# train_loader = torch.utils.data.DataLoader(ClinicDataset(train), batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
# val_loader = torch.utils.data.DataLoader(ClinicDataset(valid), batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

net = ClinicModel(model_path)
net.cuda()
log_df = pd.DataFrame(columns = ['Epoch','Train_Loss','Valid_Loss','Valid_Accuracy'])

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(net.parameters(), lr = lr)    
param_optimizer = list(net.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                    num_training_steps=num_train_optimization_steps)# PyTorch scheduler
if use_amp:
        scaler = torch.cuda.amp.GradScaler()

for epoch in range(EPOCHS):
    start_time = time.time()
    avg_loss = 0.0
    net.train()
    tbar = tqdm(train_loader, file=sys.stdout)
    loss_list = []
    val_loss_list = []
    for step, data in enumerate(tbar):
        if debug and step == 10:
            print('Debug Mode. Only train on first 100 batches.')
            break
        input_ids = data['ids'].cuda()
        input_masks = data['mask'].cuda()
        targets = data['labels'].long().view(-1).cuda()
        if use_amp:
                with torch.cuda.amp.autocast():
                     pred = net(input_ids,input_masks)
                     loss = loss_fn(pred, targets)
                scaler.scale(loss).backward()
                if step % accumulation_steps == 0 or step == len(tbar) - 1:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    scheduler.step()
        else:
            pred = net(input_ids,input_masks)
            loss = loss_fn(pred, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
       

    loss_list.append(loss.detach().cpu().item())
    avg_loss = np.round(np.mean(loss_list), 4)
    tbar.set_description(f"Epoch {epoch + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")
    vloss,vaccuracy = valid_func(net,val_loader )
    log_df.loc[len(log_df.index)] = [epoch+1,avg_loss,vloss,vaccuracy]
    print(f'Epoch--{epoch+1} ### Train loss---{avg_loss} ### Valid_Loss---{vloss} ### Valid_Acc---{vaccuracy}')
    if vloss<best_epoch_loss:
        best_epoch_loss = vloss
        PATH = f"debertav3-Base_epoch__{epoch}.pth"
        torch.save(net.state_dict(), PATH)
        print(f'Model Saved--epoch--{epoch+1}')
        
    
del train_loader
del net
del val_loader
gc.collect()


Some weights of the model checkpoint at ../input/huggingfacedebertav3variants/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification 

  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--1 ### Train loss---3.633 ### Valid_Loss---3.4455231758264393 ### Valid_Acc---0.42935483870967744
Model Saved--epoch--1


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--2 ### Train loss---0.8469 ### Valid_Loss---0.9511075984973174 ### Valid_Acc---0.8738709677419355
Model Saved--epoch--2


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--3 ### Train loss---0.3941 ### Valid_Loss---0.45812941732314916 ### Valid_Acc---0.9283870967741935
Model Saved--epoch--3


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--4 ### Train loss---0.3807 ### Valid_Loss---0.32332655870570587 ### Valid_Acc---0.9409677419354838
Model Saved--epoch--4


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--5 ### Train loss---0.1838 ### Valid_Loss---0.2732638687468492 ### Valid_Acc---0.9454838709677419
Model Saved--epoch--5


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--6 ### Train loss---0.0858 ### Valid_Loss---0.24106482359079215 ### Valid_Acc---0.9487096774193549
Model Saved--epoch--6


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--7 ### Train loss---0.2253 ### Valid_Loss---0.22488338154955553 ### Valid_Acc---0.9509677419354838
Model Saved--epoch--7


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--8 ### Train loss---0.0717 ### Valid_Loss---0.22035079693708282 ### Valid_Acc---0.9516129032258065
Model Saved--epoch--8


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--9 ### Train loss---0.0471 ### Valid_Loss---0.20861863942387013 ### Valid_Acc---0.9564516129032258
Model Saved--epoch--9


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--10 ### Train loss---0.0738 ### Valid_Loss---0.20790134782974537 ### Valid_Acc---0.9551612903225807
Model Saved--epoch--10


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--11 ### Train loss---0.0895 ### Valid_Loss---0.2068163343776877 ### Valid_Acc---0.9548387096774194
Model Saved--epoch--11


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--12 ### Train loss---0.1723 ### Valid_Loss---0.206066120419508 ### Valid_Acc---0.954516129032258
Model Saved--epoch--12


257

In [14]:
log_df

Unnamed: 0,Epoch,Train_Loss,Valid_Loss,Valid_Accuracy
0,1.0,3.633,3.445523,0.429355
1,2.0,0.8469,0.951108,0.873871
2,3.0,0.3941,0.458129,0.928387
3,4.0,0.3807,0.323327,0.940968
4,5.0,0.1838,0.273264,0.945484
5,6.0,0.0858,0.241065,0.94871
6,7.0,0.2253,0.224883,0.950968
7,8.0,0.0717,0.220351,0.951613
8,9.0,0.0471,0.208619,0.956452
9,10.0,0.0738,0.207901,0.955161
