In [1]:
import gc
import os
import sys
import time
import pickle
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
import math
import torch
from pathlib import Path
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from torch.nn.quantized import QFunctional
from torch.quantization import quantize_dynamic
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import warnings
warnings.simplefilter('ignore')

In [2]:
! nvidia-smi

Wed Feb  1 19:23:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
EPOCHS = 12
lr = 2e-5
SEED = 42
MAX_LEN = 128
BATCH_SIZE = 16
accumulation_steps = 4
seed_everything(SEED)

In [6]:
model_path = '../input/huggingfacedebertav3variants/deberta-v3-xsmall'
data_path = '../input/data-for-distilation' 
train = pd.read_csv('../input/data-for-distilation/Clinc_Train.csv')
valid = pd.read_csv('../input/data-for-distilation/Clinc_valid.csv')
n_classes = np.unique(train.Target).shape[0]
train.head(2)

Unnamed: 0,Text,Target,intent
0,what expression would i use to say i love you ...,61,translate
1,can you tell me how to say 'i do not speak muc...,61,translate


In [7]:
class callback:
    def __init__(self):
        self.loss = list()
        self.model = list()
    
    def put(self, model, loss):
        self.loss.append(loss)
        self.model.append(model)

    def get_model(self):
        ind = np.argmin(self.loss)
        return self.model[ind]

def freeze(module):
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
    return freezed_parameters

class ArcModule(nn.Module):
    def __init__(self, in_features=768, out_features=151, s = 10, m =0.5):
        super().__init__()
        self.in_features = 768
        self.out_features = 151
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_normal_(self.weight)

        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = torch.tensor(math.cos(math.pi - m))
        self.mm = torch.tensor(math.sin(math.pi - m) * m)

    def forward(self, inputs, labels):
        cos_th = F.linear(inputs, F.normalize(self.weight))
        cos_th = cos_th.clamp(-1, 1)
        sin_th = torch.sqrt(1.0 - torch.pow(cos_th, 2))
        cos_th_m = cos_th * self.cos_m - sin_th * self.sin_m
        # print(type(cos_th), type(self.th), type(cos_th_m), type(self.mm))
        cos_th_m = torch.where(cos_th > self.th, cos_th_m, cos_th - self.mm)

        cond_v = cos_th - self.th
        cond = cond_v <= 0
        cos_th_m[cond] = (cos_th - self.mm)[cond]

        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        onehot = torch.zeros(cos_th.size()).cuda()
        labels = labels.type(torch.LongTensor).cuda()
        onehot.scatter_(1, labels, 1.0)
        outputs = onehot * cos_th_m + (1.0 - onehot) * cos_th
        outputs = outputs * self.s
        return outputs
    
class ClinicModel(nn.Module):
    def __init__(self, model_path):
        super(ClinicModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
#         freeze((self.model).embeddings)
        self.output = nn.Linear(768, n_classes)
        self.margin = ArcModule(in_features=768, out_features = n_classes)

    def forward(self, ids, mask):
        sequence_output = self.model(ids, mask)[0][:, 0, :]
        logits = self.output(sequence_output)#self.margin(sequence_output,151)
        return logits

class ClinicModel_student(nn.Module):
    def __init__(self, model_path):
        super(ClinicModel_student, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
#         freeze((self.model).embeddings)
        self.output = nn.Linear(384, n_classes)

    def forward(self, ids, mask):
        sequence_output = self.model(ids, mask)[0][:, 0, :]
        logits = self.output(sequence_output)
        return logits

class ClinicDataset(Dataset):
    def __init__(self, data,is_test=False):
        self.X = data['Text'].values
        self.Y = data['Target'].values
        self.is_test = is_test
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.MAX_LEN = 128
        
    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(self.X[idx],
            add_special_tokens=True,
            truncation=True,
            max_length=self.MAX_LEN
        )['input_ids'] 

        if not self.is_test:
            target_value = self.Y[idx]
      
        mask = [1]*len(inputs) + [0] * (self.MAX_LEN - len(inputs)) 
        mask = torch.tensor(mask, dtype=torch.long)
        
        if len(inputs) != self.MAX_LEN:
            inputs = inputs + [self.tokenizer.pad_token_id] * (self.MAX_LEN - len(inputs)) 
        ids = torch.tensor(inputs, dtype=torch.long)
        
        
        
        
        if self.is_test:
            return {
                'ids': ids,
                'mask': mask,
            }
        
        else:
            targets = torch.FloatTensor(target_value)
            return {
                'ids': ids,
                'mask': mask,
                'targets': targets
            }
        
    def __len__(self):
        return len(self.Y)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
train_texts = train['Text'].values.tolist()
val_texts = valid['Text'].values.tolist()
train_labels = train['Target'].values.tolist()
val_labels = valid['Target'].values.tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

class ClinicDatasetV2(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return {'ids':item.get('input_ids'),'mask':item.get('attention_mask'),'labels':item.get('labels')}

    def __len__(self):
        return len(self.labels)

train_loader = torch.utils.data.DataLoader(ClinicDatasetV2(train_encodings, train_labels),batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(ClinicDatasetV2(val_encodings, val_labels),batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
# for data in train_loader:
#     print(data)
#     break


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def valid_func(model,valid_loader):
    model.eval()
    bar = tqdm(valid_loader,file=sys.stdout)
    loss_fn = torch.nn.CrossEntropyLoss()
    PROB = []
    TARGETS = []
    losses = []
    PREDS = []

    with torch.no_grad():
        for batch_idx, (data) in enumerate(bar):
            input_ids = data['ids'].cuda()
            input_masks = data['mask'].cuda()
            targets = data['labels'].long().view(-1).cuda()

            logits = model(input_ids,input_masks)

            PREDS += [torch.argmax(logits, 1).detach().cpu()]
            TARGETS += [targets.detach().cpu()]

            loss = loss_fn (logits, targets)
            losses.append(loss.item())
           
            bar.set_description(f'loss: {loss.item():.5f}')

    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    accuracy = (PREDS==TARGETS).mean()
   
    loss_valid = np.mean(losses)
    return loss_valid, accuracy


In [10]:
teacher = ClinicModel('../input/huggingfacedebertav3variants/deberta-v3-small')
teacher.load_state_dict(torch.load('../input/transformer-pytorch-native/debertav3-Base_epoch__9.pth'))
teacher.cuda()
teacher.eval()

Some weights of the model checkpoint at ../input/huggingfacedebertav3variants/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification 

ClinicModel(
  (model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropout): S

In [11]:
def custom_loss(logits_tea,logits_stu,loss_ce,alpha=0.5, temperature=2.0):
    #  alpha and temperature can be tuned with optuna   
    # Soften probabilities and compute distillation loss
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    loss_kd = temperature ** 2 * loss_fct(F.log_softmax(logits_stu / temperature, dim=-1),
    F.softmax(logits_tea / temperature, dim=-1))
    # Return weighted student loss
    loss = alpha * loss_ce + (1. - alpha) * loss_kd
    return loss

In [12]:
use_amp = True
debug = False
gc.collect()
best_epoch_loss = np.inf
# train_loader = torch.utils.data.DataLoader(ClinicDataset(train), batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
# val_loader = torch.utils.data.DataLoader(ClinicDataset(valid), batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

net = ClinicModel_student(model_path)
net.cuda()
log_df = pd.DataFrame(columns = ['Epoch','Train_Loss','Valid_Loss','Valid_Accuracy'])

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(net.parameters(), lr = lr)    
param_optimizer = list(net.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                    num_training_steps=num_train_optimization_steps)# PyTorch scheduler
if use_amp:
        scaler = torch.cuda.amp.GradScaler()

for epoch in range(EPOCHS):
    start_time = time.time()
    avg_loss = 0.0
    net.train()
    tbar = tqdm(train_loader, file=sys.stdout)
    loss_list = []
    val_loss_list = []
    for step, data in enumerate(tbar):
        if debug and step == 10:
            print('Debug Mode. Only train on first 100 batches.')
            break
        input_ids = data['ids'].cuda()
        input_masks = data['mask'].cuda()
        targets = data['labels'].long().view(-1).cuda()
        if use_amp:
                with torch.cuda.amp.autocast():
                     pred = net(input_ids,input_masks)
                     pred_tea = teacher(input_ids,input_masks)
                     loss_ce = loss_fn(pred, targets)
                     loss = custom_loss(pred_tea,pred,loss_ce)
                scaler.scale(loss).backward()
                if step % accumulation_steps == 0 or step == len(tbar) - 1:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    scheduler.step()
        else:
            pred = net(input_ids,input_masks)
            loss = loss_fn(pred, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
       

    loss_list.append(loss.detach().cpu().item())
    avg_loss = np.round(np.mean(loss_list), 4)
    tbar.set_description(f"Epoch {epoch + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")
    vloss,vaccuracy = valid_func(net,val_loader )
    log_df.loc[len(log_df.index)] = [epoch+1,avg_loss,vloss,vaccuracy]
    print(f'Epoch--{epoch+1} ### Train loss---{avg_loss} ### Valid_Loss---{vloss} ### Valid_Acc---{vaccuracy}')
    if vloss<best_epoch_loss:
        best_epoch_loss = vloss
        PATH = f"debertav3-xsmall_epoch__{epoch}.pth"
        torch.save(net.state_dict(), PATH)
        print(f'Model Saved--epoch--{epoch+1}')
        
    
del train_loader
gc.collect()


Some weights of the model checkpoint at ../input/huggingfacedebertav3variants/deberta-v3-xsmall were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'deberta.embeddings.word_embeddings._weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificat

  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--1 ### Train loss---4.0887 ### Valid_Loss---4.126749318899567 ### Valid_Acc---0.18161290322580645
Model Saved--epoch--1


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--2 ### Train loss---3.6775 ### Valid_Loss---2.3666920603550587 ### Valid_Acc---0.6225806451612903
Model Saved--epoch--2


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--3 ### Train loss---2.8576 ### Valid_Loss---1.402904796692514 ### Valid_Acc---0.7893548387096774
Model Saved--epoch--3


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--4 ### Train loss---2.4139 ### Valid_Loss---0.8950232924490246 ### Valid_Acc---0.8616129032258064
Model Saved--epoch--4


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--5 ### Train loss---1.0885 ### Valid_Loss---0.638222834537017 ### Valid_Acc---0.8909677419354839
Model Saved--epoch--5


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--6 ### Train loss---1.0825 ### Valid_Loss---0.5050947372849608 ### Valid_Acc---0.9029032258064517
Model Saved--epoch--6


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--7 ### Train loss---1.4461 ### Valid_Loss---0.4333716676925722 ### Valid_Acc---0.9119354838709678
Model Saved--epoch--7


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--8 ### Train loss---0.9241 ### Valid_Loss---0.391169246583793 ### Valid_Acc---0.9193548387096774
Model Saved--epoch--8


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--9 ### Train loss---0.5999 ### Valid_Loss---0.3639204547375671 ### Valid_Acc---0.9216129032258065
Model Saved--epoch--9


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--10 ### Train loss---0.9345 ### Valid_Loss---0.35315192842222365 ### Valid_Acc---0.922258064516129
Model Saved--epoch--10


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--11 ### Train loss---0.893 ### Valid_Loss---0.33739771050660267 ### Valid_Acc---0.925483870967742
Model Saved--epoch--11


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

Epoch--12 ### Train loss---0.5462 ### Valid_Loss---0.3350447088950458 ### Valid_Acc---0.9258064516129032
Model Saved--epoch--12


282

In [13]:
log_df

Unnamed: 0,Epoch,Train_Loss,Valid_Loss,Valid_Accuracy
0,1.0,4.0887,4.126749,0.181613
1,2.0,3.6775,2.366692,0.622581
2,3.0,2.8576,1.402905,0.789355
3,4.0,2.4139,0.895023,0.861613
4,5.0,1.0885,0.638223,0.890968
5,6.0,1.0825,0.505095,0.902903
6,7.0,1.4461,0.433372,0.911935
7,8.0,0.9241,0.391169,0.919355
8,9.0,0.5999,0.36392,0.921613
9,10.0,0.9345,0.353152,0.922258


In [14]:
def compute_size(model):
    state_dict = model.state_dict()
    tmp_path = Path("model.pt")
    torch.save(state_dict, tmp_path)
    # Calculate size in megabytes
    size_mb = Path(tmp_path).stat().st_size / (1024 * 1024)
    # Delete temporary file
    tmp_path.unlink()
    print(f"Model size (MB) - {size_mb:.2f}")

In [15]:
def Benchmark(model,val_loader):
    loss,acc = valid_func(model,val_loader)
    compute_size(model)
    print(f'Loss--{loss}---Acc---{acc}')

In [16]:
Benchmark(teacher,val_loader)

  0%|          | 0/194 [00:00<?, ?it/s]

Model size (MB) - 539.96
Loss--0.20858218174274126---Acc---0.9551612903225807


In [17]:
Benchmark(net,val_loader)

  0%|          | 0/194 [00:00<?, ?it/s]

Model size (MB) - 269.93
Loss--0.3350447088950458---Acc---0.9258064516129032


In [18]:
def valid_func(model,valid_loader):
    model.eval()
    bar = tqdm(valid_loader,file=sys.stdout)
    loss_fn = torch.nn.CrossEntropyLoss()
    PROB = []
    TARGETS = []
    losses = []
    PREDS = []
    start = time.time()
    with torch.no_grad():
        for batch_idx, (data) in enumerate(bar):
            input_ids = data['ids']
            input_masks = data['mask']
            targets = data['labels'].long().view(-1)

            logits = model(input_ids,input_masks)

            PREDS += [torch.argmax(logits, 1).detach().cpu()]
            TARGETS += [targets.detach().cpu()]

            loss = loss_fn (logits, targets)
            losses.append(loss.item())
           
            bar.set_description(f'loss: {loss.item():.5f}')
    end = time.time()

    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    accuracy = (PREDS==TARGETS).mean()
#     print(f'avg_time forsample{}')
    loss_valid = np.mean(losses)
    return loss_valid, accuracy


In [19]:
del val_loader 
val_loader = torch.utils.data.DataLoader(ClinicDatasetV2(val_encodings, val_labels),batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [20]:
del teacher
teacher = ClinicModel('../input/huggingfacedebertav3variants/deberta-v3-small')
teacher.load_state_dict(torch.load('../input/transformer-pytorch-native/debertav3-Base_epoch__9.pth'))
model_quantized = quantize_dynamic(teacher, {nn.Linear}, dtype=torch.qint8)
Benchmark(model_quantized,val_loader)

Some weights of the model checkpoint at ../input/huggingfacedebertav3variants/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification 

  0%|          | 0/194 [00:00<?, ?it/s]

Model size (MB) - 418.16
Loss--2.0788149902040196---Acc---0.49483870967741933


In [21]:
# Try with optuna to get optimal alpha and temperature 