In [1]:
import gc
import os
import sys
import time
import pickle
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
import math
import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import AutoConfig
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import warnings
warnings.simplefilter('ignore')

In [2]:
! nvidia-smi

Sat Feb 25 10:29:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
EPOCHS = 12
lr = 2e-5
SEED = 42
MAX_LEN = 128
BATCH_SIZE = 24
accumulation_steps = 4
seed_everything(SEED)

In [6]:
data_path = '../input/data-for-distilation' 
train = pd.read_csv('../input/data-for-distilation/Clinc_Train.csv')
valid = pd.read_csv('../input/data-for-distilation/Clinc_valid.csv')
n_classes = np.unique(train.Target).shape[0]
train.head(2)

Unnamed: 0,Text,Target,intent
0,what expression would i use to say i love you ...,61,translate
1,can you tell me how to say 'i do not speak muc...,61,translate


In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self,config,device):
        super(MultiHeadAttention,self).__init__()
        self.n_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_dim = self.hidden_size//self.n_heads
        self.q = nn.Linear(self.hidden_size,self.hidden_size)
        self.k = nn.Linear(self.hidden_size,self.hidden_size)
        self.v = nn.Linear(self.hidden_size,self.hidden_size)
        self.device = device

        self.fc = nn.Linear(self.hidden_size,self.hidden_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(self.device)

    def forward(self, query, key, value, mask = None):
        batch_size = query.shape[0]
        Q = self.q(query)
        K = self.k(key)
        V = self.v(value)
        # [batch size, query len, hid dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        # [batch size, n heads, query len, head dim]
        score = torch.matmul(Q, K.permute(0, 1, 3, 2)) /self.scale
        if mask is not None:
            score = score.masked_fill(mask == 0, -1e10)
        attention = torch.softmax(score, dim = -1)
        x = torch.matmul(self.dropout(attention), V)
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hidden_size)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc(x)
        return x, attention
    
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self,config):
        super(PositionwiseFeedforwardLayer,self).__init__()
        self.pf_dim = config.intermediate_size
        self.hid_dim = config.hidden_size
        self.fc_1 = nn.Linear(self.hid_dim, self.pf_dim)
        self.fc_2 = nn.Linear(self.pf_dim, self.hid_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(nn.GELU()(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x
    
class EncoderLayer(nn.Module):
    def __init__(self,config,device):
        super(EncoderLayer,self).__init__()
        self.hid_dim = config.hidden_size
        self.self_attn_layer_norm = nn.LayerNorm(self.hid_dim)
        self.ff_layer_norm = nn.LayerNorm(self.hid_dim)
        self.device = device
        self.self_attention =  MultiHeadAttention(config, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
    def forward(self, x,mask):
        
        #x = [batch size, x len, hid dim]
        #x_mask = [batch size, 1, 1, x len] 
                
        #self attention
        _x, _ = self.self_attention(x,x,x,mask)
        
        #dropout, residual connection and layer norm
        x = self.self_attn_layer_norm(x + self.dropout(_x))
        
        #x = [batch size, x len, hid dim]
        
        #positionwise feedforward
        _x = self.positionwise_feedforward(_x)
        
        #dropout, residual and layer norm
        x = self.ff_layer_norm(x + self.dropout(_x))
        
        #x = [batch size, x len, hid dim]
        
        return x

class Encoder(nn.Module):
    def __init__(self,config,device):
        super(Encoder,self).__init__()
        self.vocab = config.vocab_size
        self.hid_dim = config.hidden_size
        self.max_length = config.max_position_embeddings
        self.tok_embedding = nn.Embedding(self.vocab, self.hid_dim)
        self.pos_embedding = nn.Embedding(self.max_length, self.hid_dim)
        self.n_layers = config.num_hidden_layers
        self.layers = nn.ModuleList([EncoderLayer(config,device) 
                                     for _ in range(self.n_layers)])
        
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        self.scale = torch.sqrt(torch.FloatTensor([config.hidden_size])).to(device)
        self.device = device
    def forward(self, x,mask):
        
        #x = [batch size, x len]
        #x_mask = [batch size, 1, 1, x len]
        
        batch_size = x.shape[0]
        x_len = x.shape[1]
        
        pos = torch.arange(0, x_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, x len]
        
        x = self.dropout((self.tok_embedding(x) * self.scale) + self.pos_embedding(pos))
        
        #x = [batch size, x len, hid dim]
        mask = mask.unsqueeze(1).unsqueeze(2)
        # mask = [batch size, 1, 1, x len]
        for layer in self.layers:
            x = layer(x, mask)
            
        #x = [batch size, x len, hid dim]
            
        return x

In [8]:
model_ckpt = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [9]:
class callback:
    def __init__(self):
        self.loss = list()
        self.model = list()
    
    def put(self, model, loss):
        self.loss.append(loss)
        self.model.append(model)

    def get_model(self):
        ind = np.argmin(self.loss)
        return self.model[ind]

class ClinicModel(nn.Module):
    def __init__(self, config,device):
        super(ClinicModel, self).__init__()
        self.model = Encoder(config,device)
        self.output = nn.Linear(768, n_classes)

    def forward(self, ids, mask):
        sequence_output = self.model(ids, mask)[:, 0, :]
        logits = self.output(sequence_output)
        return logits


class ClinicDataset(Dataset):
    def __init__(self, data,is_test=False):
        self.X = data['Text'].values
        self.Y = data['Target'].values
        self.is_test = is_test
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.MAX_LEN = 128
        
    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(self.X[idx],
            add_special_tokens=True,
            truncation=True,
            max_length=self.MAX_LEN
        )['input_ids'] 

        if not self.is_test:
            target_value = self.Y[idx]
      
        mask = [1]*len(inputs) + [0] * (self.MAX_LEN - len(inputs)) 
        mask = torch.tensor(mask, dtype=torch.long)
        
        if len(inputs) != self.MAX_LEN:
            inputs = inputs + [self.tokenizer.pad_token_id] * (self.MAX_LEN - len(inputs)) 
        ids = torch.tensor(inputs, dtype=torch.long)
        
        
        
        
        if self.is_test:
            return {
                'ids': ids,
                'mask': mask,
            }
        
        else:
            targets = torch.FloatTensor(target_value)
            return {
                'ids': ids,
                'mask': mask,
                'targets': targets
            }
        
    def __len__(self):
        return len(self.Y)

In [10]:
train_texts = train['Text'].values.tolist()
val_texts = valid['Text'].values.tolist()
train_labels = train['Target'].values.tolist()
val_labels = valid['Target'].values.tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

class ClinicDatasetV2(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return {'ids':item.get('input_ids'),'mask':item.get('attention_mask'),'labels':item.get('labels')}

    def __len__(self):
        return len(self.labels)

train_loader = torch.utils.data.DataLoader(ClinicDatasetV2(train_encodings, train_labels),batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(ClinicDatasetV2(val_encodings, val_labels),batch_size=BATCH_SIZE, shuffle=False, num_workers=2)



In [11]:
def valid_func(model,valid_loader):
    model.eval()
    bar = tqdm(valid_loader,file=sys.stdout)
    loss_fn = torch.nn.CrossEntropyLoss()
    PROB = []
    TARGETS = []
    losses = []
    PREDS = []

    with torch.no_grad():
        for batch_idx, (data) in enumerate(bar):
            input_ids = data['ids'].cuda()
            input_masks = data['mask'].cuda()
            targets = data['labels'].long().view(-1).cuda()

            logits = model(input_ids,input_masks)

            PREDS += [torch.argmax(logits, 1).detach().cpu()]
            TARGETS += [targets.detach().cpu()]

            loss = loss_fn (logits, targets)
            losses.append(loss.item())
           
            bar.set_description(f'loss: {loss.item():.5f}')

    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    accuracy = (PREDS==TARGETS).mean()
   
    loss_valid = np.mean(losses)
    return loss_valid, accuracy


In [12]:
use_amp = True
debug = False
gc.collect()
best_epoch_loss = np.inf

net = ClinicModel(config,device)
net.cuda()
log_df = pd.DataFrame(columns = ['Epoch','Train_Loss','Valid_Loss','Valid_Accuracy'])

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(net.parameters(), lr = lr)    
num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                    num_training_steps=num_train_optimization_steps)# PyTorch scheduler
if use_amp:
        scaler = torch.cuda.amp.GradScaler()
for epoch in range(EPOCHS):
    start_time = time.time()
    avg_loss = 0.0
    net.train()
    tbar = tqdm(train_loader, file=sys.stdout)
    loss_list = []
    val_loss_list = []
    for step, data in enumerate(tbar):
        if debug and step == 10:
            print('Debug Mode. Only train on first 100 batches.')
            break
        input_ids = data['ids'].cuda()
        input_masks = data['mask'].cuda()
        targets = data['labels'].long().view(-1).cuda()
        if use_amp:
                with torch.cuda.amp.autocast():
                     pred = net(input_ids,input_masks)
                     loss = loss_fn(pred, targets)
                scaler.scale(loss).backward()
                if step % accumulation_steps == 0 or step == len(tbar) - 1:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    scheduler.step()
        else:
            pred = net(input_ids,input_masks)
            loss = loss_fn(pred, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
       

    loss_list.append(loss.detach().cpu().item())
    avg_loss = np.round(np.mean(loss_list), 4)
    tbar.set_description(f"Epoch {epoch + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")
    vloss,vaccuracy = valid_func(net,val_loader )
    log_df.loc[len(log_df.index)] = [epoch+1,avg_loss,vloss,vaccuracy]
    print(f'Epoch--{epoch+1} ### Train loss---{avg_loss} ### Valid_Loss---{vloss} ### Valid_Acc---{vaccuracy}')
    if vloss<best_epoch_loss:
        best_epoch_loss = vloss
        PATH = f"debertav3-Base_epoch__{epoch}.pth"
        torch.save(net.state_dict(), PATH)
        print(f'Model Saved--epoch--{epoch+1}')
        
    
del train_loader
del net
del val_loader
gc.collect()


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--1 ### Train loss---3.1896 ### Valid_Loss---3.2159992612325228 ### Valid_Acc---0.42129032258064514
Model Saved--epoch--1


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--2 ### Train loss---1.4775 ### Valid_Loss---1.969340900045175 ### Valid_Acc---0.632258064516129
Model Saved--epoch--2


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--3 ### Train loss---1.8931 ### Valid_Loss---1.4984107043880683 ### Valid_Acc---0.7090322580645161
Model Saved--epoch--3


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--4 ### Train loss---1.4479 ### Valid_Loss---1.2553657522568336 ### Valid_Acc---0.75
Model Saved--epoch--4


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--5 ### Train loss---0.733 ### Valid_Loss---1.101968218271549 ### Valid_Acc---0.7719354838709678
Model Saved--epoch--5


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--6 ### Train loss---0.8781 ### Valid_Loss---1.0187745079111594 ### Valid_Acc---0.7883870967741935
Model Saved--epoch--6


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--7 ### Train loss---0.5984 ### Valid_Loss---0.9503974425391509 ### Valid_Acc---0.8038709677419354
Model Saved--epoch--7


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--8 ### Train loss---0.3018 ### Valid_Loss---0.9061386542824599 ### Valid_Acc---0.8067741935483871
Model Saved--epoch--8


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--9 ### Train loss---0.9724 ### Valid_Loss---0.8830602332089956 ### Valid_Acc---0.8096774193548387
Model Saved--epoch--9


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--10 ### Train loss---0.6607 ### Valid_Loss---0.8642050722184089 ### Valid_Acc---0.8103225806451613
Model Saved--epoch--10


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--11 ### Train loss---0.6467 ### Valid_Loss---0.8601169919308561 ### Valid_Acc---0.8145161290322581
Model Saved--epoch--11


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--12 ### Train loss---0.3309 ### Valid_Loss---0.8474599173005957 ### Valid_Acc---0.8154838709677419
Model Saved--epoch--12


164