In [1]:
import gc
import os
import sys
import time
import pickle
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
import math
import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import AutoConfig
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import warnings
warnings.simplefilter('ignore')

In [2]:
! nvidia-smi

Sat Apr 15 11:55:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
EPOCHS = 8
lr = 2e-5
SEED = 42
MAX_LEN = 128
BATCH_SIZE = 24
accumulation_steps = 4
seed_everything(SEED)

In [6]:
data_path = '../input/data-for-distilation' 
train = pd.read_csv('../input/data-for-distilation/Clinc_Train.csv')
valid = pd.read_csv('../input/data-for-distilation/Clinc_valid.csv')
n_classes = np.unique(train.Target).shape[0]
train.head(2)

Unnamed: 0,Text,Target,intent
0,what expression would i use to say i love you ...,61,translate
1,can you tell me how to say 'i do not speak muc...,61,translate


In [7]:
model_ckpt = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [8]:
class callback:
    def __init__(self):
        self.loss = list()
        self.model = list()
    
    def put(self, model, loss):
        self.loss.append(loss)
        self.model.append(model)

    def get_model(self):
        ind = np.argmin(self.loss)
        return self.model[ind]

class ClinicModel(nn.Module):
    def __init__(self):
        super(ClinicModel, self).__init__()
        self.model = AutoModel.from_pretrained('../input/transformer-distilation-pre/roberta_base_6layers_student')
        self.output = nn.Linear(768, n_classes)
    def forward(self, ids, mask):
        sequence_output = self.model(ids, mask)['last_hidden_state'][:,0,:]
        logits = self.output(sequence_output)
        return logits


class ClinicDataset(Dataset):
    def __init__(self, data,is_test=False):
        self.X = data['Text'].values
        self.Y = data['Target'].values
        self.is_test = is_test
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.MAX_LEN = 128
        
    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(self.X[idx],
            add_special_tokens=True,
            truncation=True,
            max_length=self.MAX_LEN
        )['input_ids'] 

        if not self.is_test:
            target_value = self.Y[idx]
      
        mask = [1]*len(inputs) + [0] * (self.MAX_LEN - len(inputs)) 
        mask = torch.tensor(mask, dtype=torch.long)
        
        if len(inputs) != self.MAX_LEN:
            inputs = inputs + [self.tokenizer.pad_token_id] * (self.MAX_LEN - len(inputs)) 
        ids = torch.tensor(inputs, dtype=torch.long)
        
        
        
        
        if self.is_test:
            return {
                'ids': ids,
                'mask': mask,
            }
        
        else:
            targets = torch.FloatTensor(target_value)
            return {
                'ids': ids,
                'mask': mask,
                'targets': targets
            }
        
    def __len__(self):
        return len(self.Y)

In [9]:
train_texts = train['Text'].values.tolist()
val_texts = valid['Text'].values.tolist()
train_labels = train['Target'].values.tolist()
val_labels = valid['Target'].values.tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

class ClinicDatasetV2(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return {'ids':item.get('input_ids'),'mask':item.get('attention_mask'),'labels':item.get('labels')}

    def __len__(self):
        return len(self.labels)

train_loader = torch.utils.data.DataLoader(ClinicDatasetV2(train_encodings, train_labels),batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(ClinicDatasetV2(val_encodings, val_labels),batch_size=BATCH_SIZE, shuffle=False, num_workers=2)



In [10]:
def valid_func(model,valid_loader):
    model.eval()
    bar = tqdm(valid_loader,file=sys.stdout)
    loss_fn = torch.nn.CrossEntropyLoss()
    PROB = []
    TARGETS = []
    losses = []
    PREDS = []

    with torch.no_grad():
        for batch_idx, (data) in enumerate(bar):
            input_ids = data['ids'].cuda()
            input_masks = data['mask'].cuda()
            targets = data['labels'].long().view(-1).cuda()

            logits = model(input_ids,input_masks)

            PREDS += [torch.argmax(logits, 1).detach().cpu()]
            TARGETS += [targets.detach().cpu()]

            loss = loss_fn (logits, targets)
            losses.append(loss.item())
           
            bar.set_description(f'loss: {loss.item():.5f}')

    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    accuracy = (PREDS==TARGETS).mean()
   
    loss_valid = np.mean(losses)
    return loss_valid, accuracy


In [11]:
use_amp = True
debug = False
gc.collect()
best_epoch_loss = np.inf

net = ClinicModel()
net.cuda()
log_df = pd.DataFrame(columns = ['Epoch','Train_Loss','Valid_Loss','Valid_Accuracy'])

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(net.parameters(), lr = lr)    
num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                    num_training_steps=num_train_optimization_steps)# PyTorch scheduler
if use_amp:
        scaler = torch.cuda.amp.GradScaler()
for epoch in range(EPOCHS):
    start_time = time.time()
    avg_loss = 0.0
    net.train()
    tbar = tqdm(train_loader, file=sys.stdout)
    loss_list = []
    val_loss_list = []
    for step, data in enumerate(tbar):
        if debug and step == 10:
            print('Debug Mode. Only train on first 100 batches.')
            break
        input_ids = data['ids'].cuda()
        input_masks = data['mask'].cuda()
        targets = data['labels'].long().view(-1).cuda()
        if use_amp:
                with torch.cuda.amp.autocast():
                     pred = net(input_ids,input_masks)
                     loss = loss_fn(pred, targets)
                scaler.scale(loss).backward()
                if step % accumulation_steps == 0 or step == len(tbar) - 1:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    scheduler.step()
        else:
            pred = net(input_ids,input_masks)
            loss = loss_fn(pred, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
       

    loss_list.append(loss.detach().cpu().item())
    avg_loss = np.round(np.mean(loss_list), 4)
    tbar.set_description(f"Epoch {epoch + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")
    vloss,vaccuracy = valid_func(net,val_loader )
    log_df.loc[len(log_df.index)] = [epoch+1,avg_loss,vloss,vaccuracy]
    print(f'Epoch--{epoch+1} ### Train loss---{avg_loss} ### Valid_Loss---{vloss} ### Valid_Acc---{vaccuracy}')
    if vloss<best_epoch_loss:
        best_epoch_loss = vloss
        PATH = f"debertav3-Base_epoch__{epoch}.pth"
        torch.save(net.state_dict(), PATH)
        print(f'Model Saved--epoch--{epoch+1}')
        
    
del train_loader
del net
del val_loader
gc.collect()


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--1 ### Train loss---2.7126 ### Valid_Loss---2.2395440028263973 ### Valid_Acc---0.6961290322580646
Model Saved--epoch--1


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--2 ### Train loss---1.3182 ### Valid_Loss---0.933498174066727 ### Valid_Acc---0.8516129032258064
Model Saved--epoch--2


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--3 ### Train loss---0.9175 ### Valid_Loss---0.6096371703136426 ### Valid_Acc---0.8896774193548387
Model Saved--epoch--3


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--4 ### Train loss---0.2812 ### Valid_Loss---0.48017742964797294 ### Valid_Acc---0.9087096774193548
Model Saved--epoch--4


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--5 ### Train loss---0.6795 ### Valid_Loss---0.4169415351003408 ### Valid_Acc---0.9167741935483871
Model Saved--epoch--5


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--6 ### Train loss---0.1449 ### Valid_Loss---0.37303138974194344 ### Valid_Acc---0.9238709677419354
Model Saved--epoch--6


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--7 ### Train loss---0.2008 ### Valid_Loss---0.3596781362492878 ### Valid_Acc---0.9261290322580645
Model Saved--epoch--7


  0%|          | 0/636 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch--8 ### Train loss---0.1486 ### Valid_Loss---0.3549013801277257 ### Valid_Acc---0.9270967741935484
Model Saved--epoch--8


176

In [12]:
# model = AutoModel.from_pretrained('../input/transformer-distilation-pre/roberta_base_6layers_student')
# model.cuda()

In [13]:
# x = model(input_ids,input_masks)

In [14]:
# x['last_hidden_state'][:,0,:].size()