# Import

In [1]:
import transformers
import torch
import torch.nn as nn
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from datasets import load_dataset

In [4]:
from constants import MODEL_NAME, DATASET_URI, BATCH_SIZE, RANK, ALPHA

In [15]:
from torch.utils.data import DataLoader, Dataset

# dataset

In [6]:
dataset = load_dataset('dair-ai/emotion')

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [10]:
# Tokenize the dataset
def tokenize_function(data):
    return tokenizer(data['text'], padding="max_length", truncation=True, max_length=128)


In [11]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 16000/16000 [00:06<00:00, 2288.60 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1759.24 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1714.04 examples/s]


In [22]:

train_dataset = tokenized_dataset['train'].shuffle(seed=42).select(range(4000))
val_dataset = tokenized_dataset['validation'].shuffle(seed=42).select(range(2000))
test_dataset = tokenized_dataset['test'].shuffle(seed=42).select(range(1000))

In [13]:
def collate_data(batch):
    input_ids = torch.tensor([data['input_ids'] for data in batch])
    mask = torch.tensor([data['attention_mask'] for data in batch])
    labels = torch.tensor([data['label'] for data in batch])
    return input_ids, mask,labels

In [16]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle= True, collate_fn= collate_data)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn= collate_data)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn= collate_data)

In [17]:
for data in iter(train_loader):
    print(len(data),data)
    # print(data.keys())
    input_id, mask, label = data
    print(len(input_id), len(input_id[1]),len(label))
    break

3 (tensor([[ 101, 1045, 2514,  ...,    0,    0,    0],
        [ 101, 1045, 2424,  ...,    0,    0,    0],
        [ 101, 1045, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2572,  ...,    0,    0,    0],
        [ 101, 1045, 2514,  ...,    0,    0,    0],
        [ 101, 1045, 4299,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([3, 1, 0, 1, 5, 4, 0, 0, 0, 2]))
10 128 10


# Define lora model

- define Lora layer () 
    - init() 
        - initilize rank and alpha
        - Init A = N(0,1) [d*r] and B = 0 of size [r*k]
        - scale = alpha/rank
    - define forward() function
        calc W +  scale * matmul (A,B)

- define LoraModel ()
    for layer in model.layers
        layer = LoraLayer(layer)

In [18]:
from constants import RANK, ALPHA

In [19]:
class LoraLayer(nn.Module):
    def __init__(self,lin_layer, rank = RANK, alpha = ALPHA):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        in_feature,out_feature = lin_layer.in_features, lin_layer.out_features
        self.A = nn.Parameter(torch.zeros(in_feature, rank)).to(self.device)
        nn.init.normal_(self.A, mean=0, std=1)
        self.B = nn.Parameter(torch.zeros(rank, out_feature)).to(self.device)
        self.scale = alpha / rank
        self.W = lin_layer.to(self.device)

    def forward(self,x):
        return self.W(x) + self.scale * (torch.matmul(torch.matmul(x, self.A), self.B))
    

# Lora model

In [21]:

from tqdm import tqdm
from torch.optim import Adam 
from torch.optim.lr_scheduler import  ReduceLROnPlateau

In [65]:
class LoraModel():
    def __init__(self, apply_lora = True):
        self.base_model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 6)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

        for name,param in self.base_model.named_parameters():
            if 'attention' in name:
                print(name)
                param.requires_grad = False 

        if apply_lora:
            self.__apply_lora()

    def __apply_lora(self):
        for block in self.base_model.distilbert.transformer.layer: 
            block.attention.q_lin = LoraLayer(block.attention.q_lin)
            block.attention.v_lin = LoraLayer(block.attention.v_lin)
            block.attention.k_lin = LoraLayer(block.attention.k_lin)
            block.attention.out_lin = LoraLayer(block.attention.out_lin)

    def train(self,train_loader, val_loader, lr = 1e-5, num_epochs = 10):
        optimizer = Adam(self.base_model.parameters(), lr= lr)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
        
        self.base_model.train()
        for i in range(num_epochs):
            correct_pred, train_loss,total_train = 0,0,0  
            
            for step,batch in enumerate(tqdm(train_loader)): 
                input_ids,mask,labels = batch 
                outputs = self.base_model(input_ids, attention_mask = mask, labels = labels)
 
                loss = outputs.loss
                loss.backward()
                optimizer.step() 

                predictions = torch.max(outputs.logits,dim = -1).indices 
                correct_pred += (predictions == labels).sum().item()
                train_loss += loss.item()
                total_train += len(labels) 

                # Update progress bar every 40 steps
                if step % 40 == 0: 
                    print('Step:',step,'\n [TRAIN] Loss:',train_loss/total_train, 'accuracy:',correct_pred/total_train)            
                    # val_loss_avg, val_acc_avg = self.predict(val_loader)
                    # print(f'[VAL] Loss: {val_loss_avg}, Accuracy: {val_acc_avg}')
                    # scheduler.step(val_loss_avg)

            scheduler.step(val_loss_avg)
            train_loss_avg = train_loss/total_train
            train_acc_avg = correct_pred/total_train
            print(f'[TRAIN] Epoch {i+1} Loss: {train_loss_avg}, Accuracy: {train_acc_avg}')

            val_loss_avg, val_acc_avg = self.predict(val_loader)
            print(f'[VAL] Epoch {i+1} Loss: {val_loss_avg}, Accuracy: {val_acc_avg}')

            scheduler.step(val_loss_avg)


    def predict(self,data_loader):
        self.base_model.eval()
        correct_pred, val_test_loss, total_data = 0,0,0

        with torch.no_grad():
            for batch in data_loader:
                input_ids, mask, labels = batch 
                output = self.base_model(input_ids, attention_mask = mask, labels = labels)
                val_test_loss += output.loss.item() 
                prediction = torch.argmax(output.logits , dim = -1) 
                correct_pred += (prediction == labels).sum().item()
                total_data += len(labels)

        avg_loss = val_test_loss/total_data
        avg_acc = correct_pred/total_data
        return avg_loss,avg_acc
    

# Inference

In [66]:
bert_model = LoraModel(apply_lora=False)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.1.attention.q_lin.weight
distilbert.transformer.layer.1.attention.q_lin.bias
distilbert.transformer.layer.1.attention.k_lin.weight
distilbert.transformer.layer.1.attention.k_lin.bias
distilbert.transformer.layer.1.attention.v_lin.weight
distilbert.transformer.layer.1.attention.v_lin.bias
distilbert.transformer.layer.1.attention.out_lin.weight
distilbert.transformer.layer.1.attention.out_lin.bias
distilbert.transformer.layer.2.attention.q_lin.weight
distilbert.transformer.layer.2.attention.q_lin.bias
distilbert.transformer.layer.2.attenti

In [None]:
avg_loss,avg_acc = bert_model.predict(test_loader)

In [53]:
print('accuracy_score', avg_acc*100)

accuracy_score 6.2


# FIne tune

In [64]:
lora_model = LoraModel()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.0.sa_layer_norm.weight
distilbert.transformer.layer.0.sa_layer_norm.bias
distilbert.transformer.layer.0.ffn.lin1.weight
distilbert.transformer.layer.0.ffn.lin1.bias
distilbert.transformer.layer.0.ffn.lin2.weight
distilbert.transformer.layer.0.ffn.lin2.bias
distilbert.transformer.layer.0.output_layer_norm.weight
distilbert.transformer.layer.0.output_layer_norm.bias
distil

In [62]:
lora_model.train(train_loader, val_loader, lr = 1e-3, num_epochs = 3 )

  0%|          | 1/400 [00:01<09:08,  1.37s/it]

Step: 0 
 [TRAIN] Loss: 0.1876000165939331 accuracy: 0.1


 10%|█         | 41/400 [01:15<12:41,  2.12s/it]

Step: 40 
 [TRAIN] Loss: 0.17714196995991033 accuracy: 0.1975609756097561


 13%|█▎        | 51/400 [01:40<11:30,  1.98s/it]


KeyboardInterrupt: 

In [60]:
# lora_model.train(train_loader, val_loader, lr = 1e-4, num_epochs = 1 )



Loss: 0.17254927158355712 accuracy: 0.4




Loss: 0.1752468847093128 accuracy: 0.30952380952380953




Loss: 0.17038408256158596 accuracy: 0.32682926829268294




Loss: 0.16696654538639255 accuracy: 0.3081967213114754




Loss: 0.16445770234237483 accuracy: 0.3074074074074074




Loss: 0.16225004207969893 accuracy: 0.3207920792079208




Loss: 0.16097905576721697 accuracy: 0.3231404958677686




Loss: 0.16117140866340474 accuracy: 0.3191489361702128




Loss: 0.1608036308555129 accuracy: 0.3167701863354037




Loss: 0.16108026873340922 accuracy: 0.318232044198895




Loss: 0.16123788054309673 accuracy: 0.3144278606965174




Loss: 0.16131286842251255 accuracy: 0.318552036199095




Loss: 0.16109248038644117 accuracy: 0.3224066390041494




Loss: 0.16095722269737858 accuracy: 0.3245210727969349




Loss: 0.16058310715753413 accuracy: 0.32669039145907475




Loss: 0.1605042103121051 accuracy: 0.3289036544850498




Loss: 0.16046274094566748 accuracy: 0.3283489096573209




Loss: 0.1602448949366371 accuracy: 0.330791788856305




Loss: 0.16062685122450304 accuracy: 0.328808864265928


KeyboardInterrupt: 