In [14]:
import os
import sys

import numpy as np

from finetuning.bert_masking import BertPreTrainedModel_masking
%load_ext autoreload
%autoreload 2

import tqdm
import math
import numpy as np

from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, BertConfig, AutoModelForCausalLM, BertPreTrainedModel
from torch.optim import AdamW
from transformers import get_scheduler
from peft import LoraConfig, PeftModel

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import loralib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# setting device to `cuda` if gpu exists
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# initialising the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
bert = AutoModel.from_pretrained("google/bert_uncased_L-2_H-128_A-2", device_map="auto")
# bert = BertPreTrainedModel_masking.from_pretrained("google/bert_uncased_L-2_H-128_A-2", device_map="auto")

# Define LoRA parameters
rank = 8
num_adapters = 2

def tokenize_function(examples):
    '''Function for tokenizing raw texts'''
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# downloading IMDB dataset from 🤗 `datasets`
raw_datasets = load_dataset("imdb")

# Running tokenizing function on the raw texts
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# for simplicity I have taken only the train split
tokenized_datasets = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))



In [3]:
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-1): 2 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=128, out_features=128, bias=True)
            (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [3]:
# Now lets create the torch Dataset class
class IMDBClassificationDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        d = self.dataset[idx]

        ids = torch.tensor(d['input_ids'])
        mask = torch.tensor(d['attention_mask'])
        label = torch.tensor(d['label'])
        return ids, mask, label
    
    
# Preparing the dataset and the Dataloader
dataset = IMDBClassificationDataset(tokenized_datasets)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=8)

In [26]:
class Adapter(nn.Linear, loralib.LoRALayer):
    # LoRA implemented in a dense layer
    def __init__(
        self,
        linear: nn.Linear,
        in_features: int, 
        out_features: int, 
        r: int = 0, 
        lora_alpha: int = 1, 
        lora_dropout: float = 0.,
        fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        merge_weights: bool = True,
        **kwargs
    ):
        nn.Linear.__init__(self, in_features, out_features, bias=False, **kwargs)
        self.linear = linear
        loralib.LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
                           merge_weights=merge_weights)

        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
            self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.transpose(0, 1)

    def reset_parameters(self):
        if hasattr(self, 'lora_A'):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w
        
        self.linear.train(mode)
        
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0:
                    self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0:
                    self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
                self.merged = True   

    def forward(self, x: torch.Tensor):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w
        if self.r > 0 and not self.merged:
            result = self.linear(x)            
            result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling
            return result
        else:
            return self.linear(x)

In [45]:
class CustomBert(transformers.PreTrainedModel):
    def __init__(self, bert):
        super(CustomBert, self).__init__(config=BertConfig.from_pretrained('google/bert_uncased_L-2_H-128_A-2'))
        self.bert = bert
        self.l1 = nn.Linear(128, 1)
        self.do = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
        # Add LoRA layers to the BERT model
        for i, (name, module) in enumerate(self.bert.named_modules()):
            if isinstance(module, nn.Linear) and "encoder" in name and "attention" in name:
                idx = int(name.split(".")[2])
                if "query" in name:
                    self.bert.encoder.layer[idx].attention.self.query = Adapter(module, module.in_features, module.out_features, r=8, lora_alpha=32, lora_dropout=0.1)
                elif "key" in name:
                    self.bert.encoder.layer[idx].attention.self.key = Adapter(module, module.in_features, module.out_features, r=8, lora_alpha=32, lora_dropout=0.1)  
                elif "value" in name:
                    self.bert.encoder.layer[idx].attention.self.value = Adapter(module, module.in_features, module.out_features, r=8, lora_alpha=32, lora_dropout=0.1)

    def forward(self, sent_id, mask):
        '''For simplicity I have added only one linear layer, you can create any type of network you want'''
        
        bert_out = self.bert(sent_id, attention_mask=mask)
        o = bert_out.last_hidden_state[:,0,:]
        o = self.do(o)
        o = self.relu(o)
        o = self.l1(o)
        o = self.sigmoid(o)
        return o

In [51]:
class ParallelTrainer:
    def __init__(self, base_model, device, num_adapters=2, verbose=False):
        self.base_model = base_model
        self.device = device
        self.num_adapters = num_adapters
        
        # Add LoRA layers to the BERT model
        self.models = []
        
        for _ in range(num_adapters):
            model = CustomBert(bert).to(device)
            loralib.utils.mark_only_lora_as_trainable(model)
            
            if verbose:
                # print the trainable parameters
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        print(name)
                        
            self.models.append(model)                
    
    def train(self, train_dataloader, num_epochs=3):
        for model in self.models:
            model.train()
            criterion = torch.nn.BCELoss()
            optimizer = AdamW(model.parameters(), lr=5e-5)
            num_training_steps = num_epochs * len(train_dataloader)
            lr_scheduler = get_scheduler(
                "linear",
                optimizer=optimizer,
                num_warmup_steps=0,
                num_training_steps=num_training_steps
            )

            for epoch in tqdm.tqdm(range(num_epochs)):
                for batch in train_dataloader:
                    ids, masks, labels = batch
                    labels = labels.type(torch.float32)
                    o = model(ids.to(device), masks.to(device))
                    loss = criterion(torch.squeeze(o), labels.to(device))
                    loss.backward()

                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()      

In [56]:
def print_size_of_model(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    
    size_all_mb = (param_size + buffer_size) / 1024**2
    print('model size: {:.3f}MB'.format(size_all_mb))

In [52]:
# Initialize the parallel trainer
trainer = ParallelTrainer(bert, device, num_adapters=2, verbose=True)

bert.encoder.layer.0.attention.self.query.lora_A
bert.encoder.layer.0.attention.self.query.lora_B
bert.encoder.layer.0.attention.self.key.lora_A
bert.encoder.layer.0.attention.self.key.lora_B
bert.encoder.layer.0.attention.self.value.lora_A
bert.encoder.layer.0.attention.self.value.lora_B
bert.encoder.layer.1.attention.self.query.lora_A
bert.encoder.layer.1.attention.self.query.lora_B
bert.encoder.layer.1.attention.self.key.lora_A
bert.encoder.layer.1.attention.self.key.lora_B
bert.encoder.layer.1.attention.self.value.lora_A
bert.encoder.layer.1.attention.self.value.lora_B
bert.encoder.layer.0.attention.self.query.lora_A
bert.encoder.layer.0.attention.self.query.lora_B
bert.encoder.layer.0.attention.self.key.lora_A
bert.encoder.layer.0.attention.self.key.lora_B
bert.encoder.layer.0.attention.self.value.lora_A
bert.encoder.layer.0.attention.self.value.lora_B
bert.encoder.layer.1.attention.self.query.lora_A
bert.encoder.layer.1.attention.self.query.lora_B
bert.encoder.layer.1.attention.s

In [53]:
trainer.train(train_dataloader, num_epochs=1)

100%|██████████| 1/1 [00:01<00:00,  1.36s/it]
100%|██████████| 1/1 [00:00<00:00,  1.10it/s]


In [59]:
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-1): 2 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Adapter(
              in_features=128, out_features=128, bias=False
              (linear): Linear(in_features=128, out_features=128, bias=True)
              (lora_dropout): Dropout(p=0.1, inplace=False)
            )
            (key): Adapter(
              in_features=128, out_features=128, bias=False
              (linear): Linear(in_features=128, out_features=128, bias=True)
              (lora_dropout): Dropout(p=0.1, inplace=False)
            )
            (value): Adapter(
              in_

In [58]:
# initialising model, loss and optimizer
model = CustomBert(bert)
model

CustomBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Adapter(
                in_features=128, out_features=128, bias=False
                (linear): Linear(in_features=128, out_features=128, bias=True)
                (lora_dropout): Dropout(p=0.1, inplace=False)
              )
              (key): Adapter(
                in_features=128, out_features=128, bias=False
                (linear): Linear(in_features=128, out_features=128, bias=True)
                (lora_dropout): Dropout(p=0.1, inplace=Fals

In [29]:
model.to(device)
criterion = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# setting epochs, num_training_steps and the lr_scheduler
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# training loop
model.train()
for epoch in tqdm.tqdm(range(num_epochs)):
    for batch in train_dataloader:
        ids, masks, labels = batch
        labels = labels.type(torch.float32)
        o = model(ids.to(device), masks.to(device))
        loss = criterion(torch.squeeze(o), labels.to(device))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

100%|██████████| 1/1 [00:01<00:00,  1.72s/it]


# Parallel
---
# Masking

In [114]:
# setting device to `cuda` if gpu exists
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# initialising the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
bert = AutoModel.from_pretrained("google/bert_uncased_L-2_H-128_A-2", device_map="auto")
# bert = BertPreTrainedModel_masking.from_pretrained("google/bert_uncased_L-2_H-128_A-2", device_map="auto")

# Define LoRA parameters
rank = 8
num_adapters = 2

In [16]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

raw_datasets = load_dataset("imdb")
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))

# Now lets create the torch Dataset class
class IMDBClassificationDataset(Dataset):
    def __init__(self, dataset, lora_cnt=2, id=None):
        self.dataset = dataset
        self.id = id
        self.lora_cnt = lora_cnt

    def __len__(self):
        if self.id is not None:
            return len(self.dataset)
        else:
            return len(self.dataset)

    def __getitem__(self, idx):
        d = self.dataset[idx]
        masking = torch.zeros(self.lora_cnt)
        
        if self.id is not None:
            masking[self.id] = 1           
        else:
            masking[idx % self.lora_cnt] = 1
        
        ids = torch.tensor(d['input_ids'])
        mask = torch.tensor(d['attention_mask'])
        label = torch.tensor(d['label'])
        return ids, mask, label, masking

In [30]:
import math
import torch.nn.functional as F

class Adapter(nn.Linear, loralib.LoRALayer):
    # LoRA implemented in a dense layer
    def __init__(
        self, 
        in_features: int, 
        out_features: int, 
        r: int = 0, 
        lora_alpha: int = 1, 
        lora_dropout: float = 0.,
        fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        merge_weights: bool = True,
        **kwargs
    ):
        loralib.LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
                           merge_weights=merge_weights)

        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
            self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.transpose(0, 1)

    def reset_parameters(self):
        if hasattr(self, 'lora_A'):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w
        
        # Merge the weights and mark it
        if self.r > 0:
            self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
        self.merged = True       

    def forward(self, x: torch.Tensor):
        if self.r > 0 and not self.merged:          
            result = (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling
            return result
        else:
            return None

In [113]:
class MultiLinear(nn.Linear, loralib.LoRALayer):
    def __init__(self, linear, in_features, out_features, r, lora_alpha, lora_dropout, num_adapters=2, merge_weights = True, **kwargs):
        self.num_adapters = num_adapters
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        self.linear = linear
        loralib.LoRALayer.__init__(self, r, lora_alpha, lora_dropout, merge_weights)
        self.adapters = nn.ModuleList(loralib.Linear(in_features, out_features, r, lora_alpha, lora_dropout, merge_weights) for _ in range(num_adapters))

    def forward(self, x, masking):
        result = None
        
        for i in range(self.num_adapters):
            def T(w):
                return w.transpose(0, 1) if self.adapters[i].fan_in_fan_out else w
            
            if self.adapters[i].r > 0 and not self.adapters[i].merged:
                if result is None:
                    result = self.linear(x)          
                result += (self.adapters[i].lora_dropout(x) @ self.adapters[i].lora_A.transpose(0, 1) @ self.adapters[i].lora_B.transpose(0, 1)) * self.adapters[i].scaling * masking[:, i].view(-1, 1, 1)
            else:
                result += self.linear(x)
                
        return result

class CustomBert(transformers.PreTrainedModel):
    def __init__(self, bert, num_adapters=2):
        super().__init__(config=BertConfig.from_pretrained('google/bert_uncased_L-2_H-128_A-2'))
        self.bert = bert
        self.l1 = nn.Linear(128, 1)
        self.do = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Add LoRA layers to the BERT model
        for i, (name, module) in enumerate(self.bert.named_modules()):
            if isinstance(module, nn.Linear) and "encoder" and "attention" in name:
                idx = int(name.split(".")[2])
                if "query" in name:
                    self.bert.encoder.layer[idx].attention.self.query = MultiLinear(module,
                        module.in_features, module.out_features, r=8, lora_alpha=32, lora_dropout=0.1, num_adapters=num_adapters
                    )
                    
                    assert torch.allclose(module.weight, self.bert.encoder.layer[idx].attention.self.query.linear.weight)
                    
                elif "key" in name:
                    self.bert.encoder.layer[idx].attention.self.key = MultiLinear(module,
                        module.in_features, module.out_features, r=8, lora_alpha=32, lora_dropout=0.1, num_adapters=num_adapters
                    )
                    
                    assert torch.allclose(module.weight, self.bert.encoder.layer[idx].attention.self.key.linear.weight)
                elif "value" in name:
                    self.bert.encoder.layer[idx].attention.self.value = MultiLinear(module,
                        module.in_features, module.out_features, r=8, lora_alpha=32, lora_dropout=0.1, num_adapters=num_adapters
                    )
                    
                    assert torch.allclose(module.weight, self.bert.encoder.layer[idx].attention.self.value.linear.weight)
                        
                
                
    def forward(self, x, mask, masking):
        bert_out = self.bert(x, attention_mask=mask, masking=masking)
        o = bert_out.last_hidden_state[:,0,:]
        o = self.do(o)
        o = self.relu(o)
        o = self.l1(o)
        o = self.sigmoid(o)
        return o

In [115]:
# Initialize the custom model
model = CustomBert(bert).to(device)

# Mark only LoRA parameters as trainable
loralib.utils.mark_only_lora_as_trainable(model)

# Training loop
model.train()
criterion = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)
dataset = IMDBClassificationDataset(tokenized_datasets)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=8)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [116]:
for epoch in tqdm.tqdm(range(num_epochs)):
    for batch in train_dataloader:
        ids, masks, labels, masking = batch
        labels = labels.type(torch.float)
        o = model(ids.to(device), masks.to(device), masking.to(device))

        loss = criterion(torch.squeeze(o), labels.to(device))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

100%|██████████| 3/3 [00:04<00:00,  1.63s/it]


In [91]:
# save the tokenizer and the model in `./test-model/` directory 
tokenizer.save_pretrained("./test-model/")
model.save_pretrained("./test-model/", push_to_hub=False)

ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.linear.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

In [None]:
from transformers import pipeline

# as this is classification so you need to mention `text-classification` as task
classifier = pipeline('text-classification', model='tanmoyio/test-model')
classifier("This movie was superb")
