In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 70.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 75.3 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 73.4 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score
from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from google.colab import drive

# Mount at the absolute path used by the save/load cells further down
# ('/content/drive/MyDrive/...') instead of depending on the working directory.
drive.mount('/content/drive')

Mounted at ./drive


In [None]:
# Load the train / validation / test splits of the asset-type dataset.
df_train = pd.read_csv('/content/h2i_01_asset_type_train_v1.csv')
df_valid = pd.read_csv('/content/h2i_01_asset_type_valid_v1.csv')
df_test = pd.read_csv('/content/h2i_01_asset_type_test_v1.csv')

In [None]:
# Number of distinct target classes (a NaN label, if present, still counts as one class).
num_labels = df_train['label'].nunique(dropna=False)
df_train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,2428,Как инвестировать в недвижимость подскажи,0
1,3283,можно мне вложиться в инвестиции,1
2,1074,Инвестиции в золото это будет лучше инвестиции,0
3,2490,Скажи мне вот меня интересует вопрос инвестици...,0
4,366,Во что инвестирует чаще всего?,1


In [None]:
# Hugging Face Hub id of the pretrained checkpoint used for the tokenizer below.
model_checkpoint = 'DeepPavlov/rubert-base-cased-conversational'

In [None]:
# Reuse `model_checkpoint` so the model and the tokenizer always come from the
# same checkpoint; the classification head is sized to the number of labels.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,
                                                           num_labels=num_labels)
# Moving to 'cpu' is a no-op for a freshly loaded model, so no device check is needed.
model.to(device);

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 0. Обучим базовую модель на наших данных, глянем метрики и количество параметров в основных слоях

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized inputs with their labels.

    Args:
        tokenized_texts: mapping from feature name (e.g. ``input_ids``) to an
            indexable container holding one entry per example.
        labels: indexable container with one label per example. A pandas
            Series is read positionally, so a non-default index is safe.
        device: device every returned tensor is moved to.

    Each item is a dict with one tensor per feature plus a ``labels`` tensor,
    all placed on ``device``.
    """

    def __init__(self, tokenized_texts, labels, device=device):
        self.tokenized_texts = tokenized_texts
        self.labels = labels
        self.device = device

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(<tensor>) also copies, but emits a UserWarning on every
        # call; clone().detach() is the recommended equivalent.
        if torch.is_tensor(value):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {
            key: self._as_tensor(val[idx]).to(self.device)
            for key, val in self.tokenized_texts.items()
        }
        labels = self.labels
        # Positional lookup for pandas objects; plain indexing for lists/arrays.
        label = labels.iloc[idx] if hasattr(labels, 'iloc') else labels[idx]
        item['labels'] = self._as_tensor(label).to(self.device)
        return item

In [None]:

def _tokenize_split(frame):
    """Tokenize the ``text`` column of one split, padded/truncated to 512 tokens."""
    texts = list(frame['text'].values)
    return tokenizer(texts, padding="max_length", truncation=True,
                     return_tensors='pt', max_length=512)


train_tokenized_dataset = _tokenize_split(df_train)
valid_tokenized_dataset = _tokenize_split(df_valid)
test_tokenized_dataset = _tokenize_split(df_test)

# Wrap every tokenized split together with its labels.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(encoded, frame['label'])
    for encoded, frame in (
        (train_tokenized_dataset, df_train),
        (valid_tokenized_dataset, df_valid),
        (test_tokenized_dataset, df_test),
    )
)

In [None]:
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Accuracy of the arg-max class over the logits of an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the result is whatever
    ``metric.compute`` returns for those predictions and references.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Fine-tuning configuration for the baseline model.
training_args = TrainingArguments(
    output_dir='./results',          # directory for checkpoints
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size per device during evaluation
    warmup_steps=500,                # number of learning-rate warmup steps
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,            # keep only the newest checkpoint; older ones are deleted
    dataloader_pin_memory=False     # author's note: enabling pinning raised an error during training (presumably because CustomDataset already returns tensors on `device` - TODO confirm)
)

# Trainer evaluates on the validation split and reports accuracy via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the baseline model on the training split.
trainer.train()

***** Running training *****
  Num examples = 2602
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1630
  # This is added back by InteractiveShellApp.init_path()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.2098,0.96
2,0.256000,0.154387,0.965714


***** Running Evaluation *****
  Num examples = 350
  Batch size = 2
  # This is added back by InteractiveShellApp.init_path()
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  # This is added back by InteractiveShellApp.init_path()
***** Running Evaluation *****
  Num examples = 350
  Batch size = 2
  # This is added back by InteractiveShellApp.init_path()


KeyboardInterrupt: ignored

## Num of params

In [None]:
def _count_parameters(module):
    """Total number of scalar parameters held by ``module``."""
    return sum(tensor.numel() for tensor in module.parameters())


total_params_embeddings = _count_parameters(model.bert.embeddings)
total_params_encoder = _count_parameters(model.bert.encoder)


In [None]:
# Report the parameter counts of the two largest parts of the network.
print(f'Count of encoder parameters: {total_params_encoder}')
print(f'Count of embeddings parameters: {total_params_embeddings}')

encoder + embedding ~ 98% of all params

## Get metrics of model

In [None]:
predicted_labels = []
true_labels = []
# Training (in particular an interrupted run) can leave the model in train
# mode, where dropout makes predictions non-deterministic.
model.eval()
# Gradients are not needed for evaluation; skipping autograd saves memory and time.
with torch.no_grad():
  for sample in test_dataset:
    true_label = sample['labels']
    # The model expects a batch dimension, so every sample becomes a batch of one.
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = model(input_ids=input_ids, attention_mask=attention_mask,
                   token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

## Inference time on CPU/GPU

In [None]:
# GPU Inference time.
%%timeit
c=0
for sample in test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c+=1

In [None]:
# Move the model to the CPU for the CPU timing below (`;` suppresses the module repr).
model.to('cpu');

In [None]:
# CPU Inference time.
%%timeit
c_cpu=0
for sample in test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
  attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
  token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c_cpu+=1
  if c_cpu % 100 == 0:
    break

  del sys.path[0]


2min 31s ± 1.24 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Report F1 on the test split and per-sample latency.
# The latency numerators are hard-coded timings: 151 s is the "2min 31s"
# measured over the first 100 samples on CPU; 12.6 s is presumably the
# %%timeit result of the GPU cell over the whole test set (its output is not
# preserved here - TODO confirm).
test_score = f1_score(true_labels, predicted_labels)
print(f'f1_score: {test_score}')
print(f'Inference time on GPU: {12.6/len(test_dataset)} seconds')
print(f'Inference time on CPU: {151/100} seconds')

f1_score: 0.9667673716012085
Inference time on GPU: 0.036 seconds
Inference time on CPU: 1.51 seconds


## Model save

In [None]:
model_save_dir = '/content/drive/MyDrive/models/finetuned_bert/'
os.makedirs(model_save_dir, exist_ok=True)
# Later cells reload this directory with `from_pretrained`, which needs
# config.json and the tokenizer files in addition to the weights, so save the
# model and tokenizer in the Hugging Face layout instead of a bare state_dict.
model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

# 1. Quantization

## 1.1 Quantization Linear

Загружаю обученную на наших данных модель из гугл диска. Делаю это по той причине, что модель обучалась на GPU, и после квантизации модели не получится делать инференс, поскольку квантизированные слои пока поддерживают только CPU. Перекладка модели на CPU через pytorch не помогает, поэтому приходится загружать модель снова

In [None]:
# Reload the fine-tuned model and its tokenizer from Google Drive.
model = AutoModelForSequenceClassification.from_pretrained(model_save_dir);
tokenizer = AutoTokenizer.from_pretrained(model_save_dir);

In [None]:
# Dynamic quantization: the weights of every nn.Linear are stored as int8.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

## Check the model size

In [None]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is written to the scratch file ``temp.p`` in the working
    directory, measured, and the file is removed again - also when saving or
    measuring fails.
    """
    try:
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p")/1e6)
    finally:
        # Never leave a (possibly partial) scratch file behind.
        if os.path.exists('temp.p'):
            os.remove('temp.p')

# Compare the serialized size of the float model with its Linear-quantized copy.
print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 711.502125
Size (MB): 454.981957


## Get metrics

In [None]:
# Same test split, but with tensors kept on the CPU.
cpu_test_dataset = CustomDataset(test_tokenized_dataset, df_test['label'], 'cpu')

In [None]:
predicted_labels = []
true_labels = []
quantized_model.eval()
# Gradients are not needed for evaluation; skipping autograd saves memory and time.
with torch.no_grad():
  for sample in cpu_test_dataset:
    true_label = sample['labels']
    # The model expects a batch dimension, so every sample becomes a batch of one.
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = quantized_model(input_ids=input_ids, attention_mask=attention_mask,
                             token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()


In [None]:
%%timeit
# Time the Linear-quantized model on the first 100 CPU test samples.
c=0
for sample in cpu_test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = quantized_model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c+=1
  # Stop after 100 samples to keep the benchmark short.
  if c % 100 == 0:
    break

  # This is added back by InteractiveShellApp.init_path()


1min 58s ± 2.09 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# 118 s is the hard-coded "1min 58s" timing measured above over 100 samples.
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on CPU: {118/100}')

F1_score: 0.9667673716012085
Inference time on CPU: 1.18


## Save quantized model

In [None]:
quantized_model_save_dir = '/content/drive/MyDrive/models/quantized_bert/'
# torch.save does not create missing directories.
os.makedirs(quantized_model_save_dir, exist_ok=True)
torch.save(quantized_model.state_dict(), quantized_model_save_dir+'pytorch_model.bin')

## 1.2 Quantize linear + embedding layers

In [None]:
# Make sure the float model is on the CPU before quantizing it.
model.to('cpu');

In [None]:
from torch.quantization.qconfig import float_qparams_weight_only_qconfig, default_dynamic_qconfig

# Per-module-type quantization configs: embeddings use the weight-only
# float-qparams config, linear layers the default dynamic config.
qconfig_dict = {
    torch.nn.Embedding : float_qparams_weight_only_qconfig,
    torch.nn.Linear: default_dynamic_qconfig
}
full_quantized_model = torch.quantization.quantize_dynamic(model, qconfig_dict)

In [None]:
predicted_labels = []
true_labels = []
full_quantized_model.eval()
# Gradients are not needed for evaluation; skipping autograd saves memory and time.
with torch.no_grad():
  for sample in tqdm(cpu_test_dataset):
    true_label = sample['labels']
    # The model expects a batch dimension, so every sample becomes a batch of one.
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = full_quantized_model(input_ids=input_ids, attention_mask=attention_mask,
                                  token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()
  9%|▉         | 33/350 [00:42<06:50,  1.30s/it]


KeyboardInterrupt: ignored

In [None]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is written to the scratch file ``temp.p`` in the working
    directory, measured, and the file is removed again - also when saving or
    measuring fails.
    """
    try:
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p")/1e6)
    finally:
        # Never leave a (possibly partial) scratch file behind.
        if os.path.exists('temp.p'):
            os.remove('temp.p')

# Float model vs. the copy with quantized Linear and Embedding layers.
print_size_of_model(model)
print_size_of_model(full_quantized_model)

Size (MB): 711.502125
Size (MB): 179.324089


In [None]:
%%timeit
# Time the Linear+Embedding-quantized model on the first 100 CPU test samples.
c=0
for sample in cpu_test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = full_quantized_model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c+=1
  # Stop after 100 samples to keep the benchmark short.
  if c % 100 == 0:
    break

In [None]:
# 113 s is a hard-coded timing, presumably the %%timeit result of the cell
# above over 100 samples (its output is not preserved here - TODO confirm).
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on CPU: {113/100}')

F1_score: 0.9565217391304348
Inference time on CPU: 1.18


# 2. Matrix Decomposition

In [None]:
!pip3 install tensorly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorly
  Downloading tensorly-0.7.0-py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 5.1 MB/s 
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[K     |████████████████████████████████| 154 kB 71.3 MB/s 
Installing collected packages: nose, tensorly
Successfully installed nose-1.3.7 tensorly-0.7.0


In [None]:
import tensorly
# Make tensorly operate on (and return) PyTorch tensors.
tensorly.set_backend('pytorch')
from tensorly.decomposition import tucker, partial_tucker
from tensorly.tenalg import mode_dot

## Tucker decomposition

In [None]:
def tucker_decomposition_conv_layer(weight, bias, rank, device):
    """Factor a kernel-size-1 convolution weight into three smaller convolutions.

    A partial Tucker decomposition over modes 0 and 1 approximates ``weight``
    by ``last @ core @ first.T``; each factor becomes the weight of one
    ``Conv1d`` with ``kernel_size=1``.

    Args:
        weight: 3-D weight tensor; as used by ``TuckerLinear`` its layout is
            (out_channels, in_channels, 1).
        bias: bias of the original layer (one value per output channel), or
            ``None`` when the original layer has no bias.
        rank: ``[rank_mode0, rank_mode1]`` - Tucker ranks for the two
            decomposed modes.
        device: device the new layers are moved to.

    Returns:
        ``[first_layer, core_layer, last_layer]``, to be applied in that order.
    """
    core, [last, first] = partial_tucker(
        weight.data, modes=[0, 1], rank=rank, init='svd')

    # Only the final projection may carry a bias. The two inner convolutions
    # must be pure linear maps: a default (randomly initialised) bias there
    # would add noise that is not part of the decomposition.
    first_layer = torch.nn.Conv1d(in_channels=first.shape[0],
                                  out_channels=first.shape[1],
                                  kernel_size=1, bias=False)
    core_layer = torch.nn.Conv1d(in_channels=core.shape[1],
                                 out_channels=core.shape[0],
                                 kernel_size=1, bias=False)
    last_layer = torch.nn.Conv1d(in_channels=last.shape[1],
                                 out_channels=last.shape[0],
                                 kernel_size=1, bias=bias is not None)

    if bias is not None:
        last_layer.bias.data = bias.data

    # Conv1d weights are (out_channels, in_channels, kernel_size).
    first_layer.weight.data = torch.transpose(first, 1, 0).unsqueeze(-1).contiguous()
    last_layer.weight.data = last.unsqueeze(-1)
    core_layer.weight.data = core

    return [first_layer.to(device), core_layer.to(device), last_layer.to(device)]

In [None]:
class TuckerLinear(torch.nn.Module):
    """Low-rank replacement for a linear layer, built from its weight and bias.

    The weight is Tucker-decomposed into three kernel-size-1 convolutions
    (see ``tucker_decomposition_conv_layer``) that are applied in sequence.

    Args:
        fc_w: 2-D weight of the original layer, laid out (out_features,
            in_features) as in ``torch.nn.Linear``.
        fc_b: bias of the original layer.
        rank: ``[rank_out, rank_in]`` Tucker ranks.
    """

    def __init__(self, fc_w, fc_b, rank):
        super().__init__()
        # (out_features, in_features) is exactly the (out_channels,
        # in_channels) layout of a kernel-size-1 Conv1d, so the weight is used
        # as is - also for square matrices. Transposing a square weight would
        # make the layer approximate W.T instead of W.
        # The helper already moves the layers to `device`.
        self.first_layer, self.core_layer, self.last_layer = \
            tucker_decomposition_conv_layer(fc_w.unsqueeze(2), fc_b,
                                            rank=rank, device=device)

    def forward(self, x):
        """Apply the factorized layer to a 3-D input with features in the last dim."""
        # Conv1d wants channels in dim 1, so swap the last two dims on the way
        # in and swap them back on the way out. The convolutions are
        # point-wise, so no further reshuffling is needed between them.
        hidden = x.transpose(1, 2)
        hidden = self.first_layer(hidden)
        hidden = self.core_layer(hidden)
        hidden = self.last_layer(hidden)
        return hidden.transpose(1, 2).contiguous()

In [None]:
def get_predict(model, dataset=None):
  """Run ``model`` sample by sample and collect true and predicted labels.

  Args:
    model: module called with ``input_ids``, ``attention_mask`` and
      ``token_type_ids`` keyword arguments; its output must expose ``.logits``.
      It is switched to evaluation mode as a side effect.
    dataset: iterable of dicts with those three keys plus ``labels``; the
      tensors must live on the same device as ``model``. Defaults to the
      notebook-level ``test_dataset``.

  Returns:
    ``(true_labels, predicted_labels)`` - two lists of Python ints.
  """
  if dataset is None:
    dataset = test_dataset
  predicted_labels = []
  true_labels = []
  # Disable dropout; a model coming out of training may still be in train mode.
  model.eval()
  # Gradients are not needed for evaluation; skipping autograd saves memory and time.
  with torch.no_grad():
    for sample in dataset:
      # The model expects a batch dimension, so every sample becomes a batch of one.
      logits = model(input_ids=sample['input_ids'].unsqueeze(0),
                     attention_mask=sample['attention_mask'].unsqueeze(0),
                     token_type_ids=sample['token_type_ids'].unsqueeze(0)).logits
      predicted_labels.append(logits.argmax().item())
      true_labels.append(sample['labels'].item())
  return true_labels, predicted_labels

In [None]:
# Start the decomposition experiment from the fine-tuned model on Google Drive.
save_pretrained = '/content/drive/MyDrive/models/finetuned_bert/'
model = AutoModelForSequenceClassification.from_pretrained(save_pretrained);
tokenizer = AutoTokenizer.from_pretrained(save_pretrained);

Изначальное качество модели ухудшается с увеличением количества слоёв, которые декомпозируем. Качество возвращается при повторном обучении в небольшое кол-во эпох.

In [None]:
def decompose_layer(layer, rank):
    """Build the Tucker-factorized replacement for a linear ``layer``.

    Args:
        layer: module exposing ``weight`` and ``bias`` (e.g. ``torch.nn.Linear``).
        rank: ``[rank_out, rank_in]`` Tucker ranks.

    Returns:
        A ``TuckerLinear`` built from a CPU copy of the weight and the bias.
    """
    fc_w = layer.weight.data.cpu()
    fc_b = layer.bias
    return TuckerLinear(fc_w, fc_b, rank=rank)


rank = [20, 20]
# Dense projections of every encoder layer to factorize, as attribute paths
# relative to the layer.
dense_paths = (
    'attention.self.query',
    'attention.self.key',
    'attention.self.value',
    'attention.output.dense',
    'intermediate.dense',
    'output.dense',
)
# Iterate over the layers themselves rather than hard-coded indices 0..11.
for bert_layer in model.bert.encoder.layer:
    for dense_path in dense_paths:
        *parent_names, leaf_name = dense_path.split('.')
        owner = bert_layer
        for parent_name in parent_names:
            owner = getattr(owner, parent_name)
        dense = getattr(owner, leaf_name)
        # Skip layers that were already replaced so the cell can be re-run.
        if isinstance(dense, TuckerLinear):
            continue
        setattr(owner, leaf_name, decompose_layer(dense, rank))

  # This is added back by InteractiveShellApp.init_path()


TypeError: ignored

In [None]:
# F1 of the decomposed model on the test split, before any further fine-tuning.
true_labels, predicted_labels = get_predict(model.to(device))
f1_score(true_labels, predicted_labels)

  # This is added back by InteractiveShellApp.init_path()


0.6460348162475822

In [None]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is written to the scratch file ``temp.p`` in the working
    directory, measured, and the file is removed again - also when saving or
    measuring fails.
    """
    try:
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p")/1e6)
    finally:
        # Never leave a (possibly partial) scratch file behind.
        if os.path.exists('temp.p'):
            os.remove('temp.p')

# Size of the Tucker-decomposed model.
print_size_of_model(model)

Size (MB): 385.280109


## Briefly fine-tune the model

In [None]:
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Accuracy of the arg-max class over the logits of an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the result is whatever
    ``metric.compute`` returns for those predictions and references.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

In [None]:
# Short fine-tuning run for the decomposed model.
training_args = TrainingArguments(
    output_dir='./results',          # directory for checkpoints
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size per device during evaluation
    warmup_steps=500,                # number of learning-rate warmup steps
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,            # keep only the newest checkpoint; older ones are deleted
    dataloader_pin_memory=False     # author's note: enabling pinning raised an error during training
)

# Trainer evaluates on the validation split and reports accuracy via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fine-tune the decomposed model to recover the quality lost by the factorization.
trainer.train()

***** Running training *****
  Num examples = 2602
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 326
  # This is added back by InteractiveShellApp.init_path()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.25055,0.948571


***** Running Evaluation *****
  Num examples = 350
  Batch size = 2


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=326, training_loss=0.16730065140987466, metrics={'train_runtime': 169.7085, 'train_samples_per_second': 15.332, 'train_steps_per_second': 1.921, 'total_flos': 32476269244416.0, 'train_loss': 0.16730065140987466, 'epoch': 1.0})

## Metrics

In [None]:
# Collect test-set labels and predictions of the fine-tuned decomposed model.
true_labels, predicted_labels = get_predict(model)


  # This is added back by InteractiveShellApp.init_path()


In [None]:
# GPU inference time
%%timeit
for sample in test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits

  # This is added back by InteractiveShellApp.init_path()


12.9 s ± 301 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Move the decomposed model to the CPU for the CPU timing below.
model.to('cpu');

In [None]:
# CPU inference time
%%timeit
c = 0
for sample in cpu_test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c += 1
  if c == 100:
    break

  # This is added back by InteractiveShellApp.init_path()


1min 18s ± 655 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# The numerators are the hard-coded timings measured above: 12.9 s for the
# whole test set on GPU, 78 s ("1min 18s") for 100 samples on CPU.
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on GPU: {12.9/len(test_dataset)}')
print(f'Inference time on CPU: {78/100}')

F1_score: 0.9665653495440729
Inference time on GPU: 0.03685714285714286
Inference time on CPU: 0.78


In [None]:
decomposed_model_save_dir = '/content/drive/MyDrive/models/decomposed_bert/'
# torch.save does not create missing directories.
os.makedirs(decomposed_model_save_dir, exist_ok=True)
torch.save(model.state_dict(), decomposed_model_save_dir+'pytorch_model.bin')

# 3. Matrix Decomposition + Quantization emb layer

In [None]:
# Start again from the fine-tuned model on Google Drive.
save_pretrained = '/content/drive/MyDrive/models/finetuned_bert/'
decomposed_model = AutoModelForSequenceClassification.from_pretrained(save_pretrained);
tokenizer = AutoTokenizer.from_pretrained(save_pretrained);

In [None]:
def decompose_layer(layer, rank):
    """Build the Tucker-factorized replacement for a linear ``layer``.

    Args:
        layer: module exposing ``weight`` and ``bias`` (e.g. ``torch.nn.Linear``).
        rank: ``[rank_out, rank_in]`` Tucker ranks.

    Returns:
        A ``TuckerLinear`` built from a CPU copy of the weight and the bias.
    """
    fc_w = layer.weight.data.cpu()
    fc_b = layer.bias
    return TuckerLinear(fc_w, fc_b, rank=rank)


rank = [20, 20]
# Dense projections of every encoder layer to factorize, as attribute paths
# relative to the layer.
dense_paths = (
    'attention.self.query',
    'attention.self.key',
    'attention.self.value',
    'attention.output.dense',
    'intermediate.dense',
    'output.dense',
)
# Iterate over the layers themselves rather than hard-coded indices 0..11.
for bert_layer in decomposed_model.bert.encoder.layer:
    for dense_path in dense_paths:
        *parent_names, leaf_name = dense_path.split('.')
        owner = bert_layer
        for parent_name in parent_names:
            owner = getattr(owner, parent_name)
        dense = getattr(owner, leaf_name)
        # Skip layers that were already replaced so the cell can be re-run.
        if isinstance(dense, TuckerLinear):
            continue
        setattr(owner, leaf_name, decompose_layer(dense, rank))



In [None]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is written to the scratch file ``temp.p`` in the working
    directory, measured, and the file is removed again - also when saving or
    measuring fails.
    """
    try:
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p")/1e6)
    finally:
        # Never leave a (possibly partial) scratch file behind.
        if os.path.exists('temp.p'):
            os.remove('temp.p')

# Size of the Tucker-decomposed model.
print_size_of_model(decomposed_model)

Size (MB): 385.279341


In [None]:
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Accuracy of the arg-max class over the logits of an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the result is whatever
    ``metric.compute`` returns for those predictions and references.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

# Fine-tuning configuration for the decomposed model.
training_args = TrainingArguments(
    output_dir='./results',          # directory for checkpoints
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size per device during evaluation
    warmup_steps=500,                # number of learning-rate warmup steps
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,            # keep only the newest checkpoint; older ones are deleted
    dataloader_pin_memory=False     # author's note: enabling pinning raised an error during training
)

# Trainer evaluates on the validation split and reports accuracy via compute_metrics.
trainer = Trainer(
    model=decomposed_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fine-tune the decomposed model for the configured three epochs.
trainer.train()

***** Running training *****
  Num examples = 2602
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1953
  # This is added back by InteractiveShellApp.init_path()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5502,0.206003,0.957143
2,0.2035,0.205351,0.962857
3,0.1282,0.171685,0.971429


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1500] due to args.save_total_limit
  # This is added back by InteractiveShellApp.init_path()
***** Running Evaluation *****
  Num examples = 350
  Batch size = 2
  # This is added back by InteractiveShellApp.init_path()
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-500] due to args.save_total_limit
  # This is added back by InteractiveShellApp.init_path()
***** Running Evaluation *****
  Num examples = 350
  Batch size = 2
  # This is added back by InteractiveShellApp.init_path()
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.j

TrainOutput(global_step=1953, training_loss=0.2414148665670853, metrics={'train_runtime': 650.1683, 'train_samples_per_second': 12.006, 'train_steps_per_second': 3.004, 'total_flos': 97428807733248.0, 'train_loss': 0.2414148665670853, 'epoch': 3.0})

In [None]:
# F1 of the fine-tuned decomposed model on the test split.
true_labels, predicted_labels = get_predict(decomposed_model)
f1_score(true_labels, predicted_labels)

  # This is added back by InteractiveShellApp.init_path()


0.9671641791044777

## Quantize embedding layer

In [None]:
# Drop the reference to the previous experiment's model; only
# `decomposed_model` is used from here on.
del model

In [None]:
# Quantized embeddings run on the CPU only; `;` suppresses the module repr,
# matching the other `.to('cpu');` cells.
decomposed_model.to('cpu');

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): TuckerLinear(
                (first_layer): Conv1d(768, 20, kernel_size=(1,), stride=(1,))
                (core_layer): Conv1d(20, 20, kernel_size=(1,), stride=(1,))
                (last_layer): Conv1d(20, 768, kernel_size=(1,), stride=(1,))
              )
              (key): TuckerLinear(
                (first_layer): Conv1d(768, 20, kernel_size=(1,), stride=(1,))
                (core_layer): Conv1d(20, 20, kernel_size=(1,), stride=(1,)

In [None]:
from torch.quantization.qconfig import float_qparams_weight_only_qconfig, default_dynamic_qconfig

# Quantize only the embedding layers (weight-only float-qparams config); the
# Tucker-decomposed convolutions are left in floating point.
qconfig_dict = {
    torch.nn.Embedding : float_qparams_weight_only_qconfig,
}
quantized_decomposed_model = torch.quantization.quantize_dynamic(decomposed_model, qconfig_dict)

In [None]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is written to the scratch file ``temp.p`` in the working
    directory, measured, and the file is removed again - also when saving or
    measuring fails.
    """
    try:
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p")/1e6)
    finally:
        # Never leave a (possibly partial) scratch file behind.
        if os.path.exists('temp.p'):
            os.remove('temp.p')

# Embedding-quantized decomposed model vs. the float decomposed model.
print_size_of_model(quantized_decomposed_model)
print_size_of_model(decomposed_model)

Size (MB): 109.615265
Size (MB): 385.273261


In [None]:
# Keep the quantized model on the CPU for evaluation and timing.
quantized_decomposed_model.to('cpu');

In [None]:
from tqdm import tqdm

predicted_labels = []
true_labels = []
# The source model went through training; make sure dropout is disabled.
quantized_decomposed_model.eval()
# Gradients are not needed for evaluation; skipping autograd saves memory and time.
with torch.no_grad():
  for sample in tqdm(cpu_test_dataset):
    true_label = sample['labels']
    # The model expects a batch dimension, so every sample becomes a batch of one.
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = quantized_decomposed_model(input_ids=input_ids, attention_mask=attention_mask,
                                        token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 350/350 [04:49<00:00,  1.21it/s]


In [None]:
# CPU inference time
%%timeit
c = 0
for sample in cpu_test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = quantized_decomposed_model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c += 1
  if c == 10:
    break

  # This is added back by InteractiveShellApp.init_path()


8.17 s ± 22.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# 8.17 s is the hard-coded timing measured above over 10 samples.
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on CPU: {8.17/10}')

F1_score: 0.9671641791044777
Inference time on CPU: 0.817


# 4. Distillation

In [None]:
from collections import Counter
from tqdm.auto import tqdm, trange
from transformers import BertForPreTraining, BertModel, BertConfig
from transformers import BertTokenizerFast

## 4.1 Basic Distillation

## Create a reduced-vocabulary WordPiece tokenizer

In [None]:
# Tokenizer of the original checkpoint, with its full vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Training texts and labels as arrays.
X_train, y_train = df_train.text.values, df_train.label.values

In [None]:
# Frequency of every token id that occurs in the tokenized training texts.
cnt = Counter()
for sentence in tqdm(X_train):
    token_ids = tokenizer(sentence)['input_ids']
    cnt.update(token_ids)

  0%|          | 0/2602 [00:00<?, ?it/s]

In [None]:
# Seed the reduced vocabulary with the ids of the tokenizer's special tokens.
resulting_vocab = set(
    tokenizer.vocab[special_token]
    for special_token in tokenizer.special_tokens_map.values()
)

In [None]:
# Add every token id that was seen at least once in the training texts.
resulting_vocab.update(
    token_id for token_id, frequency in cnt.items() if frequency > 0
)

In [None]:
# Turn the id set into a sorted list; its order fixes the new token ids and is
# reused below to slice the teacher embeddings.
# NOTE: this rebinds `resulting_vocab` from a set to a list, so the cell that
# adds ids to it cannot be re-run after this one.
resulting_vocab = sorted(resulting_vocab)
print(len(resulting_vocab))

1516


In [None]:
# Write the tokenizer files; vocab.txt is overwritten with the reduced vocabulary below.
tokenizer.save_pretrained('./bert_distill')

('./bert_distill/tokenizer_config.json',
 './bert_distill/special_tokens_map.json',
 './bert_distill/vocab.txt',
 './bert_distill/added_tokens.json',
 './bert_distill/tokenizer.json')

In [None]:
# Reverse lookup: token id -> token string.
inv_voc = {idx: word for word, idx in tokenizer.vocab.items()}

In [None]:
# Overwrite vocab.txt with the kept tokens, one per line, ordered by original id.
with open('./bert_distill/vocab.txt', 'w', encoding='utf-8') as vocab_file:
    vocab_file.writelines(
        inv_voc[token_id] + '\n' for token_id in resulting_vocab
    )

In [None]:
# Remove the serialized fast tokenizer (it still holds the full vocabulary);
# presumably this forces the reload below to rebuild it from the new vocab.txt.
!rm -rf bert_distill/tokenizer.json

In [None]:
# Reload the tokenizer from the directory with the rewritten vocab.txt.
tokenizer = BertTokenizerFast.from_pretrained('./bert_distill')

In [None]:
# Save again so that tokenizer.json is written back next to vocab.txt.
tokenizer.save_pretrained('./bert_distill')

('./bert_distill/tokenizer_config.json',
 './bert_distill/special_tokens_map.json',
 './bert_distill/vocab.txt',
 './bert_distill/added_tokens.json',
 './bert_distill/tokenizer.json')

## Create distill model.

In [None]:
from transformers import BertForPreTraining, BertModel, BertConfig

In [None]:
# Drop the tokenizer object used while pruning the vocabulary. pop() is used instead of
# `del` so that re-running this cell does not raise NameError when the name is already gone.
globals().pop('tokenizer', None)

NameError: ignored

In [None]:
tokenizer_distill = BertTokenizerFast.from_pretrained('./bert_distill')

In [None]:
# Student architecture: 5 hidden layers, 8 attention heads, hidden and intermediate
# width of 256, and a vocabulary sized to the pruned tokenizer loaded above.
# NOTE(review): `emb_size` does not look like a standard BertConfig argument — confirm it
# has any effect beyond being stored on the config object.
config = BertConfig(
    emb_size=256,
    hidden_size=256,
    intermediate_size=256,
    max_position_embeddings=512,
    num_attention_heads=8,
    num_hidden_layers=5,
    vocab_size=tokenizer_distill.vocab_size
)

In [None]:
model = BertForPreTraining(config)

In [None]:
model.save_pretrained('./bert_distill')

In [None]:
save_pretrained = '/content/drive/MyDrive/models/finetuned_bert/'
teacher = AutoModelForSequenceClassification.from_pretrained(save_pretrained);
tokenizer_teacher = AutoTokenizer.from_pretrained(save_pretrained);

In [None]:
# Initialise the student's input embeddings from the teacher.
# Word embeddings: take only the rows listed in resulting_vocab (in that order) and
# only the first 256 embedding dimensions.
model.bert.embeddings.word_embeddings.weight.data = teacher.bert.embeddings.word_embeddings.weight.data[resulting_vocab, :256].clone()
# Position embeddings: keep every position, truncated to the first 256 dimensions as well.
model.bert.embeddings.position_embeddings.weight.data = teacher.bert.embeddings.position_embeddings.weight.data[:, :256].clone()

In [None]:
model.save_pretrained('./bert_distill')

In [None]:
# Reload the saved student checkpoint with a sequence-classification head.
# The number of classes comes from `num_labels` (derived from the training labels)
# instead of a hard-coded literal, so the head always matches the data.
model = AutoModelForSequenceClassification.from_pretrained('./bert_distill', num_labels=num_labels)

Some weights of the model checkpoint at ./bert_distill were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

## Adapter

In [None]:
# Linear adapter that projects the student's pooled embedding up to the teacher's hidden
# width so the two can be compared with an MSE loss. The sizes are read from the model
# configs rather than hard-coded (256 -> 768 for the models used in this notebook).
adapter_emb = torch.nn.Linear(model.config.hidden_size, teacher.config.hidden_size)

## CustomDatasets

In [None]:
class CustomDatasetDistillation(torch.utils.data.Dataset):
    """Dataset yielding each example encoded for the student and for the teacher.

    Args:
        tokenized_texts_distill: mapping of field name -> indexable batch of encodings
            produced for the student (distilled) model.
        tokenized_texts_teacher: mapping with the same layout, produced for the teacher.
        labels: indexable collection of class labels; ``labels[idx]`` is read for
            ``idx`` in ``range(len(labels))``.
        device: device every returned tensor is moved to.

    Each item is a tuple ``(input_distill, input_teacher, label)`` where the first two
    are dicts of tensors and ``label`` is a ``torch.long`` tensor.
    """
    def __init__(self, tokenized_texts_distill, tokenized_texts_teacher, labels, device=device):
      self.tokenized_texts_distill = tokenized_texts_distill
      self.tokenized_texts_teacher = tokenized_texts_teacher
      self.labels = labels
      self.device = device

    def __len__(self):
      return len(self.labels)

    def _to_device_tensor(self, value):
      """Return a copy of ``value`` as a tensor on ``self.device``."""
      if torch.is_tensor(value):
        # torch.tensor(existing_tensor) emits a UserWarning on every call;
        # clone().detach() is the recommended way to make the same copy.
        copied = value.clone().detach()
      else:
        copied = torch.tensor(value)
      return copied.to(self.device)

    def __getitem__(self, idx):
      input_distill = {key: self._to_device_tensor(val[idx]) for key, val in self.tokenized_texts_distill.items()}
      input_teacher = {key: self._to_device_tensor(val[idx]) for key, val in self.tokenized_texts_teacher.items()}
      label = torch.tensor(self.labels[idx], dtype=torch.long).to(self.device)
      return input_distill, input_teacher, label

# Encode every split twice: once with the reduced student tokenizer and once with the
# teacher tokenizer. All sequences are padded/truncated to 512 tokens.
train_tokenized_dataset_distill = tokenizer_distill(list(df_train['text'].values), padding="max_length",
                              truncation=True, return_tensors='pt', max_length = 512)
valid_tokenized_dataset_distill = tokenizer_distill(list(df_valid['text'].values), padding="max_length",
                              truncation=True, return_tensors='pt', max_length = 512)

train_tokenized_dataset_teacher = tokenizer_teacher(list(df_train['text'].values), padding="max_length",
                              truncation=True, return_tensors='pt', max_length = 512)
valid_tokenized_dataset_teacher = tokenizer_teacher(list(df_valid['text'].values), padding="max_length",
                              truncation=True, return_tensors='pt', max_length = 512)


test_tokenized_dataset_distill = tokenizer_distill(list(df_test['text'].values), padding="max_length",
                              truncation=True, return_tensors='pt', max_length = 512)


train_dataset_distill = CustomDatasetDistillation(train_tokenized_dataset_distill, train_tokenized_dataset_teacher, df_train['label'])
# The validation split must be paired with the teacher encodings of the *validation* texts;
# pairing it with the training encodings compares each sample with an unrelated teacher embedding.
valid_dataset_distill = CustomDatasetDistillation(valid_tokenized_dataset_distill, valid_tokenized_dataset_teacher, df_valid['label'])
# The test split only needs student inputs, so the single-model dataset class is used.
test_dataset_distill = CustomDataset(test_tokenized_dataset_distill, df_test['label'])

## Train Loop Embedding Approximation

In [None]:
# Training batches (32 samples) are reshuffled every epoch; validation batches (8 samples)
# keep the dataset order.
train_dataloader = torch.utils.data.DataLoader(train_dataset_distill, batch_size=32, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset_distill, batch_size=8, shuffle=False)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from itertools import chain

EPOCHS = 10

# The student and the adapter are optimised jointly; parameters that do not
# require gradients are left out of the optimizer.
trainable_params = [
    p
    for p in chain(model.parameters(), adapter_emb.parameters())
    if p.requires_grad
]
optimizer = AdamW(
    params=trainable_params,
    lr=3e-5,
    weight_decay=0.001,
    correct_bias=True,
)

# One scheduler step per batch: 100 warm-up steps out of len(train_dataloader) * EPOCHS.
total_training_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_training_steps,
)

In [None]:
model.to(device);
teacher.to(device);
adapter_emb.to(device);

In [None]:
ce_loss = torch.nn.CrossEntropyLoss().to(device)
mse_loss = torch.nn.MSELoss().to(device)
def loss_fn(teacher_emb, model_emb, true_label, predicted_labels, a):
  """Weighted sum of the classification loss and the embedding-matching loss.

  Computes ``a * CE(predicted_labels, true_label) + (1 - a) * MSE(teacher_emb, model_emb)``.
  ``predicted_labels`` is handed straight to CrossEntropyLoss; the training loop
  passes the student's logits here.
  """
  classification_term = ce_loss(predicted_labels, true_label)
  embedding_term = mse_loss(teacher_emb, model_emb)
  return a*classification_term + (1-a)*embedding_term

# Weight of the cross-entropy term in loss_fn; the MSE (embedding) term gets 1 - alpha.
alpha = 0.75
# The teacher is only used as a fixed reference: eval mode here, no_grad around its forward pass.
teacher.eval()
for ep in range(EPOCHS):
  # ---- training pass ----
  model.train()
  losses = []
  correct_predictions = 0
  optimizer.zero_grad()
  print(f'Train epoch num {ep}')
  for input_distill, input_teacher, label in tqdm(train_dataloader):
    with torch.no_grad():
      teacher_out = teacher(**input_teacher, output_hidden_states=True)
      # Pooler applied to the teacher's last hidden state: the target embedding.
      teacher_emb = teacher.bert.pooler(teacher_out.hidden_states[-1])

    distill_out = model(**input_distill, output_hidden_states=True)
    distill_emb = model.bert.pooler(distill_out.hidden_states[-1])
    # Project the student embedding through the adapter before comparing it with the teacher's.
    distill_emb_adapter = adapter_emb(distill_emb)

    distill_logits = distill_out.logits
    predicted_labels = torch.argmax(distill_logits, dim=1)
    correct_predictions += torch.sum(predicted_labels == label)
    
    loss = loss_fn(teacher_emb, distill_emb_adapter, label, distill_logits, alpha)
    losses.append(loss.item())

    # One optimizer step and one scheduler step per batch.
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  train_acc = correct_predictions.double() / len(train_dataset_distill)
  train_loss = np.mean(losses)
  print(f'Epoch - {ep}, loss: {train_loss}, acc: {train_acc}')
  # ---- validation pass: same forward computation, no gradients, no parameter updates ----
  model.eval()
  losses = []
  correct_predictions = 0
  for input_distill, input_teacher, label in tqdm(valid_dataloader):
    
    with torch.no_grad():
      teacher_out = teacher(**input_teacher, output_hidden_states=True)
      teacher_emb = teacher.bert.pooler(teacher_out.hidden_states[-1])

      distill_out = model(**input_distill, output_hidden_states=True)
      distill_emb = model.bert.pooler(distill_out.hidden_states[-1])
      distill_emb_adapter = adapter_emb(distill_emb)

      distill_logits = distill_out.logits
      predicted_labels = torch.argmax(distill_logits, dim=1)
      correct_predictions += torch.sum(predicted_labels == label)
      
      loss = loss_fn(teacher_emb, distill_emb_adapter, label, distill_logits, alpha)
      losses.append(loss.item())
  valid_acc = correct_predictions.double() / len(valid_dataset_distill)
  print(f'Evaluation loss: {np.mean(losses)}, accuracy: {valid_acc}')

Train epoch num 0


  0%|          | 0/82 [00:00<?, ?it/s]

  if sys.path[0] == '':
  del sys.path[0]


Epoch - 0, loss: 0.619223036417147, acc: 0.5284396617986165


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.6124571372162212, accuracy: 0.5171428571428571
Train epoch num 1


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 1, loss: 0.5838037885543776, acc: 0.6491160645657187


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.4558387893167409, accuracy: 0.8714285714285714
Train epoch num 2


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 2, loss: 0.3115575940507214, acc: 0.8800922367409685


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.27861080691218376, accuracy: 0.9428571428571428
Train epoch num 3


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 3, loss: 0.18235578365260507, acc: 0.9254419677171407


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.29924908483570273, accuracy: 0.9485714285714286
Train epoch num 4


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 4, loss: 0.13037775495521178, acc: 0.9496541122213682


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.30058246690102597, accuracy: 0.9457142857142857
Train epoch num 5


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 5, loss: 0.1105912739365566, acc: 0.9565718677940046


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.29712838357822463, accuracy: 0.9514285714285714
Train epoch num 6


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 6, loss: 0.09006048988823484, acc: 0.962336664104535


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.31393681170249527, accuracy: 0.9514285714285714
Train epoch num 7


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 7, loss: 0.08478066528488587, acc: 0.9642582628747117


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.3097110785205256, accuracy: 0.9542857142857143
Train epoch num 8


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 8, loss: 0.07666463836482386, acc: 0.9727132974634897


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.31109940277581866, accuracy: 0.9485714285714286
Train epoch num 9


  0%|          | 0/82 [00:00<?, ?it/s]

Epoch - 9, loss: 0.07627733110836367, acc: 0.9723289777094543


  0%|          | 0/44 [00:00<?, ?it/s]

Evaluation loss: 0.30984764520756225, accuracy: 0.9485714285714286


## Get metrics

In [None]:
from tqdm import tqdm

# Score the distilled model on the test set one sample at a time, collecting the
# arg-max class and the reference label for the F1 computation below.
predicted_labels = []
true_labels = []
model.eval()  # make sure the model is in evaluation mode for scoring
with torch.no_grad():  # inference only: do not build an autograd graph per sample
  for sample in tqdm(test_dataset_distill):
    true_label = sample['labels']
    # unsqueeze(0) adds the batch dimension the model expects.
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = model(input_ids=input_ids, attention_mask=attention_mask,
                   token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 350/350 [00:04<00:00, 75.45it/s]


In [None]:
# GPU inference time
%%timeit
for sample in test_dataset_distill:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits

  # This is added back by InteractiveShellApp.init_path()


1.91 s ± 283 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
model.to('cpu')

In [None]:
# CPU inference time
%%timeit
c = 0
for sample in test_dataset_distill:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
  attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
  token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c += 1
  if c == 100:
    break

  # This is added back by InteractiveShellApp.init_path()


11.2 s ± 1.07 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
def print_size_of_model(model):
    """Print and return the on-disk size of ``model``'s state dict in megabytes.

    The state dict is serialised to a temporary file ``temp.p`` in the working
    directory, measured, and the file is removed again even if saving fails.

    Args:
        model: any object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).

    Returns:
        float: size of the serialised state dict in MB (bytes / 1e6), so callers can
        format or compare it instead of receiving ``None``.
    """
    tmp_path = "temp.p"
    try:
        torch.save(model.state_dict(), tmp_path)
        size_mb = os.path.getsize(tmp_path)/1e6
    finally:
        # Never leave the scratch file behind, even when serialisation raises.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print('Size (MB):', size_mb)
    return size_mb


In [None]:
# Summary for the distilled model. The latencies are seconds per sample, computed from the
# %%timeit means above rounded up by hand (1.91 s -> 2 for the full test set on GPU,
# 11.2 s -> 12 for 100 samples on CPU).
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on GPU: {2/len(test_dataset_distill)}')
print(f'Inference time on CPU: {12/100}')
# print_size_of_model prints the size itself; interpolating its result into an f-string
# added a misleading "Size of model in MB: None" line.
print_size_of_model(model);

F1_score: 0.9520958083832335
Inference time on GPU: 0.005714285714285714
Inference time on CPU: 0.12
Size (MB): 10.299799
Size of model in MB: None


## 4.2 Quantize the Distilled Model

In [None]:
from torch.quantization.qconfig import float_qparams_weight_only_qconfig, default_dynamic_qconfig

# Per-module-type quantization settings: embeddings use the float-qparams weight-only
# config, linear layers the default dynamic config.
qconfig_dict = {}
qconfig_dict[torch.nn.Embedding] = float_qparams_weight_only_qconfig
qconfig_dict[torch.nn.Linear] = default_dynamic_qconfig

quantized_distill_model = torch.quantization.quantize_dynamic(model, qconfig_dict)

In [None]:
from tqdm import tqdm

# Score the dynamically quantized model on the test set (CPU inputs), one sample at a time.
predicted_labels = []
true_labels = []
quantized_distill_model.eval()  # make sure the model is in evaluation mode for scoring
with torch.no_grad():  # inference only: do not build an autograd graph per sample
  for sample in tqdm(test_dataset_distill):
    true_label = sample['labels']
    # Inputs are moved to CPU because the quantized model is evaluated there.
    input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
    attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
    token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
    logits = quantized_distill_model(input_ids=input_ids, attention_mask=attention_mask,
                   token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 350/350 [00:41<00:00,  8.39it/s]


In [None]:
# CPU inference time
%%timeit
c = 0
for sample in test_dataset_distill:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
  attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
  token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
  logits = quantized_distill_model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c += 1
  if c == 100:
    break

  # This is added back by InteractiveShellApp.init_path()


10.2 s ± 418 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Summary for the quantized distilled model. The CPU latency is seconds per sample, from
# the %%timeit mean above (10.2 s for 100 samples) entered by hand as 10.5.
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on CPU: {10.5/100}')
# print_size_of_model prints the size itself; interpolating its result into an f-string
# added a misleading "Size of model in MB: None" line.
print_size_of_model(quantized_distill_model);

F1_score: 0.9520958083832335
Inference time on CPU: 0.105
Size (MB): 2.687745
Size of model in MB: None


## 4.3 Matrix Decomposition of the Distilled Model

In [None]:
def decompose_layer(layer, rank):
    """Build a ``TuckerLinear`` replacement for ``layer``.

    The layer's weight tensor is copied to CPU and passed to ``TuckerLinear`` together
    with the original bias and the requested ``rank``; the new module is returned
    and the input layer itself is left untouched.
    """
    return TuckerLinear(layer.weight.data.cpu(), layer.bias, rank = rank)

# Rank passed to TuckerLinear for every decomposed weight matrix.
rank = [15, 15]
# Replace the six dense projections of every encoder block with their decomposed
# counterparts. Iterating over the layer list (instead of a hard-coded range(5))
# keeps this in sync with the model's actual depth.
for encoder_block in model.bert.encoder.layer:
    self_attention = encoder_block.attention.self
    self_attention.query = decompose_layer(self_attention.query, rank)
    self_attention.key = decompose_layer(self_attention.key, rank)
    self_attention.value = decompose_layer(self_attention.value, rank)
    encoder_block.attention.output.dense = decompose_layer(encoder_block.attention.output.dense, rank)
    encoder_block.intermediate.dense = decompose_layer(encoder_block.intermediate.dense, rank)
    encoder_block.output.dense = decompose_layer(encoder_block.output.dense, rank)



In [None]:
model.to('cuda')

In [None]:
from tqdm import tqdm

# Score the freshly decomposed (not yet fine-tuned) model on the test set.
predicted_labels = []
true_labels = []
# The decomposed layers are newly created modules, so put the whole model in
# evaluation mode explicitly before scoring.
model.eval()
with torch.no_grad():  # inference only: do not build an autograd graph per sample
  for sample in tqdm(test_dataset_distill):
    true_label = sample['labels']
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = model(input_ids=input_ids, attention_mask=attention_mask,
                   token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 350/350 [00:16<00:00, 21.61it/s]


### Finetune Decomposed model

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from itertools import chain

EPOCHS = 3

# A fresh optimizer is built here so that it holds the parameters of the decomposed
# layers; the adapter is optimised together with the model.
trainable_params = [
    p
    for p in chain(model.parameters(), adapter_emb.parameters())
    if p.requires_grad
]
optimizer = AdamW(
    params=trainable_params,
    lr=1e-4,
    weight_decay=0.001,
    correct_bias=True,
)

# One scheduler step per batch: 100 warm-up steps out of len(train_dataloader) * EPOCHS.
total_training_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_training_steps,
)

In [None]:
ce_loss = torch.nn.CrossEntropyLoss().to(device)
mse_loss = torch.nn.MSELoss().to(device)
def loss_fn(teacher_emb, model_emb, true_label, predicted_labels, a):
  """Blend the classification loss with the teacher-embedding regression loss.

  Result: ``a * CE(predicted_labels, true_label) + (1 - a) * MSE(teacher_emb, model_emb)``.
  ``predicted_labels`` goes directly into CrossEntropyLoss; the fine-tuning loop
  passes the model's logits.
  """
  supervised_part = ce_loss(predicted_labels, true_label)
  matching_part = mse_loss(teacher_emb, model_emb)
  return a*supervised_part + (1-a)*matching_part

# Fine-tune the decomposed model with the same distillation objective.
# alpha weights the cross-entropy term in loss_fn; the MSE (embedding) term gets 1 - alpha.
alpha = 0.75
# The teacher is only used as a fixed reference: eval mode here, no_grad around its forward pass.
teacher.eval()
for ep in range(EPOCHS):
  # ---- training pass ----
  model.train()
  losses = []
  correct_predictions = 0
  optimizer.zero_grad()
  print(f'Train epoch num {ep}')
  for input_distill, input_teacher, label in tqdm(train_dataloader):
    with torch.no_grad():
      teacher_out = teacher(**input_teacher, output_hidden_states=True)
      # Pooler applied to the teacher's last hidden state: the target embedding.
      teacher_emb = teacher.bert.pooler(teacher_out.hidden_states[-1])

    distill_out = model(**input_distill, output_hidden_states=True)
    distill_emb = model.bert.pooler(distill_out.hidden_states[-1])
    # Project the model's embedding through the adapter before comparing it with the teacher's.
    distill_emb_adapter = adapter_emb(distill_emb)

    distill_logits = distill_out.logits
    predicted_labels = torch.argmax(distill_logits, dim=1)
    correct_predictions += torch.sum(predicted_labels == label)
    
    loss = loss_fn(teacher_emb, distill_emb_adapter, label, distill_logits, alpha)
    losses.append(loss.item())

    # One optimizer step and one scheduler step per batch.
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  train_acc = correct_predictions.double() / len(train_dataset_distill)
  train_loss = np.mean(losses)
  print(f'Epoch - {ep}, loss: {train_loss}, acc: {train_acc}')
  # ---- validation pass: same forward computation, no gradients, no parameter updates ----
  model.eval()
  losses = []
  correct_predictions = 0
  for input_distill, input_teacher, label in tqdm(valid_dataloader):
    
    with torch.no_grad():
      teacher_out = teacher(**input_teacher, output_hidden_states=True)
      teacher_emb = teacher.bert.pooler(teacher_out.hidden_states[-1])

      distill_out = model(**input_distill, output_hidden_states=True)
      distill_emb = model.bert.pooler(distill_out.hidden_states[-1])
      distill_emb_adapter = adapter_emb(distill_emb)

      distill_logits = distill_out.logits
      predicted_labels = torch.argmax(distill_logits, dim=1)
      correct_predictions += torch.sum(predicted_labels == label)
      
      loss = loss_fn(teacher_emb, distill_emb_adapter, label, distill_logits, alpha)
      losses.append(loss.item())
  valid_acc = correct_predictions.double() / len(valid_dataset_distill)
  print(f'Evaluation loss: {np.mean(losses)}, accuracy: {valid_acc}')

Train epoch num 0


  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 82/82 [01:46<00:00,  1.30s/it]


Epoch - 0, loss: 0.5776614597657832, acc: 0.6602613374327441


100%|██████████| 44/44 [00:12<00:00,  3.54it/s]


Evaluation loss: 0.5059908127242868, accuracy: 0.8514285714285714
Train epoch num 1


100%|██████████| 82/82 [01:46<00:00,  1.29s/it]


Epoch - 1, loss: 0.23098172015714935, acc: 0.9027671022290545


100%|██████████| 44/44 [00:12<00:00,  3.54it/s]


Evaluation loss: 0.2969794925302267, accuracy: 0.9514285714285714
Train epoch num 2


100%|██████████| 82/82 [01:46<00:00,  1.29s/it]


Epoch - 2, loss: 0.09744622878639436, acc: 0.958109146810146


100%|██████████| 44/44 [00:12<00:00,  3.55it/s]

Evaluation loss: 0.30137287300418725, accuracy: 0.9514285714285714





In [None]:
from tqdm import tqdm

# Score the fine-tuned decomposed model on the test set, one sample at a time.
predicted_labels = []
true_labels = []
model.eval()  # make sure the model is in evaluation mode for scoring
with torch.no_grad():  # inference only: do not build an autograd graph per sample
  for sample in tqdm(test_dataset_distill):
    true_label = sample['labels']
    # unsqueeze(0) adds the batch dimension the model expects.
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = model(input_ids=input_ids, attention_mask=attention_mask,
                   token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 350/350 [00:07<00:00, 43.82it/s]


In [None]:
# GPU inference time
%%timeit
for sample in test_dataset_distill:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits

  # This is added back by InteractiveShellApp.init_path()


6.67 s ± 1.44 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
model.to('cpu');

In [None]:
# CPU inference time
%%timeit
c = 0
for sample in test_dataset_distill:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
  attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
  token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c += 1
  if c == 100:
    break

  # This is added back by InteractiveShellApp.init_path()


10.7 s ± 1.17 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Summary for the fine-tuned decomposed model. The latencies are seconds per sample,
# computed from the %%timeit means above rounded up by hand (6.67 s -> 7 for the full
# test set on GPU, 10.7 s -> 11 for 100 samples on CPU).
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on GPU: {7/len(test_dataset_distill)}')
print(f'Inference time on CPU: {11/100}')
# print_size_of_model prints the size itself; interpolating its result into an f-string
# added a misleading "Size of model in MB: None" line.
print_size_of_model(model);

F1_score: 0.9611940298507463
Inference time on GPU: 0.02
Inference time on CPU: 0.11
Size (MB): 3.431645
Size of model in MB: None


## Quantizing the embedding layer of the distilled model with decomposed layers

In [None]:
# Same per-module-type quantization settings as for the plain distilled model:
# embeddings use the float-qparams weight-only config, linear layers the default dynamic config.
qconfig_dict = {}
qconfig_dict[torch.nn.Embedding] = float_qparams_weight_only_qconfig
qconfig_dict[torch.nn.Linear] = default_dynamic_qconfig

quantized_distill_model = torch.quantization.quantize_dynamic(model, qconfig_dict)

In [None]:
from tqdm import tqdm

# Score the quantized, decomposed model on the test set (CPU inputs), one sample at a time.
predicted_labels = []
true_labels = []
quantized_distill_model.eval()  # make sure the model is in evaluation mode for scoring
with torch.no_grad():  # inference only: do not build an autograd graph per sample
  for sample in tqdm(test_dataset_distill):
    true_label = sample['labels']
    # Inputs are moved to CPU because the quantized model is evaluated there.
    input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
    attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
    token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
    logits = quantized_distill_model(input_ids=input_ids, attention_mask=attention_mask,
                   token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 350/350 [00:42<00:00,  8.27it/s]


In [None]:
# CPU inference time
%%timeit
c = 0
for sample in test_dataset_distill:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
  attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
  token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
  logits = quantized_distill_model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c += 1
  if c == 100:
    break

  # This is added back by InteractiveShellApp.init_path()


9.9 s ± 200 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Summary for the quantized, decomposed model. The CPU latency is seconds per sample, from
# the %%timeit mean above (9.9 s for 100 samples) rounded by hand to 10.
print(f'F1_score: {f1_score(true_labels, predicted_labels)}')
print(f'Inference time on CPU: {10/100}')
# print_size_of_model prints the size itself; interpolating its result into an f-string
# added a misleading "Size of model in MB: None" line.
print_size_of_model(quantized_distill_model);

F1_score: 0.9580838323353293
Inference time on CPU: 0.1
Size (MB): 1.694217
Size of model in MB: None
