In [None]:
# NOTE(review): versions are unpinned. Later cells import `load_metric` from `datasets`
# and pass `evaluation_strategy` to TrainingArguments; presumably both only exist in
# older releases of these packages — TODO confirm and pin the versions that were used.
!pip install transformers datasets

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer


# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from google.colab import drive

# Mount at an absolute path: the model is later loaded from
# '/content/drive/MyDrive/...', so the mount point must not depend on the
# notebook's current working directory.
drive.mount('/content/drive')

Mounted at ./drive


In [None]:
# The three pre-split CSV files, in train / validation / test order.
split_paths = (
    '/content/h2i_01_asset_type_train_v1.csv',
    '/content/h2i_01_asset_type_valid_v1.csv',
    '/content/h2i_01_asset_type_test_v1.csv',
)
df_train, df_valid, df_test = (pd.read_csv(path) for path in split_paths)

In [None]:
# Number of distinct values in the training `label` column (missing values,
# if any, count as their own value, exactly like len(unique())).
num_labels = df_train['label'].nunique(dropna=False)
df_train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,2428,Как инвестировать в недвижимость подскажи,0
1,3283,можно мне вложиться в инвестиции,1
2,1074,Инвестиции в золото это будет лучше инвестиции,0
3,2490,Скажи мне вот меня интересует вопрос инвестици...,0
4,366,Во что инвестирует чаще всего?,1


In [None]:
# Single source of truth for the pretrained checkpoint name.
model_checkpoint = 'DeepPavlov/rubert-base-cased-conversational'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

# The model is only moved when a GPU was selected; on CPU it stays where it was loaded.
if device == 'cuda':
    model = model.to(device)

Downloading config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

KeyboardInterrupt: ignored

# 0. Обучим базовую модель на наших данных, глянем метрики и количество параметров в основных слоях

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with per-sample labels.

    Args:
        tokenized_texts: mapping from field name (e.g. ``input_ids``) to a
            container that can be indexed by sample position.
        labels: per-sample labels; a pandas Series, list, array or tensor.
        target_device: device every returned tensor is moved to. When omitted,
            'cuda' is used if available and 'cpu' otherwise.

    Each item is a dict holding one tensor per tokenizer field plus a
    ``labels`` tensor, all placed on ``target_device``.
    """

    def __init__(self, tokenized_texts, labels, target_device=None):
        self.tokenized_texts = tokenized_texts
        self.labels = labels
        if target_device is None:
            target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.target_device = target_device

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # as_tensor + detach().clone() copies the row like torch.tensor() did,
        # but without the "copy construct from a tensor" UserWarning that
        # torch.tensor(existing_tensor) emits.
        item = {
            key: torch.as_tensor(val[idx]).detach().clone().to(self.target_device)
            for key, val in self.tokenized_texts.items()
        }
        # A pandas Series indexes by label with []; use positional access so a
        # shuffled or filtered frame (non 0..n-1 index) still lines up with the
        # tokenized rows.
        labels = self.labels
        label = labels.iloc[idx] if hasattr(labels, 'iloc') else labels[idx]
        item['labels'] = torch.as_tensor(label).detach().clone().to(self.target_device)
        return item

def _encode_split(frame):
    """Tokenize the `text` column of one split; every row is padded/truncated to 512 tokens."""
    return tokenizer(list(frame['text'].values), padding="max_length",
                     truncation=True, return_tensors='pt', max_length=512)


train_tokenized_dataset = _encode_split(df_train)
valid_tokenized_dataset = _encode_split(df_valid)
test_tokenized_dataset = _encode_split(df_test)

# Pair each encoded split with its label column.
train_dataset = CustomDataset(train_tokenized_dataset, df_train['label'])
valid_dataset = CustomDataset(valid_tokenized_dataset, df_valid['label'])
test_dataset = CustomDataset(test_tokenized_dataset, df_test['label'])

In [None]:
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the accuracy metric's result.

    The predicted class for each row is the index of its largest logit.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

In [None]:
# Settings for the baseline fine-tuning run.
training_config = dict(
    output_dir='./results',          # where checkpoints are written
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,                # learning-rate scheduler warm-up steps
    weight_decay=0.01,
    save_total_limit=1,              # keep only the most recent checkpoint
    dataloader_pin_memory=False,     # with True the author observed an error during training
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run the training loop of the Trainer built in the previous cell (3 epochs per training_args).
trainer.train()

***** Running training *****
  Num examples = 2602
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 3903
  del sys.path[0]


Epoch,Training Loss,Validation Loss


## Num of params

In [None]:
def _count_parameters(module):
    """Total number of scalar entries across all parameters of `module`."""
    return sum(tensor.numel() for tensor in module.parameters())


total_params_embeddings = _count_parameters(model.bert.embeddings)
total_params_encoder = _count_parameters(model.bert.encoder)


In [None]:
# Report both counts, encoder first, one per line.
print(
    f'Count of encoder parameters: {total_params_encoder}',
    f'Count of embeddings parameters: {total_params_embeddings}',
    sep='\n',
)

Count of encoder parameters: 85054464
Count of embeddings parameters: 92208384


encoder + embedding ~ 98% of all params

## Get metrics of model

In [None]:
predicted_labels = []
true_labels = []

# Pure inference: switch off dropout and autograd so no graph is built
# (and kept alive) for every sample.
model.eval()
with torch.no_grad():
  for sample in test_dataset:
    true_label = sample['labels']
    # Samples already sit on `device` (the dataset moves them); add the batch axis only.
    input_ids = sample['input_ids'].unsqueeze(0)
    attention_mask = sample['attention_mask'].unsqueeze(0)
    token_type_ids = sample['token_type_ids'].unsqueeze(0)
    logits = model(input_ids=input_ids, attention_mask=attention_mask,
                   token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  del sys.path[0]


In [None]:
%%timeit
c=0
for sample in test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
  attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
  token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c+=1
  if c % 10 == 0:
    break

  del sys.path[0]


100 loops, best of 5: 3.14 ms per loop


In [None]:
# Number of predictions equal to their reference label.
true_positive = sum(pred == true_label for pred, true_label in zip(predicted_labels, true_labels))
accuracy = true_positive/len(true_labels)
print(f'Accuracy: {accuracy}')
# NOTE(review): 14.9 and 10 are hard-coded; presumably seconds for 10 samples from an
# earlier %%timeit run — they do not come from any variable in this notebook. TODO confirm.
print(f'Inference time on CPU: {14.9/10}')

Accuracy: 0.9714285714285714
Inference time on CPU: 1.49


# 1. Quantization

Загружаю обученную на наших данных модель из гугл диска. Делаю это по той причине, что модель обучалась на GPU, и после квантизации модели не получится делать инференс, поскольку квантизированные слои пока поддерживают только CPU. Перекладка модели на CPU через pytorch не помогает, поэтому приходится загружать модель снова

In [None]:
# Directory on the mounted Drive that holds the saved model and tokenizer files.
save_pretrained = '/content/drive/MyDrive/models/quantized_bert'

tokenizer = AutoTokenizer.from_pretrained(save_pretrained)
model = AutoModelForSequenceClassification.from_pretrained(save_pretrained)

In [None]:
# Dynamic quantization restricted to torch.nn.Linear modules, with qint8 as the target dtype.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

## Check the model size

In [None]:
def print_size_of_model(model):
    """Print the on-disk size, in megabytes, of the model's serialized state_dict.

    The state_dict is written to a uniquely named temporary file, so an
    existing ``temp.p`` in the working directory is never overwritten or
    deleted, and the temporary file is removed even if saving fails.

    Args:
        model: any object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import tempfile

    fd, tmp_path = tempfile.mkstemp(suffix='.p')
    os.close(fd)  # torch.save reopens the path itself
    try:
        torch.save(model.state_dict(), tmp_path)
        print('Size (MB):', os.path.getsize(tmp_path)/1e6)
    finally:
        os.remove(tmp_path)

# Serialized state_dict size: the reloaded model first, then its dynamically quantized copy.
print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 711.502125
Size (MB): 454.981957


## Check the model accuracy

In [None]:
predicted_labels = []
true_labels = []

# Pure inference: switch off dropout and autograd so no graph is built
# (and kept alive) for every sample.
quantized_model.eval()
with torch.no_grad():
  for sample in test_dataset:
    true_label = sample['labels']
    # The quantized model is fed CPU tensors, so inputs are moved there explicitly.
    input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
    attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
    token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
    logits = quantized_model(input_ids=input_ids, attention_mask=attention_mask,
                             token_type_ids=token_type_ids).logits
    predicted_labels.append(logits.argmax().item())
    true_labels.append(true_label.item())

  del sys.path[0]


In [None]:
%%timeit
c=0
for sample in test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0).to('cpu')
  attention_mask = sample['attention_mask'].unsqueeze(0).to('cpu')
  token_type_ids = sample['token_type_ids'].unsqueeze(0).to('cpu')
  logits = quantized_model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits
  c+=1
  if c % 10 == 0:
    break

  del sys.path[0]


1 loop, best of 5: 11.6 s per loop


In [None]:
# Number of predictions equal to their reference label.
true_positive = sum(pred == true_label for pred, true_label in zip(predicted_labels, true_labels))
accuracy = true_positive/len(true_labels)
print(f'Accuracy: {accuracy}')
# NOTE(review): 11.5 and 10 are hard-coded; presumably seconds for the 10 timed samples
# copied by hand from a %%timeit run — TODO confirm against the timing cell.
print(f'Inference time on CPU: {11.5/10}')

Accuracy: 0.9685714285714285
Inference time on CPU: 1.15


# 2. Matrix Decomposition

In [None]:
!pip3 install tensorly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorly
  Downloading tensorly-0.7.0-py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 31.1 MB/s 
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[K     |████████████████████████████████| 154 kB 66.0 MB/s 
Installing collected packages: nose, tensorly
Successfully installed nose-1.3.7 tensorly-0.7.0


In [None]:
import tensorly
tensorly.set_backend('pytorch')
from tensorly.decomposition import tucker, partial_tucker
from tensorly.tenalg import mode_dot

## Tucker decomposition

In [None]:
def tucker_decomposition_conv_layer(weight, bias, rank, device):
    """Factorise a kernel-size-1 weight into three stacked ``Conv1d`` layers.

    A partial Tucker decomposition over modes 0 and 1 of ``weight`` yields a
    core tensor and two factor matrices. They become, in application order:
    ``first_layer`` (input channels -> rank[1]), ``core_layer``
    (rank[1] -> rank[0]) and ``last_layer`` (rank[0] -> output channels).

    Args:
        weight: tensor laid out as (out_channels, in_channels, 1).
        bias: bias of the layer being replaced, or ``None`` if it has none.
        rank: two ranks, for modes 0 and 1 respectively.
        device: device the three new layers are moved to.

    Returns:
        ``[first_layer, core_layer, last_layer]``.
    """
    core, [last, first] = partial_tucker(
        weight.data, modes=[0, 1], rank=rank, init='svd')

    # Only the final layer carries a bias. A default Conv1d would give the first
    # two layers randomly initialised biases, which would add an arbitrary offset
    # to the factorised map.
    first_layer = torch.nn.Conv1d(in_channels=first.shape[0],
                                  out_channels=first.shape[1],
                                  kernel_size=1, bias=False)

    core_layer = torch.nn.Conv1d(in_channels=core.shape[1],
                                 out_channels=core.shape[0],
                                 kernel_size=1, bias=False)

    last_layer = torch.nn.Conv1d(in_channels=last.shape[1],
                                 out_channels=last.shape[0],
                                 kernel_size=1, bias=bias is not None)

    if bias is not None:
        last_layer.bias.data = bias.data

    # Conv1d weights are (out_channels, in_channels, kernel): the mode-1 factor is
    # transposed so it projects the input down, the mode-0 factor projects back up.
    first_layer.weight.data = torch.transpose(first, 1, 0).unsqueeze(-1)
    last_layer.weight.data = last.unsqueeze(-1)
    core_layer.weight.data = core

    new_layers = [first_layer.to(device), core_layer.to(device), last_layer.to(device)]
    return new_layers

In [None]:
class TuckerLinear(torch.nn.Module):
    """Low-rank replacement for a linear layer, built from a partial Tucker factorisation.

    Args:
        fc_w: weight of the layer being replaced, laid out (out_features, in_features).
        fc_b: bias of the layer being replaced.
        rank: two ranks forwarded to ``tucker_decomposition_conv_layer``.
    """

    def __init__(self, fc_w, fc_b, rank):
        super().__init__()

        # The weight is passed as (out_features, in_features, 1), i.e. without
        # transposing it. The helper treats axis 0 as output channels and axis 1
        # as input channels; a transposed weight would make the stack
        # approximate W^T (it would only run at all for square layers).
        # The helper already moves the three layers to `device`.
        self.first_layer, self.core_layer, self.last_layer = tucker_decomposition_conv_layer(
            fc_w.unsqueeze(2), fc_b, rank=rank, device=device)

    def forward(self, x):
        # assumes x is (batch, seq_len, in_features) — TODO confirm against callers
        # Conv1d wants channels on axis 1.
        x3 = self.first_layer(x.transpose(1, 2))
        # The layers use kernel_size=1, so swapping axes 0 and 2 around the core
        # layer only changes which axis is treated as batch vs. length.
        x3 = self.core_layer(x3.transpose(2, 0))
        x3 = self.last_layer(x3.transpose(2, 0))

        # Back to features-last layout.
        return (x3.transpose(1, 2).contiguous())

Аналог класса выше, но работает для слоёв размерности (n,m). Изначальный класс работает только для слоёв размера (n,n).

In [None]:
class TuckerLinearAnother(torch.nn.Module):
    """Three-layer Conv1d stack produced by ``tucker_decomposition_conv_layer``.

    The weight is handed to the helper as-is (only a trailing kernel axis is
    added), so the stack maps ``fc_w.shape[1]`` input features to
    ``fc_w.shape[0]`` output features.
    """

    def __init__(self, fc_w, fc_b, rank):
        super().__init__()

        # The helper returns the layers already placed on `device`.
        factor_layers = tucker_decomposition_conv_layer(
            fc_w.unsqueeze(2), fc_b, rank=rank, device=device)
        self.first_layer, self.core_layer, self.last_layer = factor_layers

    def forward(self, x):
        # Channels go on axis 1 for Conv1d; axes 0 and 2 are swapped around the core layer.
        hidden = self.first_layer(x.transpose(1, 2))
        hidden = self.core_layer(hidden.transpose(2, 0))
        hidden = self.last_layer(hidden.transpose(2, 0))
        # Restore the features-last layout.
        return hidden.transpose(1, 2).contiguous()

def another_decompose_layer(layer, rank):
    """Build a ``TuckerLinearAnother`` from `layer`'s weight (copied to CPU) and bias."""
    return TuckerLinearAnother(layer.weight.data.cpu(), layer.bias, rank=rank)

In [None]:
def get_accuracy(model, dataset=None):
  """Return the fraction of samples whose arg-max prediction equals the label.

  Args:
    model: callable module accepting ``input_ids``, ``attention_mask`` and
      ``token_type_ids`` keyword tensors and returning an object with ``.logits``.
    dataset: iterable of dicts with those three fields plus ``labels``.
      Defaults to the notebook-level ``test_dataset``.

  Returns:
    Accuracy as a float in [0, 1].

  Raises:
    ValueError: if the dataset yields no samples.
  """
  if dataset is None:
    dataset = test_dataset

  # Feed inputs on whatever device the model lives on instead of assuming the
  # dataset already put them there.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else None

  n_correct = 0
  n_total = 0
  was_training = model.training
  model.eval()  # no dropout while measuring
  try:
    with torch.no_grad():  # inference only: do not build autograd graphs
      for sample in dataset:
        inputs = {
            field: sample[field].unsqueeze(0)
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        if model_device is not None:
          inputs = {field: tensor.to(model_device) for field, tensor in inputs.items()}
        logits = model(**inputs).logits
        n_correct += int(logits.argmax().item() == sample['labels'].item())
        n_total += 1
  finally:
    # Leave the model in the mode the caller had it in.
    model.train(was_training)

  if n_total == 0:
    raise ValueError('cannot compute accuracy on an empty dataset')
  return n_correct/n_total

In [None]:
# Start again from the saved model and tokenizer on the mounted Drive.
save_pretrained = '/content/drive/MyDrive/models/quantized_bert'
model, tokenizer = (
    loader.from_pretrained(save_pretrained)
    for loader in (AutoModelForSequenceClassification, AutoTokenizer)
)

Изначальное качество модели ухудшается с увеличением количества слоёв, которые декомпозируем. Качество возвращается при повторном обучении в 1 эпоху.

In [None]:
def decompose_layer(layer, rank):
    """Build a ``TuckerLinear`` from `layer`'s weight (copied to CPU) and bias."""
    return TuckerLinear(layer.weight.data.cpu(), layer.bias, rank=rank)

rank = [20, 20]

# Replace the six dense projections in each of the first 12 encoder layers.
# Attention projections go through decompose_layer, the two feed-forward
# projections through another_decompose_layer.
for i in range(12):
    block = model.bert.encoder.layer[i]
    self_attention = block.attention.self
    for name in ('query', 'key', 'value'):
        setattr(self_attention, name, decompose_layer(getattr(self_attention, name), rank))
    block.attention.output.dense = decompose_layer(block.attention.output.dense, rank)
    block.intermediate.dense = another_decompose_layer(block.intermediate.dense, rank)
    block.output.dense = another_decompose_layer(block.output.dense, rank)

print(get_accuracy(model.to(device)))

  del sys.path[0]


0.44857142857142857


In [None]:
def print_size_of_model(model):
    """Print the on-disk size, in megabytes, of the model's serialized state_dict.

    The state_dict is written to a uniquely named temporary file, so an
    existing ``temp.p`` in the working directory is never overwritten or
    deleted, and the temporary file is removed even if saving fails.

    Args:
        model: any object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import tempfile

    fd, tmp_path = tempfile.mkstemp(suffix='.p')
    os.close(fd)  # torch.save reopens the path itself
    try:
        torch.save(model.state_dict(), tmp_path)
        print('Size (MB):', os.path.getsize(tmp_path)/1e6)
    finally:
        os.remove(tmp_path)

# Serialized state_dict size of the model after its dense layers were replaced above.
print_size_of_model(model)
# (disabled) size of the quantized model from the quantization section:
# print_size_of_model(quantized_model)

Size (MB): 385.280109


## Train the model

In [None]:
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the accuracy metric.

    Each row's prediction is the index of its largest logit.
    """
    raw_scores, gold = eval_pred
    guesses = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=guesses, references=gold)

In [None]:
# Settings for the single-epoch run on the decomposed model.
training_config = dict(
    output_dir='./results',          # where checkpoints are written
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,                # learning-rate scheduler warm-up steps
    weight_decay=0.01,
    save_total_limit=1,              # keep only the most recent checkpoint
    dataloader_pin_memory=False,     # with True the author observed an error during training
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run the training loop of the Trainer built in the previous cell (1 epoch per training_args).
trainer.train()

***** Running training *****
  Num examples = 2602
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1301
  del sys.path[0]


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3391,0.223554,0.962857


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1000] due to args.save_total_limit
  del sys.path[0]
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-500] due to args.save_total_limit
  del sys.path[0]
***** Running Evaluation *****
  Num examples = 350
  Batch size = 2


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1301, training_loss=0.40496240919319876, metrics={'train_runtime': 295.9188, 'train_samples_per_second': 8.793, 'train_steps_per_second': 4.396, 'total_flos': 32476269244416.0, 'train_loss': 0.40496240919319876, 'epoch': 1.0})

In [None]:
# Test-set accuracy of the current `model`; the bare name on the last line displays the value.
accuracy = get_accuracy(model)
accuracy

  del sys.path[0]


0.9771428571428571

In [None]:
%%timeit
for sample in test_dataset:
  true_label = sample['labels']
  input_ids = sample['input_ids'].unsqueeze(0)
  attention_mask = sample['attention_mask'].unsqueeze(0)
  token_type_ids = sample['token_type_ids'].unsqueeze(0)
  logits = model(input_ids=input_ids, attention_mask=attention_mask, 
                 token_type_ids=token_type_ids).logits

  del sys.path[0]


1 loop, best of 5: 12.7 s per loop


In [None]:
# Best-of-5 wall time in seconds reported by the %%timeit cell above for one
# full pass over test_dataset; copied by hand because %%timeit does not export it.
total_inference_seconds = 12.7

print(f'Accuracy: {accuracy}')
# Per-sample time: the timed loop visits every item of test_dataset
# (the previous `len(true)` referenced an undefined name).
print(f'Inference time on CPU: {total_inference_seconds/len(test_dataset)}')