# Трансформер 2 - Улица Сезам.

> Обсудим популярные архитектуры нейронных сетей, основанные на архитектуре Трансформер.

## Hugging Face

https://huggingface.co/

## Модели

https://huggingface.co/models

In [2]:
import torch
import torch.nn as nn
import numpy as np

from warnings import filterwarnings

filterwarnings('ignore')

In [3]:
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    assert model_name in ['bert', 'roberta', 'distilbert']
    
    checkpoint_names = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased'  # https://huggingface.co/distilbert-base-cased
    }
    
    model_classes = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel
    }
    
    return AutoTokenizer.from_pretrained(checkpoint_names[model_name]), model_classes[model_name].from_pretrained(checkpoint_names[model_name])

In [4]:
tokenizer, model = get_model('bert')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [6]:
encoding = tokenizer.encode_plus('Hello!', add_special_tokens=True, return_token_type_ids=False, return_tensors='pt')

encoding

{'input_ids': tensor([[ 101, 8667,  106,  102]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [7]:
tokenizer.decode(encoding['input_ids'][0])

'[CLS] Hello! [SEP]'

In [8]:
output = model(**encoding)

output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6283,  0.2166,  0.5605,  ...,  0.0136,  0.6158, -0.1712],
         [ 0.6108, -0.2253,  0.9263,  ..., -0.3028,  0.4500, -0.0714],
         [ 0.8040,  0.1809,  0.7076,  ..., -0.0685,  0.4837, -0.0774],
         [ 1.3290,  0.2360,  0.4567,  ...,  0.1509,  0.9621, -0.4841]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.7105,  0.4876,  0.9999, -0.9947,  0.9599,  0.9521,  0.9767, -0.9946,
         -0.9815, -0.6238,  0.9776,  0.9984, -0.9989, -0.9998,  0.8559, -0.9755,
          0.9895, -0.5281, -1.0000, -0.7414, -0.7056, -0.9999,  0.2901,  0.9786,
          0.9729,  0.0734,  0.9828,  1.0000,  0.8981, -0.1109,  0.2780, -0.9920,
          0.8693, -0.9985,  0.1461,  0.2067,  0.8092, -0.2430,  0.8580, -0.9585,
         -0.8130, -0.6138,  0.7961, -0.5727,  0.9737,  0.2362, -0.1194, -0.0789,
          0.0031,  0.9997, -0.9519,  0.9899, -0.9962,  0.9931,  0.9950,  0.5050,
          0.9952,  0.1090,

In [9]:
output.last_hidden_state.shape

torch.Size([1, 4, 768])

In [10]:
output.pooler_output.shape

torch.Size([1, 768])

In [11]:
tokenizer, model = get_model('roberta')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [13]:
encoding = tokenizer.encode_plus('Hello!', add_special_tokens=True, return_token_type_ids=False, return_tensors='pt')

encoding

{'input_ids': tensor([[    0, 31414,   328,     2]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [14]:
tokenizer.decode(encoding['input_ids'][0])

'<s>Hello!</s>'

In [15]:
output = model(**encoding)

output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0632,  0.0918, -0.0026,  ..., -0.0642, -0.0555,  0.0080],
         [-0.2026, -0.0124,  0.0174,  ..., -0.0176,  0.0094,  0.0870],
         [-0.1557, -0.1340,  0.2350,  ..., -0.0798, -0.1061,  0.5419],
         [-0.0534,  0.0986, -0.0293,  ..., -0.1175, -0.0659, -0.0090]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 1.0118e-02, -2.2553e-01, -2.2348e-01, -8.2207e-02,  1.1413e-01,
          2.0735e-01,  2.7447e-01, -8.3260e-02, -6.2361e-02, -1.7518e-01,
          2.1636e-01, -2.5967e-02, -9.4576e-02,  9.4475e-02, -1.3591e-01,
          4.9016e-01,  2.1106e-01, -4.6220e-01,  5.0346e-02, -2.3451e-02,
         -2.5763e-01,  8.7108e-02,  4.7654e-01,  3.5332e-01,  1.1738e-01,
          4.8578e-02, -1.3495e-01,  5.9979e-04,  1.8758e-01,  2.3595e-01,
          2.9100e-01,  6.5356e-02,  7.7346e-02,  2.3497e-01, -2.4440e-01,
          5.0584e-02, -3.1700e-01,  2.3022e-02,  2.6311e-01, -1.9050e-01,
 

In [16]:
output.last_hidden_state.shape

torch.Size([1, 4, 768])

In [17]:
output.pooler_output.shape

torch.Size([1, 768])

In [18]:
tokenizer, model = get_model('distilbert')

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [20]:
encoding = tokenizer.encode_plus('Hello!', add_special_tokens=True, return_token_type_ids=False, return_tensors='pt')

encoding

{'input_ids': tensor([[ 101, 8667,  106,  102]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [21]:
tokenizer.decode(encoding['input_ids'][0])

'[CLS] Hello! [SEP]'

In [22]:
output = model(**encoding)

output

BaseModelOutput(last_hidden_state=tensor([[[ 0.3344,  0.2363,  0.1612,  ...,  0.1000,  0.2444, -0.0667],
         [ 0.1081,  0.3777,  0.4785,  ...,  0.3207,  0.6012,  0.1848],
         [ 0.6096,  0.5275,  0.6734,  ...,  0.3253,  0.1565,  0.1588],
         [ 0.5685,  0.6846,  0.5774,  ...,  0.8333,  0.9534, -0.0596]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [23]:
output.last_hidden_state.shape

torch.Size([1, 4, 768])

## Датасеты

https://huggingface.co/datasets

In [24]:
from datasets import load_dataset

dataset = load_dataset("imdb")

Found cached dataset imdb (C:/Users/Alex/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 57.51it/s]


In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [26]:
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [27]:
dataset['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [28]:
def tokenization(example):
    return tokenizer.batch_encode_plus(example['text'], add_special_tokens=True, return_token_type_ids=False, truncation=True)


In [29]:
from transformers import DataCollatorWithPadding


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [30]:
from torch.utils.data import DataLoader

In [31]:
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForSequenceClassification
from transformers import RobertaForSequenceClassification  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaForSequenceClassification
from transformers import DistilBertForSequenceClassification  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertForSequenceClassification


def get_model_sc(model_name):
    assert model_name in ['bert', 'roberta', 'distilbert']
    
    checkpoint_names = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased'  # https://huggingface.co/distilbert-base-cased
    }

    model_classes = {
        'bert': BertForSequenceClassification,
        'roberta': RobertaForSequenceClassification,
        'distilbert': DistilBertForSequenceClassification
    }

    return AutoTokenizer.from_pretrained(checkpoint_names[model_name]), model_classes[model_name].from_pretrained(checkpoint_names[model_name], num_labels=2)

## Обучение (не будем запускать)

## Альтернативный вариант

In [32]:
tokenizer, model = get_model('distilbert')

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# print(torch.cuda.get_device_name())

cuda:0


In [34]:
model = model.to(device)

In [35]:
def tokenization(example):
    return tokenizer.batch_encode_plus(example['text'], add_special_tokens=True, return_token_type_ids=False, truncation=True)

In [36]:
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['labels'].unsqueeze(1))

        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

In [37]:
from datasets import load_dataset
from torch.utils.data import Subset

dataset = load_dataset("imdb", split="train")
dataset = dataset.map(tokenization, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

np.random.seed(100)
idx = np.random.randint(len(dataset), size=200).tolist()

loader = DataLoader(Subset(dataset, idx), batch_size=50, collate_fn=data_collator, pin_memory=True, shuffle=False)

Found cached dataset imdb (C:/Users/Alex/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Loading cached processed dataset at C:\Users\Alex\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-a2bd8ebf4e0e8394.arrow


In [49]:
# ['bert', 'roberta', 'distilbert']

from datasets import load_dataset
from torch.utils.data import Subset

tokenizer, model = get_model('bert')
model = model.to(device)

from transformers import DataCollatorWithPadding


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

dataset = load_dataset("imdb", split="train")
dataset = dataset.map(tokenization, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

np.random.seed(100)
idx = np.random.randint(len(dataset), size=200).tolist()

loader = DataLoader(Subset(dataset, idx), batch_size=50, collate_fn=data_collator, pin_memory=True, shuffle=False)




embeddings, labels = get_embeddings_labels(model, loader)

print(embeddings.shape, labels.shape)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset imdb (C:/Users/Alex/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [39]:
torch.save(embeddings, f'bert_embeddings.pt')

In [40]:
# ['bert', 'roberta', 'distilbert']

tokenizer, model = get_model('roberta')
model = model.to(device)
from transformers import DataCollatorWithPadding


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataset = load_dataset("imdb", split="train")
dataset = dataset.map(tokenization, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

np.random.seed(100)
idx = np.random.randint(len(dataset), size=200).tolist()

loader = DataLoader(Subset(dataset, idx), batch_size=50, collate_fn=data_collator, pin_memory=True, shuffle=False)


embeddings, labels = get_embeddings_labels(model, loader)

print(embeddings.shape, labels.shape)

torch.save(embeddings, f'roberta_embeddings.pt')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset imdb (C:/Users/Alex/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Loading cached processed dataset at C:\Users\Alex\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100

torch.Size([200, 768]) torch.Size([200, 1])





In [47]:
import torch
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['labels'].unsqueeze(1))

        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

# ['bert', 'roberta', 'distilbert']
device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')
tokenizer, model = get_model('distilbert')
model = model.to(device)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loader = DataLoader(Subset(dataset, idx), batch_size=50, collate_fn=data_collator, pin_memory=True, shuffle=False)
embeddings, labels = get_embeddings_labels(model, loader)

print(embeddings.shape, labels.shape)

torch.save(embeddings, f'distilbert_embeddings.pt')


Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 600kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.0kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 483kB/s]
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|      

IndexError: index out of range in self

In [46]:
max_input_id = max([batch['input_ids'].max().item() for batch in loader])
vocab_size = len(tokenizer)

print(f"Max input id: {max_input_id}")
print(f"Vocab size: {vocab_size}")


Max input id: 49069
Vocab size: 28996
