In [1]:
import torch
import torch.nn as nn
import numpy as np
from warnings import filterwarnings
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorWithPadding, AutoTokenizer, BertModel, RobertaModel, DistilBertModel
from datasets import load_dataset
from torch.utils.data import Subset

# Игнорируем предупреждения
filterwarnings('ignore')

# Определяем доступное устройство для вычислений (GPU если доступно, иначе CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_model(model_name):
    """Функция для получения предобученной модели и токенизатора по имени"""
    
    # Поддерживаемые модели
    assert model_name in ['bert', 'roberta', 'distilbert']
    
    # Словарь с именами контрольных точек для каждой модели
    checkpoint_names = {
        'bert': 'bert-base-cased', 
        'roberta': 'roberta-base',
        'distilbert': 'distilbert-base-cased'
    }
    
    # Словарь с классами моделей
    model_classes = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel
    }
    
    # Возвращаем токенизатор и модель
    return AutoTokenizer.from_pretrained(checkpoint_names[model_name]), model_classes[model_name].from_pretrained(checkpoint_names[model_name])

def tokenization(example):
    """Функция для токенизации примеров из датасета"""
    return tokenizer.batch_encode_plus(example['text'], add_special_tokens=True, return_token_type_ids=False, truncation=True)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Получаем токенизатор и модель BERT
tokenizer, model = get_model('bert')

# Переносим модель на устройство
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Загружаем датасет IMDB
dataset = load_dataset("imdb", split="train")

# Токенизируем датасет
train_dataset = dataset.map(tokenization, batched=True)
train_dataset.set_format(type="torch", columns=[ 'label', 'input_ids', 'attention_mask'])

Found cached dataset imdb (C:/Users/Alex/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Loading cached processed dataset at C:\Users\Alex\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-09c26827f4451b24.arrow


In [4]:
# Создаем коллатор данных с автоматическим заполнением
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
# Генерируем случайный набор данных для обучения
np.random.seed(100)
idx = np.random.randint(len(train_dataset), size=200).tolist()
loader = DataLoader(Subset(train_dataset, idx), batch_size=64, collate_fn=data_collator, pin_memory=True, shuffle=False)

In [11]:
# Получаем эмбеддинги и метки

@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['labels'].unsqueeze(1))

        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

100%|██████████| 4/4 [00:01<00:00,  2.04it/s]


(tensor([[ 0.6028,  0.1125, -0.2223,  ..., -0.1698,  0.1655,  0.0645],
         [ 0.6183,  0.0239, -0.2428,  ..., -0.1532,  0.1792,  0.1111],
         [ 0.3977,  0.1045, -0.1627,  ..., -0.0737,  0.2369,  0.1243],
         ...,
         [ 0.5241,  0.1857, -0.4136,  ..., -0.2503,  0.0959,  0.2350],
         [ 0.6121,  0.0789, -0.1658,  ..., -0.1467,  0.4310,  0.1626],
         [ 0.6343,  0.0062, -0.1479,  ..., -0.2116,  0.1337,  0.1664]]),
 tensor([[0.],
         [1.],
         [1.],
         [1.],
         [0.],
         [0.],
         [1.],
         [1.],
         [1.],
         [0.],
         [1.],
         [1.],
         [0.],
         [0.],
         [1.],
         [1.],
         [0.],
         [1.],
         [0.],
         [1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [1.],
         [0.],
         [1.],
         [0.],
         [1.],
         [1.],
         [0.],
         [0.],
         [1.],
         [1.],
         [0.],
         [0.],
         [1.],
   

In [18]:
train_embeddings, train_labels = get_embeddings_labels(model, loader)
torch.save(train_embeddings, "train.pth")
train_embeddings.shape

100%|██████████| 4/4 [00:02<00:00,  1.99it/s]


torch.Size([200, 768])

In [13]:
# save embeddings
save_path = 'embeddings.pt'
torch.save(get_embeddings_labels(model, loader), save_path)

100%|██████████| 4/4 [00:01<00:00,  2.05it/s]
