In [41]:
! pip3 install datasets transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [42]:
import torch
import torch.nn as nn
import numpy as np
from warnings import filterwarnings
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorWithPadding

filterwarnings('ignore')

In [43]:
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    assert model_name in ['bert', 'roberta', 'distilbert']
    
    checkpoint_names = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased'  # https://huggingface.co/distilbert-base-cased
    }
    
    model_classes = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel
    }
    
    return AutoTokenizer.from_pretrained(checkpoint_names[model_name]), model_classes[model_name].from_pretrained(checkpoint_names[model_name])

In [44]:
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

Found cached dataset imdb (/home/yulia/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [45]:
tokenizer, model = get_model('bert')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [46]:
#tokenizer

In [47]:
encoding = tokenizer.encode_plus('Hello!', add_special_tokens=True, return_token_type_ids=False, return_tensors='pt')


In [48]:
tokenizer.decode(encoding['input_ids'][0])

'[CLS] Hello! [SEP]'

In [49]:
def tokenization(example):
    return tokenizer.batch_encode_plus(example['text'], add_special_tokens=True, return_token_type_ids=False, truncation=True)


train_dataset = dataset.map(tokenization, batched=True)

Loading cached processed dataset at /home/yulia/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-7b100d74d4f88c2b.arrow


dict_keys(['text', 'label', 'input_ids', 'attention_mask'])

In [51]:
train_dataset.set_format(type="torch", columns=[ 'label', 'input_ids', 'attention_mask'])

In [54]:
train_dataset[0].keys()

dict_keys(['label', 'input_ids', 'attention_mask'])

In [55]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [71]:
from torch.utils.data import Subset
np.random.seed(100)
idx = np.random.randint(len(train_dataset), size=200)
loader = DataLoader(Subset(train_dataset, idx), batch_size=64, collate_fn=data_collator, pin_memory=True, shuffle=False)

In [74]:
@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['labels'].unsqueeze(1))

        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

In [75]:
get_embeddings_labels(model, loader)

  0%|                                                     | 0/4 [00:00<?, ?it/s]


TypeError: Wrong key type: '5640' of type '<class 'numpy.int64'>'. Expected one of int, slice, range, str or Iterable.