# In this series of tasks, you will need to create embeddings for objects from a subset of the IMDB dataset using pre-trained models from Hugging Face.

In this task, create the embeddings with BERT (`bert-base-cased`), RoBERTa (`roberta-base`), or DistilBERT (`distilbert-base-cased`), using the `get_embeddings_labels` function from the seminar.

```python

from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

np.random.seed(100)
idx = np.random.randint(len(dataset), size=200)
```

Before submitting, please verify that the embeddings tensor has shape (200, 768).

In [1]:
import torch
import torch.nn as nn
import numpy as np

from warnings import filterwarnings

filterwarnings('ignore')

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """Return the tokenizer and pre-trained encoder for a supported model family.

    Args:
        model_name: One of 'bert', 'roberta' or 'distilbert'.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded with ``from_pretrained``
        from the checkpoint mapped to ``model_name``.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    assert model_name in ['bert', 'roberta', 'distilbert']

    # One entry per family: (Hugging Face checkpoint id, model class).
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = model_cls.from_pretrained(checkpoint)
    return tokenizer, encoder

In [2]:
# Choose the encoder family here: 'bert', 'roberta' or 'distilbert'.
tokenizer, model = get_model('distilbert')

# Run on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

cuda:0


In [3]:
from datasets import load_dataset

# Training split of the IMDB dataset.
dataset = load_dataset("imdb", split="train")

# Sampling code as given in the task statement at the top of the notebook: seed NumPy's
# global RNG with 100 and draw 200 row indices below len(dataset). Each index is drawn
# independently, so the same row may be selected more than once.
np.random.seed(100)
idx = np.random.randint(len(dataset), size=200)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# Show the sampled indices (the cell's last expression is rendered as its output).
idx

array([ 5640, 23320, 14147, 24423, 12119,    79, 16304, 16734, 14260,
       10082, 20533, 13890,  7906,  5646, 17186, 13297, 10992, 20760,
       11492, 17724,  7738, 11152,  9723,  5769, 15453,  6230, 18818,
        5787, 18284, 21377,  3213,  6901, 22227, 12900, 10843, 12219,
       19267, 20615, 11695, 10557, 13070, 20151, 21243, 19792, 16386,
         889,  4713,  2526, 16354, 19637, 12571, 23790, 20382,  2992,
       11567,  3949,   806, 10412,  8466, 18624, 23480,  7074, 20917,
        1738, 20753,  9741, 15390,  8209,  7221,  6212,   488, 19803,
       15182, 15360,  4109,  5305, 10602,  4227, 19014,  1667, 16079,
       24970, 13527, 20975, 19305, 13488, 11444, 14123,  1797, 17223,
        6182,  3286,  5290, 11092,  6239, 13938, 24301, 20087,  7969,
       14650, 14438,  7424, 11002,  9911,  8418,  4914, 17860,  4861,
       15280,  3568, 18197, 24985, 19894,  8454,  1701, 22374,  5748,
       11559, 20075, 18142,  5939, 21369,  2626, 20723,  4535, 22801,
        4909,  9701,

In [5]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch.utils.data import Subset

# Collator handed to the DataLoader as collate_fn; it pads the tokenized examples of a batch
# with the tokenizer's padding settings so they can be stacked into tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

def tokenization(example):
    """Tokenize a batch of texts; intended for ``dataset.map(..., batched=True)``.

    Args:
        example: Batch from the dataset; ``example['text']`` holds the texts to encode.

    Returns:
        The tokenizer output for the batch (special tokens added, truncation enabled,
        ``token_type_ids`` not returned).
    """
    # Calling the tokenizer directly replaces the deprecated ``batch_encode_plus``;
    # the arguments are unchanged.
    return tokenizer(
        example['text'],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )


# Tokenize the dataset in batches.
# NOTE(review): the whole split is tokenized here although only the rows in `idx`
# are read by the loader below.
token_dataset = dataset.map(tokenization, batched=True)

# Return only the model inputs and the label, as torch tensors.
token_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Batches of 64 over the sampled rows, kept in the order given by `idx` (no shuffling).
loader = DataLoader(
    Subset(token_dataset, idx),
    batch_size=64,
    collate_fn=data_collator,
    pin_memory=True,
    shuffle=False,
)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [6]:
from tqdm import tqdm

@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Compute one embedding per example and collect the matching labels.

    The embedding of an example is the model's ``last_hidden_state`` taken at
    sequence position 0.

    Args:
        model: Module called as ``model(attention_mask=..., input_ids=...)`` whose
            output can be indexed with ``'last_hidden_state'``. It is switched to
            eval mode.
        loader: Iterable of batches; each batch maps ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` to tensors.

    Returns:
        A tuple ``(embeddings, labels)``: the per-batch embeddings moved to the CPU
        and concatenated along dim 0, and the labels (with a dimension of size 1
        inserted at position 1) concatenated along dim 0 and cast to float32.
    """
    model.eval()

    # Send the inputs to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_embeddings = []
    labels = []

    for batch in tqdm(loader):
        labels.append(batch['labels'].unsqueeze(1))

        inputs = {key: batch[key] for key in ['attention_mask', 'input_ids']}
        if model_device is not None:
            inputs = {key: value.to(model_device) for key, value in inputs.items()}

        embeddings = model(**inputs)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

In [7]:
embeddings, labels = get_embeddings_labels(model, loader)

# The task statement asks to confirm the embeddings shape before submitting.
assert embeddings.shape == (200, 768), f"unexpected embeddings shape: {tuple(embeddings.shape)}"

100%|██████████| 4/4 [00:03<00:00,  1.32it/s]


In [8]:
# Write the embeddings tensor to disk. The file name refers to the DistilBERT model selected
# earlier in the notebook; rename it if a different model is used.
torch.save(embeddings, 'predictions_distilbert.pt')