In [23]:
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import  DataLoader
from datasets import load_from_disk
import torch

import base

In [24]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [25]:
# Call the project-local `base.reset_seed` helper with the fixed seed 42.
# NOTE(review): presumably this seeds the Python/NumPy/torch RNGs for
# reproducibility — confirm against the implementation in `base`.
base.reset_seed(42)

In [26]:
# The tokenizer and the classifier are loaded from the same checkpoint id so
# that the vocabulary matches the model weights.
checkpoint = "gchhablani/bert-base-cased-finetuned-sst2"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)
# Kept as the cell's last expression: moves the model onto `device` and lets
# the notebook echo the module structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [61]:
def prepare_dataset(dataset, max_length=64):
    """Tokenize a text dataset and expose the model inputs as torch tensors.

    Relies on the notebook-level `tokenizer` and `device` globals.

    Args:
        dataset: Dataset with a "text" column and a "label" column.
        max_length: Fixed number of tokens every example is truncated or
            padded to. Defaults to 64.

    Returns:
        A new dataset with the tokenizer's columns added, "label" renamed to
        "labels", and its output format set to torch tensors on `device`,
        restricted to the "input_ids" and "attention_mask" columns.
    """
    def _tokenize(batch):
        # Pad to a fixed length rather than `padding=True`: the latter pads
        # only up to the longest sequence in each `map` batch, so rows from
        # different map batches could end up with different lengths and the
        # DataLoader's default collation would fail to stack them.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

    dataset = dataset.map(_tokenize, batched=True)
    dataset = dataset.rename_column("label", "labels")
    # Only the two model inputs are formatted as tensors; the other columns
    # stay stored in the dataset but are not returned while this format is set.
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"], device=device)
    return dataset


In [62]:
def generate_logits(dataloader, model):
    """Run `model` over every batch and collect one logits row per example.

    The model is evaluated with gradients disabled and, if it was in training
    mode, temporarily switched to eval mode so that dropout does not perturb
    the logits. Its previous mode is restored before returning.

    Args:
        dataloader: Iterable of batches; each batch is a mapping with
            "input_ids" and "attention_mask" entries.
        model: Callable accepting `input_ids` and `attention_mask` keyword
            arguments and returning an object with a `.logits` tensor.

    Returns:
        A list with one NumPy array of logits per example, in the order the
        examples were yielded by `dataloader`. Empty if `dataloader` is empty.
    """
    # Plain callables without a `training` flag are left untouched.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()

    logits_rows = []
    try:
        with torch.no_grad():
            for batch in dataloader:
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                )
                # Iterating the (batch, num_labels) array yields one row per
                # example, so `extend` flattens across batches directly.
                logits_rows.extend(outputs.logits.cpu().numpy())
    finally:
        if was_training:
            model.train()
    return logits_rows

In [63]:
# Training split: read from disk, tokenize, and batch.
train_data = load_from_disk("./data/sst2/train")
train_dataset = prepare_dataset(train_data)
# shuffle=False keeps the batch order aligned with the dataset rows, which the
# later `add_column("logits", ...)` step depends on.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=False,
    batch_size=32,
)

In [64]:
# Evaluation split: read from disk, tokenize, and batch.
eval_data = load_from_disk("./data/sst2/eval")
eval_dataset = prepare_dataset(eval_data)
# shuffle=False keeps the batch order aligned with the dataset rows, which the
# later `add_column("logits", ...)` step depends on.
eval_dataloader = DataLoader(
    eval_dataset,
    shuffle=False,
    batch_size=32,
)


In [65]:
# Test split: read from disk, tokenize, and batch.
test_data = load_from_disk("./data/sst2/test")
test_dataset = prepare_dataset(test_data)
# shuffle=False keeps the batch order aligned with the dataset rows, which the
# later `add_column("logits", ...)` step depends on.
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=32,
)

In [66]:
# Run the model over each split in turn (train, then eval, then test) and keep
# the per-example logits for each one.
train_logits, eval_logits, test_logits = (
    generate_logits(loader, model)
    for loader in (train_dataloader, eval_dataloader, test_dataloader)
)

In [67]:
def _with_logits(split, logits):
    """Return `split` with a "logits" column added and helper columns dropped.

    Adds `logits` as a new "logits" column, removes the "label_text" and
    "token_type_ids" columns, and resets the output format on the result.
    """
    enriched = split.add_column("logits", logits)
    enriched = enriched.remove_columns(["label_text", "token_type_ids"])
    enriched.reset_format()
    return enriched


train_dataset = _with_logits(train_dataset, train_logits)
eval_dataset = _with_logits(eval_dataset, eval_logits)
test_dataset = _with_logits(test_dataset, test_logits)

In [68]:
# Sanity check: print the first training row to inspect its columns.
print(train_dataset[0])

{'text': 'a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films', 'labels': 1, 'input_ids': [101, 170, 20329, 117, 6276, 1105, 1921, 19920, 1231, 118, 18632, 1104, 5295, 1105, 1103, 8839, 1105, 4970, 5367, 2441, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'logits': [-4.8251633644104, 4.71889591217041]}


In [69]:
# Persist each split under its own "-logits" directory.
for split_dataset, output_dir in (
    (train_dataset, "./data/sst2/train-logits"),
    (eval_dataset, "./data/sst2/eval-logits"),
    (test_dataset, "./data/sst2/test-logits"),
):
    split_dataset.save_to_disk(output_dir)

Saving the dataset (1/1 shards): 100%|██████████| 6920/6920 [00:00<00:00, 265749.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 872/872 [00:00<00:00, 114746.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1821/1821 [00:00<00:00, 179363.30 examples/s]
