In [1]:
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import  DataLoader
from datasets import load_from_disk
import torch

import base

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [3]:
base.reset_seed(42)

In [51]:
tokenizer = BertTokenizer.from_pretrained("gchhablani/bert-base-cased-finetuned-sst2")
model = BertForSequenceClassification.from_pretrained("gchhablani/bert-base-cased-finetuned-sst2", num_labels=2)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [33]:
def prepare_dataset(dataset):
    dataset = dataset.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', return_tensors="pt", max_length=64), batched=True)
    dataset = dataset.rename_column("label", "labels")
    dataset.set_format(type='torch', columns=['input_ids', "attention_mask"], device=device)
    return dataset


In [39]:
def generate_logits(dataloader, model):
    """Generates logits for given input."""
    logits_arr = []
    for batch in dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"],batch["attention_mask"])
            logits = outputs.logits
        logits_arr.append(logits.cpu().numpy())

    logits_arr_flat = []
    for tensor in logits_arr:
        logits_arr_flat.extend(tensor)
    return logits_arr_flat

In [53]:
train_data = load_from_disk("./data/sst2/train")
train_dataset = prepare_dataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)

In [54]:
eval_data = load_from_disk("./data/sst2/eval")
eval_dataset = prepare_dataset(eval_data)
eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False)


In [55]:
test_data = load_from_disk("./data/sst2/test")
test_dataset = prepare_dataset(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [56]:
train_logits = generate_logits(train_dataloader, model)
eval_logits = generate_logits(eval_dataloader, model)
test_logits = generate_logits(test_dataloader, model)

In [57]:
train_dataset = train_dataset.add_column("logits", train_logits)
train_dataset = train_dataset.remove_columns(["idx", "token_type_ids"])
train_dataset.reset_format()

eval_dataset = eval_dataset.add_column("logits", eval_logits)
eval_dataset = eval_dataset.remove_columns(["idx", "token_type_ids"])
eval_dataset.reset_format()

test_dataset = test_dataset.add_column("logits", test_logits)
test_dataset = test_dataset.remove_columns(["idx", "token_type_ids"])
test_dataset.reset_format()

In [58]:
print(train_dataset[0])

{'sentence': 'as a pale successor ', 'labels': 0, 'input_ids': [101, 1112, 170, 4554, 5714, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'logits': [3.70131254196167, -3.784115791320801]}


In [59]:
train_dataset.save_to_disk("./data/sst2/train-logits")
eval_dataset.save_to_disk("./data/sst2/eval-logits")
test_dataset.save_to_disk("./data/sst2/test-logits")

Saving the dataset (0/1 shards):   0%|          | 0/60614 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/872 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6735 [00:00<?, ? examples/s]