In [18]:
!pip install jupyter-black "rich[jupyter]"
%load_ext jupyter_black
%load_ext rich

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [19]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
)
from torch.utils.data import DataLoader

In [20]:
# Checkpoint name shared by the tokenizer here and the model created later.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")


def tokenize_function(example):
    """Tokenize the sentence pair(s) held in ``example``.

    Reads the ``"sentence1"`` and ``"sentence2"`` entries of ``example`` and
    passes them, in that order, to the notebook-level ``tokenizer`` with
    ``truncation=True``. Returns whatever the tokenizer returns.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)


# Apply tokenize_function to every split, handing it batches of examples.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)
# Collator built around the same tokenizer; used as `collate_fn` by the DataLoaders.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [21]:
# Drop the raw text / index columns and rename "label" to "labels" in one chain,
# leaving only the columns that are later unpacked into the model call.
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")
# Switch the datasets' output format to "torch" (applied in place).
tokenized_datasets.set_format("torch")
# Display the resulting splits and their remaining features.
tokenized_datasets


[1;35mDatasetDict[0m[1m([0m[1m{[0m
    train: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'labels'[0m, [32m'input_ids'[0m, [32m'token_type_ids'[0m, [32m'attention_mask'[0m[1m][0m,
        num_rows: [1;36m3668[0m
    [1m}[0m[1m)[0m
    validation: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'labels'[0m, [32m'input_ids'[0m, [32m'token_type_ids'[0m, [32m'attention_mask'[0m[1m][0m,
        num_rows: [1;36m408[0m
    [1m}[0m[1m)[0m
    test: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'labels'[0m, [32m'input_ids'[0m, [32m'token_type_ids'[0m, [32m'attention_mask'[0m[1m][0m,
        num_rows: [1;36m1725[0m
    [1m}[0m[1m)[0m
[1m}[0m[1m)[0m

In [22]:
# Single batch size shared by both loaders (previously a duplicated literal).
BATCH_SIZE = 8

# Training batches are drawn in shuffled order.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)
# Validation data is iterated in dataset order: shuffling gives no benefit for
# evaluation and makes batch contents differ from run to run.
valid_dataloader = DataLoader(
    tokenized_datasets["validation"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

In [23]:
# Grab a single batch for a smoke test of the model. Unlike a `for ...: break`
# loop, this raises StopIteration if the loader is empty instead of silently
# leaving `batch` undefined (or stale from an earlier run of the notebook).
batch = next(iter(train_dataloader))

In [25]:
# Sequence-classification model from the same checkpoint, with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
# Forward pass on the single batch; its entries are unpacked as keyword arguments.
outputs = model(**batch)
# Display the model output (the printed result shows `loss` and `logits`).
outputs


[1;35mSequenceClassifierOutput[0m[1m([0m
    [33mloss[0m=[1;35mtensor[0m[1m([0m[1;36m0.6969[0m, [33mgrad_fn[0m=[1m<[0m[1;95mNllLossBackward0[0m[39m>[0m[1;39m)[0m[39m,[0m
[39m    [0m[33mlogits[0m[39m=[0m[1;35mtensor[0m[1;39m([0m[1;39m[[0m[1;39m[[0m[1;36m0.3867[0m[39m, [0m[1;36m0.2386[0m[1;39m][0m[39m,[0m
[39m        [0m[1;39m[[0m[1;36m0.3802[0m[39m, [0m[1;36m0.2468[0m[1;39m][0m[39m,[0m
[39m        [0m[1;39m[[0m[1;36m0.3638[0m[39m, [0m[1;36m0.2169[0m[1;39m][0m[39m,[0m
[39m        [0m[1;39m[[0m[1;36m0.3883[0m[39m, [0m[1;36m0.2365[0m[1;39m][0m[39m,[0m
[39m        [0m[1;39m[[0m[1;36m0.4003[0m[39m, [0m[1;36m0.2678[0m[1;39m][0m[39m,[0m
[39m        [0m[1;39m[[0m[1;36m0.3653[0m[39m, [0m[1;36m0.1906[0m[1;39m][0m[39m,[0m
[39m        [0m[1;39m[[0m[1;36m0.4040[0m[39m, [0m[1;36m0.2355[0m[1;39m][0m[39m,[0m
[39m        [0m[1;39m[[0m[1;36m0.3977[0m[39m, [0m[1;36m