### [Simple accelerate example](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb)

In [1]:
# import os
# from accelerate.utils import write_basic_config

# write_basic_config()
# os._exit(00)

In [2]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

from tqdm.auto import tqdm
import datasets
import transformers

In [3]:
raw_datasets = load_dataset("glue", "mrpc")

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [5]:
raw_datasets["train"][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [6]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=10):
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset) - 1)
        while pick in picks:
            pick = random.randint(0, len(dataset) - 1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])

    display(HTML(df.to_html()))


show_random_elements(raw_datasets["train"])

Unnamed: 0,sentence1,sentence2,label,idx
0,IG Farben 's 500 properties were valued at around 38-million euros ( about R300-million ) and the firm had debts totalling about 28-million .,IG Farben 's 500 properties were valued at around 38 million euros ( $ 43.63 million ) and the firm had debts totalling some 28 million euros .,equivalent,219
1,""" I can say I am being forced into exile by the world superpower , "" he said .",""" I am being forced into exile by the world 's superpower , "" Taylor said in an address videotaped at his home .",equivalent,1485
2,"On Wednesday , a man was shot near Kut as he tried to run over two marines at a checkpoint .","On Wednesday , the Marines shot an attacker near the town of Al-Kut as he tried to run over two Marines at a checkpoint .",equivalent,3931
3,"The US version will cost $ 99 for an individual licence , the same as the existing version .","Contribute 2 costs 69 for an individual license , the same as the existing version .",equivalent,1849
4,Knox County Health Department is following national Centers for Disease Control and Prevention Protocol to contain infection .,The health department spokesperson added the department is following Centers for Disease Control protocol .,equivalent,4069
5,"The second rover is scheduled for launch later this month , and both vehicles are expected to arrive at Mars in January .",The second rover is scheduled for launch on June 25 and both will arrive at Mars in January .,equivalent,2891
6,"Baer said he had concluded that lawyers for the two victims "" have shown , albeit barely ... that Iraq provided material support to bin Laden and al-Qaeda "" .","Judge Harold Baer concluded Wednesday that lawyers for the two victims "" have shown , albeit barely ... that Iraq provided material support to bin Laden and al-Qaida . """,equivalent,862
7,Total sales for the period declined 8.0 percent to $ 1.99 billion from a year earlier .,Wal-Mart said sales at stores open at least a year rose 4.6 percent from a year earlier .,not_equivalent,166
8,"That was a reversal from a loss of $ 35 million , or 20 cents , a year earlier .","Net income was 12 cents a share , compared with a net loss of $ 35 million , or 20 cents , a year earlier .",not_equivalent,1164
9,A discouraging outlook from General Electric Co. sent its share down 81 cents ( U.S. ) or 2.7 per cent to $ 29.32 .,A discouraging outlook from GE sent the company 's shares down 81 cents ( U.S. ) or 2.7 per cent to $ 29.32 .,equivalent,1568


In [7]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
tokenizer("Hello, this is one sentence!", "And this sentence goes with it")

{'input_ids': [101, 8667, 117, 1142, 1110, 1141, 5650, 106, 102, 1262, 1142, 5650, 2947, 1114, 1122, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
def tokenize_function(examples):
    outputs = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return outputs

In [10]:
tokenized_datasets = raw_datasets.map(
    tokenize_function, batched=True, remove_columns=["idx", "sentence1", "sentence2"]
)

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [12]:
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [13]:
tokenized_datasets["train"].features

{'labels': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [14]:
tokenized_datasets.set_format("torch")

In [15]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def create_dataloaders(train_batch_size=8, eval_batch_size=32):
    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=train_batch_size)  # type: ignore
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], shuffle=False, batch_size=eval_batch_size  # type: ignore
    )
    return train_dataloader, eval_dataloader

In [17]:
train_dataloader, eval_dataloader = create_dataloaders()

In [18]:
for batch in train_dataloader:
    print({k: v.shape for k, v in batch.items()})
    print(batch)
    outputs = model(**batch)
    break

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 128]), 'token_type_ids': torch.Size([8, 128]), 'attention_mask': torch.Size([8, 128])}
{'labels': tensor([1, 1, 1, 1, 1, 0, 1, 1]), 'input_ids': tensor([[  101,  1220,  1127,  ...,     0,     0,     0],
        [  101,  1109,  1160,  ...,     0,     0,     0],
        [  101,   138,  9658,  ...,     0,     0,     0],
        ...,
        [  101, 25793,  1104,  ...,     0,     0,     0],
        [  101,  1124,  1163,  ...,     0,     0,     0],
        [  101,   138,  1646,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]

In [19]:
outputs

SequenceClassifierOutput(loss=tensor(0.5699, grad_fn=<NllLossBackward0>), logits=tensor([[-0.6950, -0.3316],
        [-0.6873, -0.3250],
        [-0.7036, -0.3262],
        [-0.7093, -0.3313],
        [-0.6869, -0.3289],
        [-0.6687, -0.3260],
        [-0.7050, -0.3309],
        [-0.7041, -0.3369]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [20]:
import numpy as np

o = outputs.logits.detach().numpy()
o.shape

np.sum(o, axis=1)

array([-1.0265548 , -1.0123265 , -1.0298319 , -1.0405526 , -1.0157611 ,
       -0.99465144, -1.0359266 , -1.0410428 ], dtype=float32)

In [21]:
metric = load_metric("glue", "mrpc")

predictions = outputs.logits.detach().argmax(dim=-1)
predictions

  metric = load_metric("glue", "mrpc")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


tensor([1, 1, 1, 1, 1, 1, 1, 1])

In [22]:
metric.compute(predictions=predictions, references=batch["labels"])

{'accuracy': 0.875, 'f1': 0.9333333333333333}

In [23]:
batch["labels"]

tensor([1, 1, 1, 1, 1, 0, 1, 1])

In [24]:
predictions

tensor([1, 1, 1, 1, 1, 1, 1, 1])

In [25]:
hyperparameters = {
    "learning_rate": 2e-5,
    "num_epochs": 3,
    "train_batch_size": 8,
    "eval_batch_size": 32,
    "seed": 42,
}

In [26]:
from accelerate import Accelerator


def training_function(model):
    # Initialize accelerator
    accelerator = Accelerator()

    # To have only one message (and not 8) per logs of Transformers or Datasets, we set the logging verbosity
    # to INFO for the main process only.
    if accelerator.is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    train_dataloader, eval_dataloader = create_dataloaders(
        train_batch_size=hyperparameters["train_batch_size"],
        eval_batch_size=hyperparameters["eval_batch_size"],
    )
    # The seed need to be set before we instantiate the model, as it will determine the random head.
    set_seed(hyperparameters["seed"])

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=hyperparameters["learning_rate"])

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    num_epochs = hyperparameters["num_epochs"]
    # Instantiate learning rate scheduler after preparing the training dataloader as the prepare method
    # may change its length.
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=len(train_dataloader) * num_epochs,
    )

    # Instantiate a progress bar to keep track of training. Note that we only enable it on the main
    # process to avoid having 8 progress bars.
    progress_bar = tqdm(
        range(num_epochs * len(train_dataloader)),
        disable=not accelerator.is_main_process,
    )
    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        all_predictions = []
        all_labels = []

        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)

            # We gather predictions and labels from the 8 TPUs to have them all.
            all_predictions.append(accelerator.gather(predictions))
            all_labels.append(accelerator.gather(batch["labels"]))

        # Concatenate all predictions and labels.
        # The last thing we need to do is to truncate the predictions and labels we concatenated
        # together as the prepared evaluation dataloader has a little bit more elements to make
        # batches of the same size on each process.
        all_predictions = torch.cat(all_predictions)[
            : len(tokenized_datasets["validation"])  # type: ignore
        ]
        all_labels = torch.cat(all_labels)[: len(tokenized_datasets["validation"])]  # type: ignore

        eval_metric = metric.compute(predictions=all_predictions, references=all_labels)

        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)

In [28]:
from accelerate import notebook_launcher

notebook_launcher(training_function, (model,), num_processes=2)

Launching training on 2 GPUs.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/690 [00:00<?, ?it/s]

epoch 0: {'accuracy': 0.8186274509803921, 'f1': 0.8754208754208754}
epoch 1: {'accuracy': 0.8382352941176471, 'f1': 0.8896321070234113}
epoch 2: {'accuracy': 0.8431372549019608, 'f1': 0.8888888888888888}
