In [None]:
# !pip install datasets
# !pip install transformers

In [1]:
import warnings
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

In [2]:
# Detect hardware: prefer a TPU, otherwise fall back to whatever GPUs are visible.
gpus = []  # bound up front so the name exists on the TPU path too
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None
    gpus = tf.config.list_logical_devices("GPU")

# Select appropriate distribution strategy
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Non-experimental alias; `tf.distribute.experimental.TPUStrategy` is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    gpu_names = [gpu.name for gpu in gpus]
    strategy = tf.distribute.MirroredStrategy(gpu_names)
    print('Running on multiple GPUs ', gpu_names)
else:
    # Default strategy that works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
    if len(gpus) == 1:
        print('Running on single GPU ', gpus[0].name)
    else:
        print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on single GPU  /device:GPU:0
Number of accelerators:  1


2022-06-06 16:00:18.236221: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-06 16:00:18.237291: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-06 16:00:18.238013: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-06 16:00:18.240300: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [3]:
# Global batch size = per-replica batch size times the number of replicas in the strategy.
batch_size_per_replica = 16
batch_size = strategy.num_replicas_in_sync * batch_size_per_replica
print(f'Batch size: {batch_size}')

Batch size: 16


In [4]:
# Load IMDB, discard the unlabeled split and expose the test split under the name 'valid'.
dataset = load_dataset("imdb")
del dataset['unsupervised']
dataset['valid'] = dataset.pop('test')
dataset

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [5]:
# Draw small, seeded subsets of each split so the demo trains quickly.
sample_train = dataset['train'].shuffle(seed=32).select(range(1601))
sample_valid = dataset['valid'].shuffle(seed=32).select(range(1600))

# train_test_split is used here only to obtain a DatasetDict with a 1600-row 'train' split
# (the one spare row lands in 'test', which is dropped below). The rows were already
# shuffled with a fixed seed above, so shuffle=False keeps the result identical across runs;
# without it the split reshuffles with a fresh random seed every time.
final_ds = sample_train.train_test_split(train_size=1600, shuffle=False)
final_ds['valid'] = sample_valid
final_ds.pop('test')
final_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1600
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 1600
    })
})

In [6]:
# Pretrained checkpoint name; reused later when the classification model is loaded.
checkpoint = 'distilbert-base-uncased'
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [7]:
# Maximum sequence length reported by the tokenizer; used as the truncation length in tokenize_function.
model_max_len = tokenizer.model_max_length
model_max_len

512

In [8]:
def tokenize_function(examples):
    """Tokenize a batch of reviews into fixed-width model inputs.

    Args:
        examples: A batch mapping as passed by ``Dataset.map(batched=True)``;
            only the ``'text'`` column (a list of strings) is read.

    Returns:
        The tokenizer output for the lower-cased texts, truncated and padded to
        exactly ``model_max_len`` tokens.
    """
    texts = [text.lower() for text in examples['text']]
    # padding='max_length' (rather than padding=True) gives every row the same width no
    # matter which map batch it came from. padding=True only pads to the longest row of
    # the current batch, so different map batches could end up with different widths and
    # the columns could no longer be turned into one rectangular tensor.
    return tokenizer(texts, max_length=model_max_len, padding='max_length', truncation=True)

# Tokenize every split in batches and drop the raw 'text' column afterwards.
tokenized_dataset = final_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)
tokenized_dataset

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1600
    })
    valid: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1600
    })
})

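# GPU

Custom training loop for a single GPU (or CPU): batches come from `to_tf_dataset` and the step functions are called directly. Run either this section or the TPU section below, not both — each one defines its own `loss_object`, `optimizer`, `train_step` and `valid_step`.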

In [9]:
# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

# Arguments shared by the train and validation pipelines; only `shuffle` differs.
_to_tf_kwargs = dict(
    columns=['input_ids', 'attention_mask'],
    label_cols=['label'],
    collate_fn=data_collator,
    batch_size=batch_size,
)

tf_train_ds = tokenized_dataset['train'].to_tf_dataset(shuffle=True, **_to_tf_kwargs)
tf_valid_ds = tokenized_dataset['valid'].to_tf_dataset(shuffle=False, **_to_tf_kwargs)

In [10]:
# Sanity check: pull a single batch from the training pipeline and print it
# to inspect the (features dict, labels) structure and tensor shapes.
for x in tf_train_ds.take(1):
  print(x)

({'input_ids': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[ 101, 1045, 2293, ...,    0,    0,    0],
       [ 101, 1045, 2572, ..., 2065, 2017,  102],
       [ 101, 1045, 2514, ...,    0,    0,    0],
       ...,
       [ 101, 1996, 3865, ...,    0,    0,    0],
       [ 101, 2728, 2139, ...,    0,    0,    0],
       [ 101, 4283, 2000, ...,    0,    0,    0]])>, 'attention_mask': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}, <tf.Tensor: shape=(16,), dtype=int64, numpy=array([0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0])>)


2022-06-06 16:01:25.399042: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [11]:
# Create the loss and optimizer inside the distribution strategy's scope.
with strategy.scope():
    # from_logits=True: the step functions feed the model's first output (named `logits`)
    # straight into the loss, with no sigmoid applied beforehand.
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

In [12]:
@tf.function
def train_step(x, y):
    """Run one optimisation step on a batch and return the batch loss.

    Relies on the module-level `model`, `loss_object` and `optimizer`.

    Args:
        x: Model inputs for one batch.
        y: Labels for the same batch.

    Returns:
        The loss computed by `loss_object` for this batch.
    """
    with tf.GradientTape() as grad_tape:
        # The first element of the model output is used as the logits.
        batch_loss = loss_object(y, model(x, training=True)[0])

    weights = model.trainable_weights
    grads = grad_tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    return batch_loss

@tf.function
def valid_step(x, y):
    """Compute the loss for one validation batch without updating any weights.

    Relies on the module-level `model` and `loss_object`.

    Args:
        x: Model inputs for one batch.
        y: Labels for the same batch.

    Returns:
        The loss computed by `loss_object` for this batch.
    """
    # training=False runs the model in inference mode.
    return loss_object(y, model(x, training=False)[0])

In [13]:
# Reset Keras global state, then load a fresh single-logit classifier under the strategy scope.
tf.keras.backend.clear_session()
with strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

num_epochs = 2
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs} ====================>')

    # One pass over the training batches, collecting the per-batch losses.
    train_losses = [
        train_step(x, y)
        for x, y in tqdm(tf_train_ds, total=len(tf_train_ds))
    ]
    print('Train Loss:', np.mean(train_losses))

    # One pass over the validation batches (no weight updates).
    valid_losses = [
        valid_step(x, y)
        for x, y in tqdm(tf_valid_ds, total=len(tf_valid_ds))
    ]
    print('Valid Loss:', np.mean(valid_losses))

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2022-06-06 16:02:04.925243: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint 



100%|██████████| 100/100 [00:58<00:00,  1.71it/s]


Train Loss: 0.43121597


100%|██████████| 100/100 [00:20<00:00,  4.88it/s]


Valid Loss: 0.28809527


100%|██████████| 100/100 [00:51<00:00,  1.95it/s]


Train Loss: 0.23885876


100%|██████████| 100/100 [00:20<00:00,  4.88it/s]

Valid Loss: 0.28966174





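# TPU

Alternative to the GPU section above: batches are built with `tf.data` from whole-column tensors, and every step is dispatched through `strategy.run`. The cells below redefine `loss_object`, `optimizer`, `train_step` and `valid_step`.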

In [9]:
def _build_tf_dataset(split, shuffle=False):
    """Turn one tokenized split into a batched, prefetched ``tf.data.Dataset``.

    Args:
        split: A tokenized dataset split exposing 'input_ids', 'attention_mask'
            and 'label' columns. The token columns must be rectangular (every
            row the same length) so they can be converted with ``tf.constant``.
        shuffle: When True, shuffle with a buffer covering the whole split.

    Returns:
        A dataset yielding ``({'input_ids', 'attention_mask'}, label)`` batches
        of ``batch_size`` elements; the last batch may be smaller.
    """
    features = {
        'input_ids': tf.constant(split['input_ids']),
        'attention_mask': tf.constant(split['attention_mask']),
    }
    labels = tf.constant(split['label'])
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # A buffer as large as the split gives a uniform shuffle; a smaller buffer
        # only mixes elements that are close together in the original order.
        ds = ds.shuffle(len(split))
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


train_dataset = _build_tf_dataset(tokenized_dataset['train'], shuffle=True)
valid_dataset = _build_tf_dataset(tokenized_dataset['valid'])

In [23]:
# Loss, validation metric and optimizer for the TPU path, created inside the strategy scope.
# These rebind `loss_object` and `optimizer` from the GPU section.
with strategy.scope():
    # reduction='none' keeps the loss unreduced; compute_loss and the Mean metric
    # below take care of the averaging.
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction='none')
    # Running mean of the validation loss, updated by valid_step.
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

In [24]:
with strategy.scope():
    def compute_loss(y, logits):
        """Return the training loss for a batch, scaled by the global batch size.

        Args:
            y: Labels for the batch.
            logits: Model outputs for the batch.

        Returns:
            The unreduced `loss_object` values passed through
            ``tf.nn.compute_average_loss`` with ``global_batch_size=batch_size``.
        """
        return tf.nn.compute_average_loss(
            loss_object(y, logits),
            global_batch_size=batch_size,
        )

In [25]:
def train_step(batch):
    """Run one optimisation step on a ``(features, labels)`` batch.

    Intended to be executed through ``strategy.run``; relies on the module-level
    `model`, `optimizer` and `compute_loss`. Rebinds the `train_step(x, y)`
    defined in the GPU section.

    Args:
        batch: A ``(x, y)`` pair of model inputs and labels.

    Returns:
        The loss returned by `compute_loss` for this batch.
    """
    features, labels = batch
    with tf.GradientTape() as grad_tape:
        # The first element of the model output is used as the logits.
        batch_loss = compute_loss(labels, model(features, training=True)[0])

    weights = model.trainable_weights
    grads = grad_tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    return batch_loss

def valid_step(batch):
    """Accumulate the validation loss of one ``(features, labels)`` batch.

    Intended to be executed through ``strategy.run``; relies on the module-level
    `model`, `loss_object` and the `valid_loss` metric. Returns nothing: the
    result is read later from ``valid_loss``. Rebinds the `valid_step(x, y)`
    defined in the GPU section.

    Args:
        batch: A ``(x, y)`` pair of model inputs and labels.
    """
    features, labels = batch
    # With the reduction='none' loss of this section the values stay unreduced,
    # and the Mean metric averages them.
    batch_losses = loss_object(labels, model(features, training=False)[0])
    valid_loss.update_state(batch_losses)

In [26]:
@tf.function
def dist_train_step(batch):
    """Run `train_step` on every replica and return the mean of the per-replica losses.

    Args:
        batch: A ``(x, y)`` pair forwarded unchanged to `train_step`.

    Returns:
        The per-replica losses reduced with ``ReduceOp.MEAN``.
    """
    # NOTE(review): the training loop passes plain tensors, not values from a distributed
    # dataset, so presumably every replica sees the full batch — confirm. If the input is
    # ever sharded with strategy.experimental_distribute_dataset, this reduce should likely
    # become SUM, because compute_loss already divides by the global batch size.
    replica_losses = strategy.run(train_step, args=(batch,))
    mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, replica_losses, axis=None)
    return mean_loss

@tf.function
def dist_valid_step(batch):
    """Run `valid_step` on every replica.

    Args:
        batch: A ``(x, y)`` pair forwarded unchanged to `valid_step`.

    Returns:
        Whatever ``strategy.run`` returns for `valid_step`; the loss itself is
        accumulated in the `valid_loss` metric rather than returned.
    """
    step_result = strategy.run(valid_step, args=(batch,))
    return step_result

In [27]:
# Reset Keras global state, then load a fresh single-logit classifier under the strategy scope.
tf.keras.backend.clear_session()
with strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

num_epochs = 2
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs} ====================>')

    # Labels get a trailing axis before each step so they line up with the
    # single-logit output requested via num_labels=1.
    train_losses = [
        dist_train_step((x, tf.expand_dims(y, axis=-1))).numpy()
        for x, y in tqdm(train_dataset, total=len(train_dataset))
    ]
    print('Train Loss:', np.mean(train_losses))

    for x, y in tqdm(valid_dataset, total=len(valid_dataset)):
        dist_valid_step((x, tf.expand_dims(y, axis=-1)))
    print('Valid Loss:', valid_loss.result().numpy())
    # Clear the running mean so the next epoch's validation loss starts from zero.
    valid_loss.reset_states()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i



100%|██████████| 13/13 [01:22<00:00, 10.20s/it]2022-06-06 15:58:06.685662: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 48791, Output num: 0
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1654531086.685356802","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 48791, Output num: 0","grpc_status":3}
100%|██████████| 13/13 [01:22<00:00,  6.38s/it]


Train Loss: 0.5844382


100%|██████████| 13/13 [00:14<00:00,  1.09s/it]


Valid Loss: 0.3992977


100%|██████████| 13/13 [00:10<00:00,  1.21it/s]


Train Loss: 0.29617876


100%|██████████| 13/13 [00:02<00:00,  4.51it/s]

Valid Loss: 0.31714994



