In [None]:
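# Uncomment on a fresh environment (e.g. Colab/Kaggle) where these packages are missing.
# `%pip` installs into the environment of the running kernel.
# %pip install -q datasets
# %pip install -q transformers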

In [1]:
import warnings

import numpy as np
import tensorflow as tf
from datasets import DatasetDict, load_dataset, concatenate_datasets
from tqdm import tqdm
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

2022-06-06 10:40:39.322244: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-06-06 10:40:39.322416: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Detect hardware.
# `gpus` is bound up front so the name exists on every path, not only when
# TPU detection fails.
gpus = []
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available";
    # fall back to whatever GPUs are visible.
    tpu = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

# Select appropriate distribution strategy
if tpu:
    # The TPU system has to be connected and initialised before the strategy is built.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy replaces the deprecated
    # tf.distribute.experimental.TPUStrategy alias.
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    strategy = tf.distribute.get_strategy()  # default strategy that works on CPU and single GPU
    print('Running on single GPU ', gpus[0].name)
else:
    strategy = tf.distribute.get_strategy()  # default strategy that works on CPU and single GPU
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

2022-06-06 10:40:48.302973: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-06-06 10:40:48.306054: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-06-06 10:40:48.306092: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-06 10:40:48.306119: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (70435bc77a3c): /proc/driver/nvidia/version does not exist
2022-06-06 10:40:48.309757: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

Running on TPU  ['10.0.0.2:8470']
Number of accelerators:  8


In [3]:
# Global batch size = examples per replica x number of replicas kept in sync
# by the distribution strategy.
batch_size_per_replica = 16
num_replicas = strategy.num_replicas_in_sync
batch_size = batch_size_per_replica * num_replicas
print(f'Batch size: {batch_size}')

Batch size: 128


In [4]:
# Load IMDB, discard the unlabeled 'unsupervised' split and expose the
# original 'test' split under the name 'valid'.
dataset = load_dataset("imdb")
del dataset['unsupervised']
dataset['valid'] = dataset.pop('test')
dataset

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [5]:
# Work on a small subset of each split. Both subsets come from a seeded
# shuffle, so the same 1600 rows are selected on every run.
sample_train = dataset['train'].shuffle(seed=32).select(range(1600))
sample_valid = dataset['valid'].shuffle(seed=32).select(range(1600))

# Build the DatasetDict directly instead of going through an unseeded
# train_test_split on 1601 rows and discarding its one-row 'test' split.
final_ds = DatasetDict({'train': sample_train, 'valid': sample_valid})
final_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1600
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 1600
    })
})

In [6]:
# Name of the pretrained checkpoint; the same value is passed to the model's
# from_pretrained call in the training cells.
checkpoint = 'distilbert-base-uncased'
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [7]:
# Maximum sequence length reported by the tokenizer; used as `max_length`
# when tokenizing the reviews.
model_max_len = tokenizer.model_max_length
model_max_len

512

In [8]:
def tokenize_function(examples):
  """Tokenize a batch of reviews into fixed-width model inputs.

  Args:
    examples: A batch as passed by `map(..., batched=True)`; only
      `examples['text']` (an iterable of strings) is read.

  Returns:
    The tokenizer output for the batch, with every sequence truncated or
    padded to exactly `model_max_len` tokens.
  """
  texts = [text.lower() for text in examples['text']]
  # padding='max_length' instead of padding=True: the latter pads only to the
  # longest sequence of the *current* map batch, so different map batches
  # could end up with different widths and the columns could no longer be
  # stacked into one dense tensor.
  return tokenizer(texts, max_length=model_max_len, padding='max_length', truncation=True)

# Tokenize every split batch-wise and drop the raw 'text' column, leaving the
# tokenizer outputs plus 'label'.
tokenized_dataset = final_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)
tokenized_dataset

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label'],
        num_rows: 1600
    })
    valid: Dataset({
        features: ['attention_mask', 'input_ids', 'label'],
        num_rows: 1600
    })
})

In [10]:
# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')


def _hf_split_to_tf_dataset(split, shuffle):
  """Convert one tokenized split into a batched tf.data.Dataset.

  Args:
    split: Key of the split in `tokenized_dataset` ('train' or 'valid').
    shuffle: Forwarded to `to_tf_dataset` as its `shuffle` argument.

  Returns:
    The dataset produced by `to_tf_dataset` for that split.
  """
  return tokenized_dataset[split].to_tf_dataset(
      columns=['input_ids', 'attention_mask'],
      label_cols=['label'],
      shuffle=shuffle,
      collate_fn=data_collator,
      batch_size=batch_size,
  )


tf_train_ds = _hf_split_to_tf_dataset('train', shuffle=True)
tf_valid_ds = _hf_split_to_tf_dataset('valid', shuffle=False)

In [11]:
# Sanity check: pull a single batch from the training pipeline and print it
# to inspect the (features, labels) structure, shapes and dtypes.
for x in tf_train_ds.take(1):
  print(x)

({'input_ids': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[ 101, 1045, 2034, ..., 2007, 7928,  102],
       [ 101, 7543, 2028, ...,    0,    0,    0],
       [ 101, 2106, 3087, ...,    0,    0,    0],
       ...,
       [ 101, 2000, 2404, ...,    0,    0,    0],
       [ 101, 1012, 1012, ...,    0,    0,    0],
       [ 101, 2023, 3185, ...,    0,    0,    0]])>, 'attention_mask': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}, <tf.Tensor: shape=(16,), dtype=int64, numpy=array([1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1])>)


2022-06-06 10:29:05.318998: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [20]:
# NOTE: this silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Create and compile the model inside the strategy scope so that it is built
# under the distribution strategy selected earlier.
with strategy.scope():
  # num_labels=1 requests a single output, which the loss below treats as a
  # logit (from_logits=True).
  model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
  model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))

num_epochs = 2
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs} ====================>')

    # One optimisation step per batch; keep each batch's loss and size.
    train_losses, train_sizes = [], []
    for x, y in tqdm(tf_train_ds, total=len(tf_train_ds)):
        train_results = model.train_on_batch(x, y,
                                             reset_metrics=True,
                                             return_dict=True)
        train_losses.append(train_results['loss'])
        train_sizes.append(int(y.shape[0]))
    # Weight each batch loss by its number of examples: batches may differ in
    # size, and a plain mean of batch means would over-weight small batches.
    print('Train Loss:', np.average(train_losses, weights=train_sizes))

    # Evaluation pass: no weight updates, same size-weighted averaging.
    valid_losses, valid_sizes = [], []
    for x, y in tqdm(tf_valid_ds, total=len(tf_valid_ds)):
        valid_results = model.test_on_batch(x, y,
                                            reset_metrics=True,
                                            return_dict=True)
        valid_losses.append(valid_results['loss'])
        valid_sizes.append(int(y.shape[0]))
    print('Valid Loss:', np.average(valid_losses, weights=valid_sizes))

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_99', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i



100%|██████████| 100/100 [00:57<00:00,  1.75it/s]


Train Loss: 0.46231755912303923


100%|██████████| 100/100 [00:18<00:00,  5.55it/s]


Valid Loss: 0.32741664819419386


100%|██████████| 100/100 [00:53<00:00,  1.88it/s]


Train Loss: 0.22769431360065936


100%|██████████| 100/100 [00:17<00:00,  5.82it/s]

Valid Loss: 0.3345218504592776





# TPU: training from in-memory `tf.data` pipelines

In [9]:
def _tensor_slices_dataset(split, shuffle):
  """Build a batched tf.data.Dataset from one tokenized split.

  The split's columns are materialised as dense tensors, which requires every
  row of 'input_ids' / 'attention_mask' to have the same length.

  Args:
    split: Key of the split in `tokenized_dataset` ('train' or 'valid').
    shuffle: If True, shuffle the examples before batching.

  Returns:
    A dataset yielding `({'input_ids', 'attention_mask'}, label)` batches of
    up to `batch_size` examples.
  """
  columns = tokenized_dataset[split]
  features = {
      'input_ids': tf.constant(columns['input_ids']),
      'attention_mask': tf.constant(columns['attention_mask']),
  }
  labels = tf.constant(columns['label'])
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  if shuffle:
    # Buffer covers the whole split so the shuffle is uniform rather than
    # limited to a sliding window of examples.
    ds = ds.shuffle(int(labels.shape[0]))
  # AUTOTUNE is the named constant for the previous literal -1.
  return ds.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


train_dataset = _tensor_slices_dataset('train', shuffle=True)
valid_dataset = _tensor_slices_dataset('valid', shuffle=False)

In [12]:
# Create and compile the model inside the strategy scope so that it is built
# under the distribution strategy selected earlier.
with strategy.scope():
  # num_labels=1 requests a single output, which the loss below treats as a
  # logit (from_logits=True).
  model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
  model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))

num_epochs = 2
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs} ====================>')

    # One optimisation step per batch; keep each batch's loss and size.
    train_losses, train_sizes = [], []
    for x, y in tqdm(train_dataset, total=len(train_dataset)):
        train_results = model.train_on_batch(x, y,
                                             reset_metrics=True,
                                             return_dict=True)
        train_losses.append(train_results['loss'])
        train_sizes.append(int(y.shape[0]))
    # The final batch is smaller whenever the split size is not a multiple of
    # batch_size, so weight each batch loss by its number of examples instead
    # of taking a plain mean of batch means.
    print('Train Loss:', np.average(train_losses, weights=train_sizes))

    # Evaluation pass: no weight updates, same size-weighted averaging.
    valid_losses, valid_sizes = [], []
    for x, y in tqdm(valid_dataset, total=len(valid_dataset)):
        valid_results = model.test_on_batch(x, y,
                                            reset_metrics=True,
                                            return_dict=True)
        valid_losses.append(valid_results['loss'])
        valid_sizes.append(int(y.shape[0]))
    print('Valid Loss:', np.average(valid_losses, weights=valid_sizes))

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_39', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i



100%|██████████| 13/13 [01:16<00:00,  5.92s/it]
2022-06-06 10:44:04.913935: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 22656, Output num: 0
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1654512244.909995306","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 22656, Output num: 0","grpc_status":3}


Train Loss: 0.6468214484361502


100%|██████████| 13/13 [00:14<00:00,  1.09s/it]


Valid Loss: 0.4889642275296725


100%|██████████| 13/13 [00:06<00:00,  2.04it/s]


Train Loss: 0.37460579780431896


100%|██████████| 13/13 [00:05<00:00,  2.25it/s]

Valid Loss: 0.2735814612645369



