In [1]:
# it seems necessary to have this environment variable set before tensorflow is imported, or else it doesn't take effect
import gc
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import tensorflow as tf
import keras

In [2]:
# Select the allocator TensorFlow uses for GPU memory: TF_GPU_ALLOCATOR=cuda_malloc_async
# (`os` is already imported in the first cell, so it is not imported again here.)
#import os

# This seems necessary, or else we get memory allocation errors.  It might be better to
# set this directly through tf instead of indirectly through an environment variable.
# NOTE(review): this is set after `import tensorflow`; presumably the variable is only
# read when the GPU devices are first initialized, so it still takes effect -- TODO confirm.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# We get a lot of info/warning messages that make it hard to see results here.  This seems
# pretty normal with current tensorflow, so we reduce the info/warning noise using the
# TF_CPP_MIN_LOG_LEVEL environment variable that is set in the first cell.
# Levels are 0 = show everything, 1 = hide INFO, 2 = also hide WARNING, 3 = also hide ERROR.
# NOTE: the direct tf.get_logger() call below doesn't seem to be working for some reason;
# the environment variable works.
#tf.get_logger().setLevel(3)


# Check GPU Status / Availability

On Linux systems you can and should use the `nvidia-smi` tool to check that the GPU is visible, is active and has drivers installed.  You can run the command from a terminal, or directly from the notebook as in the following cell.

I also find the following commands useful to monitor the gpu performance from the command line

```
# use watch so basic nvidia-smi redraws at top of screen each second
$ watch -n 1 nvidia-smi

# nvtop is basically like top for nvidia gpu
$ sudo apt install nvtop
$ nvtop

# nvitop is similar, gives about same information, but some may prefer this one
$ sudo apt install nvitop
$ nvitop
```

In [3]:
# Shell out to nvidia-smi to show the driver/CUDA versions and the GPUs visible on this machine.
!nvidia-smi

Thu Aug  1 16:38:44 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.02              Driver Version: 555.42.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1080 Ti     Off |   00000000:00:08.0 Off |                  N/A |
| 25%   25C    P8              8W /  250W |       2MiB /  11264MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce GTX 1080 Ti     Off |   00

# GPU to TF/Keras Availability

We can check that tensorflow recognizes the presence of a GPU device as follows.

In [4]:
# Ask TensorFlow which physical devices it can see, then report all of them
# and the number of GPUs among them.
physical_devices = tf.config.list_physical_devices()
gpu_devices = tf.config.list_physical_devices('GPU')
print('Available Devices : ', physical_devices)
print('Num GPUs Available: ', len(gpu_devices))

Available Devices :  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU')]
Num GPUs Available:  5


# Common Setup to Test Multi-GPU Strategy

These functions create the compiled model and the datasets that we will fit using one GPU
and then multiple GPUs under a Mirrored Strategy.  We load and reuse the same data and the
same model configuration for both model fit tests.

In [5]:
def get_compiled_model(num_hidden_layers=25, units=4096):
    """Build and compile a deep, densely-connected classifier for flattened MNIST.

    The network is a plain stack of ``num_hidden_layers`` ReLU ``Dense`` layers
    of width ``units`` on top of a 784-feature input, followed by a 10-unit
    output layer that produces logits.

    The previous copy-pasted version created 50 hidden layers, but the chain
    was restarted from ``inputs`` halfway through, so the first 25 layers were
    built without ever being part of the returned model.  The defaults here
    reproduce the model that was actually returned (25 hidden layers of 4096
    units) without creating the unused layers.

    Args:
        num_hidden_layers: Number of hidden ``Dense`` layers to stack.
        units: Width of every hidden layer.

    Returns:
        A compiled ``keras.Model`` using the Adam optimizer, sparse categorical
        cross-entropy on logits, and sparse categorical accuracy as metric.
    """
    inputs = keras.Input(shape=(784,))

    # Chain the hidden layers; each one consumes the output of the previous one.
    x = inputs
    for _ in range(num_hidden_layers):
        x = keras.layers.Dense(units, activation="relu")(x)

    # No activation on the output layer: the loss below expects raw logits.
    outputs = keras.layers.Dense(10)(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )
    return model


def get_dataset(batch_size=1024, num_val_samples=10000):
    """Load MNIST and return batched train / validation / test datasets.

    Images are flattened to 784-element float32 vectors and divided by 255;
    labels are cast to float32.  The last ``num_val_samples`` examples of the
    training split are held out for validation.

    Args:
        batch_size: Number of examples per batch in each returned dataset.
        num_val_samples: Number of trailing training examples reserved for
            validation.  Must be between 0 and the size of the training split.

    Returns:
        A tuple ``(train_dataset, val_dataset, test_dataset)`` of batched
        ``tf.data.Dataset`` objects yielding ``(features, labels)`` pairs.

    Raises:
        ValueError: If ``num_val_samples`` is negative or larger than the
            number of training examples.
    """
    # Load the MNIST data (these are Numpy arrays).
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

    if not 0 <= num_val_samples <= len(x_train):
        raise ValueError(
            "num_val_samples must be between 0 and {}, got {}".format(
                len(x_train), num_val_samples
            )
        )

    # Preprocess the data: flatten the images and rescale the pixel values.
    x_train = x_train.reshape(-1, 784).astype("float32") / 255
    x_test = x_test.reshape(-1, 784).astype("float32") / 255
    y_train = y_train.astype("float32")
    y_test = y_test.astype("float32")

    # Split on an explicit index rather than negative slices: with
    # num_val_samples == 0, `x[-0:]` would select everything and `x[:-0]`
    # nothing, which is the opposite of what is wanted.
    split = len(x_train) - num_val_samples
    x_val, y_val = x_train[split:], y_train[split:]
    x_train, y_train = x_train[:split], y_train[:split]

    return (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size),
    )




In [6]:
# Load the MNIST train / validation / test datasets once; the same datasets are
# reused for the single-GPU and the multi-GPU runs below.
train_dataset, val_dataset, test_dataset = get_dataset()

# Performance on Single GPU

We can specify which devices to mirror in the strategy when it is created.

In [7]:
# Create a MirroredStrategy.
strategy = tf.distribute.MirroredStrategy(['GPU:0'])
print("Number of devices: {}".format(strategy.num_replicas_in_sync))

# Open a strategy scope.
with strategy.scope():
    # Everything that creates variables should be under the strategy scope.
    # In general this is only model construction & `compile()`.
    model = get_compiled_model()

%timeit -r 5 model.fit(train_dataset, epochs=5, validation_data=val_dataset)

# Test the model on all available devices.
%timeit -r 10 model.evaluate(test_dataset)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localh

# Performance on Multiple GPUs

If you don't specify specific gpu resources to use in the mirrored strategy, then all discovered /
available gpus will be used.

In [8]:
# Create a MirroredStrategy.
strategy = tf.distribute.MirroredStrategy()
print("Number of devices: {}".format(strategy.num_replicas_in_sync))

# Open a strategy scope.
with strategy.scope():
    # Everything that creates variables should be under the strategy scope.
    # In general this is only model construction & `compile()`.
    model = get_compiled_model()

%timeit -r 5 model.fit(train_dataset, epochs=5, validation_data=val_dataset)

# Test the model on all available devices.
%timeit -r 10 model.evaluate(test_dataset)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4')
Number of devices: 5
Epoch 1/5
INFO:tensorflow:Error reported to Coordinator: OOM when allocating tensor with shape[4096,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Fill]
Traceback (most recent call last):
  File "/home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
    yield
  File "/home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py", line 323, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "/home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/

ResourceExhaustedError: in user code:

    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_strategy.py:628 _call_for_each_replica
        return mirrored_run.call_for_each_replica(
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py:93 call_for_each_replica
        return _call_for_each_replica(strategy, fn, args, kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py:234 _call_for_each_replica
        coord.join(threads)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/training/coordinator.py:389 join
        six.reraise(*self._exc_info_to_raise)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/six.py:719 reraise
        raise value
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/training/coordinator.py:297 stop_on_exception
        yield
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py:323 run
        self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:757 train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:498 minimize
        return self.apply_gradients(grads_and_vars, name=name)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:604 apply_gradients
        self._create_all_weights(var_list)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:783 _create_all_weights
        self._create_slots(var_list)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/adam.py:127 _create_slots
        self.add_slot(var, 'm')
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:847 add_slot
        weight = tf_variables.Variable(
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/variables.py:262 __call__
        return cls._variable_v2_call(*args, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/variables.py:244 _variable_v2_call
        return previous_getter(
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2186 create_colocated_variable
        return next_creator(**kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/shared_variable_creator.py:69 create_new_variable
        v = next_creator(**kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2083 creator_with_resource_vars
        created = self._create_variable(next_creator, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_strategy.py:486 _create_variable
        return distribute_utils.create_mirrored_variable(
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_utils.py:311 create_mirrored_variable
        value_list = real_mirrored_creator(**kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_strategy.py:481 _real_mirrored_creator
        v = next_creator(**kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py:712 variable_capturing_scope
        v = UnliftedInitializerVariable(
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/variables.py:264 __call__
        return super(VariableMetaclass, cls).__call__(*args, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py:227 __init__
        initial_value = initial_value()
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/keras/initializers/initializers_v2.py:139 __call__
        return super(Zeros, self).__call__(shape, dtype=_get_dtype(dtype), **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/init_ops_v2.py:154 __call__
        return array_ops.zeros(shape, dtype)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:2819 wrapped
        tensor = fun(*args, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:2880 zeros
        output = fill(shape, constant(zero, dtype=dtype), name=name)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:239 fill
        result = gen_array_ops.fill(dims, value, name=name)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/ops/gen_array_ops.py:3348 fill
        _ops.raise_from_not_ok_status(e, name)
    /home/dash/.conda/envs/keras-tf-gpu/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:6862 raise_from_not_ok_status
        six.raise_from(core._status_to_exception(e.code, message), None)
    <string>:3 raise_from
        

    ResourceExhaustedError: OOM when allocating tensor with shape[4096,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Fill]
