In [2]:
import tensorflow as tf
import tensorflow.keras as keras
import time
import sys
import GPUtil
from timeit import default_timer as timer
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from keras.utils.np_utils import to_categorical
from functools import reduce
from tensorflow.keras.datasets import mnist
from sklearn.metrics import accuracy_score
from threading import Thread
from numba import cuda


input_shape = (1, 32, 32, 3)
output_size = 100
n_epochs = 100
gpu_performance_sampling_time = 1
gpu_performance_sampling_time_INFERENCE = 0.1

# READ FROM COMMAND LINE DEVICE, NETWORK SIZE AND BATCH SIZE
# CHECK GPU AVAILABILITY

if tf.config.list_physical_devices('GPU') == 0:
    print("GPU unavailable :(")
    sys.exit(0)

# # READ ARGS FROM COMMAND LINE
# # Raise error if correct arguments aren't given
# if len(sys.argv) != 4:
#     print("Matmul benchmark need 3 arguments:")
#     print("- Device")
#     print("- Shallow Layer dimension")
#     print("- Batch size")
#     sys.exit(1)


# dev = sys.argv[1]
# hidden_layer_list.append(int(sys.argv[2]))
# batch_size = int(sys.argv[3])
dev = 'gpu'
batch_size = 1000


"""
SET DEVICE
"""
if dev == 'cpu':
    d = '/cpu:0'
elif dev == 'gpu':
    if tf.config.list_physical_devices('GPU') == 0:
        print("GPU unavailable :(")
        sys.exit(0)
    d = '/device:GPU:0'

"""
GPU USAGE MONITOR
"""


class Monitor(Thread):

    def __init__(self, delay):
        super(Monitor, self).__init__()
        self.stopped = False
        self.delay = delay
        self.start()

    def run(self):
        while not self.stopped:
            GPUtil.showUtilization()
            time.sleep(self.delay)

    def stop(self):
        self.stopped = True


"""
Timing callback definition
"""


class TimeHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.batch_times = []
        self.epoch_times = []
        self.training_time = []
        self.training_time_start = time.time()

    def on_batch_begin(self, batch, logs={}):
        self.batch_time_start = time.time()

    def on_batch_end(self, batch, logs={}):
        self.batch_times.append(time.time() - self.batch_time_start)

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.epoch_times.append(time.time() - self.epoch_time_start)

    def on_train_end(self, batch, logs={}):
        self.training_time.append(time.time() - self.training_time_start)


"""
Data loading method
"""


def data_loading(output):
    (x_train, y_train), (x_test, y_test) = cifar100.load_data()
    # Data preprocessing: I have to rescale and flatten all the images
    shape = (1, 32, 32, 3)
    # One-hot encoding
    y_train = to_categorical(y_train, num_classes=output)
    y_test = to_categorical(y_test, num_classes=output)
    return (x_train, y_train), (x_test, y_test)


"""
Model definition method
"""


def model_def(input_size, output):
    model = Sequential()
    model.add(Conv2D(64, (4,4), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(1, 1)))
    model.add(Conv2D(128, (4,4), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(1, 1)))
    model.add(Conv2D(256, (4,4), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(1, 1)))
    model.add(Flatten())
    model.add(Dense(1000, activation='relu', input_shape=(input_size,)))
    model.add(Dense(output_size, activation='softmax'))
    model.add(Dense(hidden_layer[i], activation='relu'))
    loss = keras.losses.CategoricalCrossentropy()
    optim = keras.optimizers.SGD(learning_rate=lr, momentum=0.05)
    metrics = ["accuracy"]
    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model


def main():
    (X_train, Y_train), (X_test, Y_test) = data_loading(output_size)
    nn = model_def(hidden_layer_list, input_size, output_size)
    nn.summary()

    """
    Training
    """
    time_callback = TimeHistory()
    print("\nTraining...\n")
    nn = model_def(hidden_layer_list, input_size, output_size)
    monitor = Monitor(gpu_performance_sampling_time)     # GPU MONITOR
    begin = timer()  # Duration of the whole fit() method run
    nn.fit(X_train, Y_train, epochs=n_epochs, batch_size=batch_size,
           callbacks=[time_callback], validation_split=0.3, verbose=0)
    training_time = timer() - begin  # Duration of the whole fit() method run
    monitor.stop()                                       # GPU MONITOR
    training_time_sum_over_batches = sum(time_callback.batch_times)
    time_per_sample = training_time_sum_over_batches/((len(X_train)//batch_size)
                                                      * batch_size)
    sample_per_second = 1./time_per_sample

    """
    Testing In-Sample
    """
    print("\nTesting in-sample...\n")
    monitor = Monitor(gpu_performance_sampling_time)     # GPU MONITOR
    begin = timer()  # Inference time on training set
    pred = nn.predict(X_train).argmax(1)
    testing_time_insample = timer() - begin  # Inference time on training set
    monitor.stop()                                       # GPU MONITOR
    accuracy = accuracy_score(Y_train.argmax(1),
                              pred, normalize=False)/len(X_train)
    time_per_sample_test_insample = testing_time_insample/len(X_train)
    sample_per_second_test_insample = 1./time_per_sample_test_insample

    """
    Testing Out-of-Sample
    """
    print("\nTesting out-of-sample...\n")
    monitor = Monitor(gpu_performance_sampling_time)     # GPU MONITOR
    begin = timer()  # Inference time on training set
    pred = nn.predict(X_test).argmax(1)
    testing_time_outofsample = timer() - begin  # Inference time on training set
    monitor.stop()                                       # GPU MONITOR
    accuracy_score(Y_test.argmax(1),
                   pred, normalize=False)/len(X_test)
    time_per_sample_test_outofsample = testing_time_outofsample/len(X_test)
    sample_per_second_test_outofsample = 1./time_per_sample_test_outofsample

    """
    free gpu memory
    """
    device = cuda.get_current_device()
    device.reset()

    print("\nAccurcay over test set: %s\n" % accuracy)

    print("\nTraining the model took %s seconds\n" % training_time)

    print("\nTraining duration summing batch processing time: %s seconds\n"
          % training_time_sum_over_batches)

    print("\nSample processing time during training: %s sample/seconds\n"
          % sample_per_second)

    print("\nTesting the model in-sample took %s seconds\n" % testing_time_insample)

    print("\nSample processing time during inference in-sample: %s sample/seconds\n"
          % sample_per_second_test_insample)

    print("\nTesting the model out-of-sample took %s seconds\n"
          % testing_time_outofsample)

    print("\nSample processing time during inference out-of-sample: %s sample/seconds\n"
          % sample_per_second_test_outofsample)

    return 0


if __name__ == "__main__":
    with tf.device(dev):
        main()

2022-01-29 15:56:14.591297: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-29 15:56:14.593072: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-29 15:56:14.593290: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-29 15:56:14.593436: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                7850      
                                                                 
 dense_1 (Dense)             (None, 10)                110       
                                                                 
Total params: 7,960
Trainable params: 7,960
Non-trainable params: 0
_________________________________________________________________

Training...

| ID | GPU | MEM |
------------------
|  0 |  0% | 96% |


2022-01-29 15:56:16.141456: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


| ID | GPU | MEM |
------------------
|  0 | 24% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 36% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 39% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 37% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 38% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 36% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 39% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 36% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 35% | 97% |

Testing in-sample...

| ID | GPU | MEM |
------------------
|  0 | 29% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 15% | 97% |

Testing out-of-sample...

| ID | GPU | MEM |
------------------
|  0 |  5% | 97% |

Accurcay over test set: 0.9003833333333333


Training the model took 9.735613036900759 seconds


Training duration summing batch processing time: 5.542431592941284 seconds


Sample processing time during training: 10825.573395694166 sample/seconds


Testing the mod