In [1]:
import sys
import time
from functools import reduce
from threading import Event, Thread
from timeit import default_timer as timer

import GPUtil
import tensorflow as tf
import tensorflow.keras as keras
from keras.utils.np_utils import to_categorical
from numba import cuda
from sklearn.metrics import accuracy_score
from tensorflow.keras import Sequential
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense

In [2]:
# Query GPUtil for available GPU ids (limit=1, load and memory thresholds
# both 0.5). The bare name on the last line shows the result as cell output.
deviceIDs = GPUtil.getAvailable(
    order='first',
    limit=1,
    maxLoad=0.5,
    maxMemory=0.5,
    includeNan=False,
    excludeID=[],
    excludeUUID=[],
)
deviceIDs

[0]

In [3]:
# Fetch the GPU objects known to GPUtil, then display its availability
# result for them using 0.5 thresholds for load and memory.
GPUs = GPUtil.getGPUs()
GPUtil.getAvailability(
    GPUs,
    maxLoad=0.5,
    maxMemory=0.5,
    includeNan=False,
    excludeID=[],
    excludeUUID=[],
)

[1]

In [4]:
class Monitor(Thread):
    """Background thread that periodically prints GPU utilisation.

    The thread starts itself on construction and calls
    ``GPUtil.showUtilization()`` once per ``delay`` seconds until ``stop()``
    is called.

    Args:
        delay: seconds to wait between two calls to GPUtil.

    Attributes:
        stopped: ``True`` once a stop has been requested.
        delay: the sampling period passed to the constructor.
    """

    def __init__(self, delay):
        super(Monitor, self).__init__()
        # Daemon thread: if the monitored code raises before stop() is
        # reached, the sampler must not keep the interpreter alive forever.
        self.daemon = True
        self.stopped = False
        self.delay = delay # Time between calls to GPUtil
        # Lets stop() interrupt the wait between samples immediately.
        self._stop_event = Event()
        self.start()

    def run(self):
        """Print the utilisation table until a stop is requested."""
        while not self.stopped:
            GPUtil.showUtilization()
            # Interruptible replacement for time.sleep(self.delay): returns
            # as soon as stop() sets the event, or after `delay` seconds.
            self._stop_event.wait(self.delay)

    def stop(self):
        """Request the sampling loop to end; does not wait for the thread."""
        self.stopped = True
        self._stop_event.set()

In [5]:
input_size = 28*28
output_size = 10
n_epochs = 100
gpu_performance_sampling_time = 1
gpu_performance_sampling_time_INFERENCE = 0.1


# READ FROM COMMAND LINE DEVICE, NETWORK SIZE AND BATCH SIZE
# (in this notebook the values are hard-coded below)

# CHECK GPU AVAILABILITY

# list_physical_devices() returns a list, so it never compares equal to 0;
# test for an empty list instead, otherwise this guard can never trigger.
if not tf.config.list_physical_devices('GPU'):
    print("GPU unavailable :(")
    sys.exit(0)


hidden_layer_list = []  # This list is read from the settings file

dev = 'gpu'
hidden_layer_list.append(1024)
batch_size = int(1000)

# SET DEVICE

if dev == 'cpu':
    d = '/cpu:0'
elif dev == 'gpu':
    if not tf.config.list_physical_devices('GPU'):
        print("GPU unavailable :(")
        sys.exit(0)
    d = '/device:GPU:0'
else:
    # Without this branch an unknown label would leave `d` undefined.
    raise ValueError("Unknown device %r: expected 'cpu' or 'gpu'" % (dev,))


# Timing callback definition
class TimeHistory(keras.callbacks.Callback):
    """Keras callback recording how long batches, epochs and the fit take.

    The result lists are (re)created in ``on_train_begin``, so they exist
    only once training has started:

    * ``batch_times``: seconds between each ``on_batch_begin`` /
      ``on_batch_end`` pair.
    * ``epoch_times``: seconds between each ``on_epoch_begin`` /
      ``on_epoch_end`` pair.
    * ``training_time``: list holding the seconds between
      ``on_train_begin`` and ``on_train_end``.

    Durations are taken with ``time.perf_counter()`` (monotonic, high
    resolution) instead of the wall clock, so the ``*_start`` attributes are
    only meaningful as differences, not as timestamps.
    """

    def on_train_begin(self, logs=None):
        self.batch_times = []
        self.epoch_times = []
        self.training_time = []
        self.training_time_start = time.perf_counter()

    def on_batch_begin(self, batch, logs=None):
        self.batch_time_start = time.perf_counter()

    def on_batch_end(self, batch, logs=None):
        self.batch_times.append(time.perf_counter() - self.batch_time_start)

    # NOTE: the first positional argument of the epoch hooks keeps its
    # original name `batch`; Keras passes the epoch index in that position.
    def on_epoch_begin(self, batch, logs=None):
        self.epoch_time_start = time.perf_counter()

    def on_epoch_end(self, batch, logs=None):
        self.epoch_times.append(time.perf_counter() - self.epoch_time_start)

    # Keras calls on_train_end(logs) with a single positional argument, which
    # lands in `batch`. Both parameters are optional so the hook works with
    # either calling convention; neither value is used.
    def on_train_end(self, batch=None, logs=None):
        self.training_time.append(time.perf_counter() - self.training_time_start)


# Data loading method
def data_loading(output):
    """Load MNIST, flatten and rescale the images, one-hot encode the labels.

    Args:
        output: number of classes handed to ``to_categorical``.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where each ``x`` has been
        reshaped to ``(-1, 28*28)`` and divided by 255, and each ``y`` is the
        ``to_categorical`` encoding of the corresponding labels.
    """
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

    # Length of one flattened 28x28 image.
    flat_len = reduce(lambda acc, side: acc * side, (28, 28))

    def _flatten_and_scale(images):
        # One row per image, values divided by 255.
        return images.reshape((-1, flat_len)) / 255.

    def _one_hot(labels):
        return to_categorical(labels, num_classes=output)

    train_set = (_flatten_and_scale(train_images), _one_hot(train_labels))
    test_set = (_flatten_and_scale(test_images), _one_hot(test_labels))
    return train_set, test_set


# Model definition method
def model_def(hidden_layer, input, output):
    """Build and compile a fully connected classifier.

    Args:
        hidden_layer: sequence with the width of each hidden ReLU layer, in
            order. May be empty, in which case the model is a single softmax
            layer.
        input: number of input features (used for the first layer's
            ``input_shape``).
        output: number of units of the final softmax layer.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy,
        SGD (learning rate 0.01, momentum 0.05) and the accuracy metric.
    """
    # Use the function's own arguments for the layer sizes rather than the
    # module-level input_size / output_size, so the parameters are honoured.
    layer_specs = [(width, 'relu') for width in hidden_layer]
    layer_specs.append((output, 'softmax'))

    model = Sequential()
    for position, (units, activation) in enumerate(layer_specs):
        if position == 0:
            # Only the first layer declares the input shape.
            model.add(Dense(units, activation=activation,
                            input_shape=(input,)))
        else:
            model.add(Dense(units, activation=activation))

    loss = keras.losses.CategoricalCrossentropy()
    optim = keras.optimizers.SGD(learning_rate=0.01, momentum=0.05)
    metrics = ["accuracy"]
    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model


def _timed_inference(model, features, labels, sampling_time):
    """Run ``model.predict`` on ``features`` while sampling GPU utilisation.

    Returns:
        ``(seconds, accuracy)``: duration of the ``predict`` call and the
        fraction of rows whose arg-max prediction matches the arg-max of
        ``labels``.
    """
    monitor = Monitor(sampling_time)  ## GPU MONITOR
    try:
        begin = timer()
        pred = model.predict(features).argmax(1)
        elapsed = timer() - begin
    finally:
        # Always stop the sampler, even if predict() raises.
        monitor.stop()                ## GPU MONITOR
    accuracy = accuracy_score(labels.argmax(1),
                              pred, normalize=False)/len(features)
    return elapsed, accuracy


def main():
    """Train the network on MNIST and report training/inference timings.

    Reads the module-level settings (``hidden_layer_list``, ``input_size``,
    ``output_size``, ``n_epochs``, ``batch_size`` and the two GPU sampling
    periods), prints GPU utilisation while fitting and predicting, resets
    the CUDA device and finally prints the measured timings and accuracies.

    Returns:
        0
    """
    validation_split = 0.3

    (X_train, Y_train), (X_test, Y_test) = data_loading(output_size)
    # Build the model once; the same instance is summarised and trained.
    nn = model_def(hidden_layer_list, input_size, output_size)
    nn.summary()

    # TRAINING

    time_callback = TimeHistory()
    print("\nTraining...\n")
    monitor = Monitor(gpu_performance_sampling_time)
    try:
        begin = timer()  # Duration of the whole fit() method run
        nn.fit(X_train, Y_train, epochs=n_epochs, batch_size=batch_size,
               callbacks=[time_callback], validation_split=validation_split,
               verbose=0)
        training_time = timer() - begin  # Duration of the whole fit() method run
    finally:
        # Always stop the sampler, even if fit() raises.
        monitor.stop()
    training_time_sum_over_batches = sum(time_callback.batch_times)
    # batch_times spans every epoch, and fit() does not train on the
    # validation_split fraction it holds out. The number of samples behind
    # the summed batch time is therefore (samples kept for training) * epochs,
    # not the size of the full training set.
    n_fit_samples = int(len(X_train) * (1. - validation_split))
    samples_processed = n_fit_samples * n_epochs
    time_per_sample = training_time_sum_over_batches/samples_processed
    sample_per_second = 1./time_per_sample

    # TESTING IN-SAMPLE

    print("\nTesting in-sample...\n")
    testing_time_insample, accuracy_insample = _timed_inference(
        nn, X_train, Y_train, gpu_performance_sampling_time_INFERENCE)
    time_per_sample_test_insample = testing_time_insample/len(X_train)
    sample_per_second_test_insample = 1./time_per_sample_test_insample

    # TESTING OUT-OF-SAMPLE

    print("\nTesting out-of-sample...\n")
    testing_time_outofsample, accuracy_outofsample = _timed_inference(
        nn, X_test, Y_test, gpu_performance_sampling_time_INFERENCE)
    time_per_sample_test_outofsample = testing_time_outofsample/len(X_test)
    sample_per_second_test_outofsample = 1./time_per_sample_test_outofsample

    # Free GPU memory by resetting the current CUDA device through numba.
    # NOTE(review): presumably TensorFlow cannot use the GPU again in this
    # process after the reset; confirm before running further GPU cells.
    device = cuda.get_current_device()
    device.reset()

    print("\nTraining the model took %s seconds\n" % training_time)

    print("Sample processing time during training: %s sample/second\n"
          % sample_per_second)

    print("Training duration summing batch processing time: %s second\n"
          % training_time_sum_over_batches)

    print("Testing the model in-sample took %s second\n" % testing_time_insample)

    print("Sample processing time during inference in-sample: %s sample/second\n"
          % sample_per_second_test_insample)

    # Previously computed and discarded; report it with the timings.
    print("Accuracy in-sample: %s\n" % accuracy_insample)

    print("Testing the model out-of-sample took %s second\n" % testing_time_outofsample)

    print("Sample processing time during inference out-of-sample: %s sample/second"
          % sample_per_second_test_outofsample)

    print("\nAccuracy out-of-sample: %s" % accuracy_outofsample)

    return 0


if __name__ == "__main__":
    # Run under the resolved TensorFlow device string `d` chosen in the
    # "SET DEVICE" section, instead of the short label `dev`; `d` was
    # otherwise computed and never used.
    with tf.device(d):
        main()

2022-01-28 19:21:33.511481: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-28 19:21:33.539362: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-28 19:21:33.539565: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-28 19:21:33.541068: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              803840    
                                                                 
 dense_1 (Dense)             (None, 10)                10250     
                                                                 
Total params: 814,090
Trainable params: 814,090
Non-trainable params: 0
_________________________________________________________________

Training...

| ID | GPU | MEM |
------------------
|  0 |  0% | 96% |


2022-01-28 19:21:35.111638: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


| ID | GPU | MEM |
------------------
|  0 | 21% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 39% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 38% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 39% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 38% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 38% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 38% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 33% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 40% | 97% |

Testing in-sample...

| ID | GPU | MEM |
------------------
|  0 | 40% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 23% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 15% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 14% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 13% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 13% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 14% | 97% |
| ID | GPU | MEM |
------------------
|  0 | 15% | 97% |
| ID | G