In [1]:
from mxnet import nd
from mxnet.gluon import nn
import pandas as pd
import mxnet as mx
from mxnet import profiler
import re
import time
# Device contexts: the data and the model both live on the default GPU.
data_ctx = mx.gpu()
model_ctx = mx.gpu()
# Seed the random number generators of *all* devices. Seeding only
# `model_ctx` leaves every other generator (notably the CPU one) unseeded,
# so any random draw that does not happen on the GPU is not reproducible.
# NOTE(review): Gluon appears to draw initial parameter values on the CPU
# before copying them to the target context - confirm for the installed
# MXNet version.
mx.random.seed(42)

In [2]:
# Configure the MXNet profiler: only memory and API-call profiling are
# enabled, and aggregate statistics are switched on so that they can be
# retrieved later with profiler.dumps().
profiler.set_config(
    profile_all=False,
    profile_symbolic=False,
    profile_imperative=False,
    profile_memory=True,
    profile_api=True,
    aggregate_stats=True,
    continuous_dump=False,
    filename='neural_net_gpu_profile.json',
)

In [3]:
# Load MNIST through mx.test_utils and fix the basic hyper-parameters
mnist = mx.test_utils.get_mnist()
batch_size = 100
learning_rate = 0.1

# Place the train / test images and labels on the data context as NDArrays
train_data_array = nd.array(mnist['train_data'], ctx=data_ctx)
train_label_array = nd.array(mnist['train_label'], ctx=data_ctx)
test_data_array = nd.array(mnist['test_data'], ctx=data_ctx)
test_label_array = nd.array(mnist['test_label'], ctx=data_ctx)

# Shuffled mini-batch iterators: one over the training split, one over the
# test split (used for validation)
combined_data_iter = mx.io.NDArrayIter(
    data=train_data_array,
    label=train_label_array,
    batch_size=batch_size,
    shuffle=True,
)
val_data_iter = mx.io.NDArrayIter(
    data=test_data_array,
    label=test_label_array,
    batch_size=batch_size,
    shuffle=True,
)

In [26]:
# Fully connected classifier: 784 -> 200 (relu) -> 100 (relu) -> 10 (no activation)
layer_specs = [
    (784, 200, 'relu'),
    (200, 100, 'relu'),
    (100, 10, None),
]
net = nn.Sequential()
with net.name_scope():
    # Layers are created inside the name scope, in order, one per spec.
    for fan_in, fan_out, act in layer_specs:
        net.add(nn.Dense(units=fan_out, activation=act, use_bias=True, dtype='float32', in_units=fan_in))

In [27]:
# Give every parameter of the network a uniformly distributed starting value
# and place the parameters on the model context
net.initialize(init=mx.init.Uniform(), ctx=model_ctx)

In [28]:
# Plain SGD over all network parameters, using the learning rate chosen above
trainer = mx.gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': learning_rate},
)

In [29]:
# Loss function, accuracy metric and number of profiled training epochs
loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
metric = mx.metric.Accuracy()
num_of_epochs = 10

In [30]:
def training_function():
    """Run one full pass over the training iterator and update `net`.

    Uses the notebook-level `combined_data_iter`, `net`, `loss`, `trainer`,
    `metric` and `model_ctx`. For every mini-batch the data and labels are
    loaded onto `model_ctx`, the forward pass and loss are recorded for
    autograd, gradients are back-propagated and the trainer takes one step
    normalised by the batch size. The accuracy metric is updated per batch
    and reset at the end of the pass, so nothing is returned or reported.
    """
    combined_data_iter.reset()
    for minibatch in combined_data_iter:
        features = minibatch.data[0]
        inputs = mx.gluon.utils.split_and_load(features, ctx_list=[model_ctx], batch_axis=0)
        targets = mx.gluon.utils.split_and_load(minibatch.label[0], ctx_list=[model_ctx], batch_axis=0)
        predictions = []
        with mx.autograd.record():
            for shard, shard_target in zip(inputs, targets):
                logits = net(shard)
                # Back-propagate this shard's loss right away.
                loss(logits, shard_target).backward()
                predictions.append(logits)
        # The step size is normalised by the number of samples in the batch.
        trainer.step(features.shape[0])
        metric.update(targets, predictions)
    # Discard the accumulated accuracy; it is not read anywhere in this pass.
    metric.reset()

In [31]:
# Warm-up: run one full training pass before the profiler is switched on,
# so this first pass is not part of the measured region.
training_function()

In [32]:
# Wait for all outstanding operations (including the warm-up pass) to finish
nd.waitall()

# Switch the profiler on and record the wall-clock start time
profiler.set_state('run')
start = time.time()

In [33]:
# Profiled region: one full training pass per epoch
for epoch_idx in range(num_of_epochs):
    training_function()

In [34]:
# Let every pending operation finish first, then take the wall-clock end
# time and switch the profiler off
nd.waitall()
end = time.time()
profiler.set_state('stop')

In [35]:
# Fetch the profiler's report as a single string; the following cells split
# it into lines and whitespace-separated fields.
results = profiler.dumps()

In [36]:
# Break the profiler report into its individual lines
result = results.split('\n')

In [37]:
# Tokenise the profiler report: one list of whitespace-separated fields per
# report line. The list is built from the raw `results` string instead of
# overwriting `result` element by element, so re-running this cell is safe
# (the in-place version raised AttributeError on a second run because the
# entries had already been turned into lists).
result = [line.split() for line in results.split('\n')]

In [38]:
# Pull the headline numbers out of the tokenised profiler report: GPU and
# CPU memory use and the summed time of all timing rows.
max_gpu_use = 0
max_cpu_use = 0
total_execution_time = 0
for tokens in result:
    if tokens and tokens[0] == 'Memory:':
        # Memory rows name the device in their second field; the value kept
        # is the second-to-last field of the row.
        # NOTE(review): presumably the "Max Use" column - confirm against
        # the profiler's table header.
        device = tokens[1]
        if device == 'gpu/0':
            max_gpu_use = float(tokens[-2])
        elif device == 'cpu/0':
            max_cpu_use = float(tokens[-2])
        continue
    # Any other row is treated as a timing entry when it has at least six
    # fields and its fourth-from-last field is a decimal number; that field
    # is added to the running total.
    if len(tokens) >= 6 and re.match(r'^-?\d+(?:\.\d+)$', tokens[-4]) is not None:
        total_execution_time += float(tokens[-4])

# No timing rows contributed anything: fall back to the wall-clock interval
# measured around the training loop, converted from seconds to milliseconds.
if total_execution_time == 0:
    total_execution_time = (end - start)*1000

In [39]:
# Validation: score the trained network on the test-set iterator with a
# fresh accuracy metric
metric = mx.metric.Accuracy()
val_data_iter.reset()
for batch in val_data_iter:
    data = mx.gluon.utils.split_and_load(
        batch.data[0], ctx_list=[model_ctx], batch_axis=0
    )
    label = mx.gluon.utils.split_and_load(
        batch.label[0], ctx_list=[model_ctx], batch_axis=0
    )
    # Forward pass only; no gradients are recorded here.
    outputs = [net(x) for x in data]
    metric.update(label, outputs)
# The result is indexed with [1] below to obtain the numeric accuracy.
val_accuracy = metric.get()

In [40]:
# Final summary of the profiled run, one line per measurement
summary_lines = (
    f"Maximum GPU memory usage: {max_gpu_use} KB",
    f"Maximum CPU memory usage: {max_cpu_use} KB",
    f"Total execution time: {total_execution_time} milli seconds (ms)",
    f"Validation accuracy: {val_accuracy[1]*100} %",
)
for summary_line in summary_lines:
    print(summary_line)

Maximum GPU memory usage: 188160.0 KB
Maximum CPU memory usage: 0 KB
Total execution time: 22492.034899999995 milli seconds (ms)
Validation accuracy: 97.69 %
