# High-level Gluon Example

In [2]:
# Parameters
import os  # needed here because this cell runs before the main import cell

# Training hyper-parameters
EPOCHS = 10       # number of passes over the training set
N_CLASSES = 10    # size of the network's final Dense layer
BATCHSIZE = 64    # samples per mini-batch
LR = 0.01         # SGD learning rate
MOMENTUM = 0.9    # SGD momentum
GPU = True        # request GPU execution

# InfluxDB settings used by the metric/GPU loggers.
LOGGER_URL = 'msdlvm.southcentralus.cloudapp.azure.com'
# Credentials can be supplied through the environment so real secrets never
# have to be committed with the notebook; the literals are only fallbacks.
# NOTE: the misspelled name LOGGER_USRENAME is kept because other cells use it.
LOGGER_USRENAME = os.getenv('LOGGER_USERNAME', 'admin')
LOGGER_PASSWORD = os.getenv('LOGGER_PASSWORD', 'password')
LOGGER_DB = 'gpudata'
LOGGER_SERIES = 'gpu'

In [3]:
import os
from os import path
import sys
import numpy as np
import math
import mxnet as mx
from mxnet import nd, autograd
from mxnet import gluon
from utils import cifar_for_library, yield_mb, create_logger, Timer
from nb_logging import NotebookLogger, output_to, error_to
from gpumon.influxdb import log_context
import codecs

from influxdb import InfluxDBClient

In [4]:
# Open the InfluxDB connection (port 8086) used for metric logging.
client = InfluxDBClient(
    host=LOGGER_URL,
    port=8086,
    username=LOGGER_USRENAME,
    password=LOGGER_PASSWORD,
    database=LOGGER_DB)

In [5]:
# Identifiers read from the AZ_BATCH_* environment variables, with fixed
# fallbacks for when those variables are not set.
node_id = os.environ.get('AZ_BATCH_NODE_ID', 'node')
task_id = os.environ.get('AZ_BATCH_TASK_ID', 'gluon')
job_id = os.environ.get('AZ_BATCH_JOB_ID', 'gluon')

In [6]:
# Build the metric logger on top of the InfluxDB client, passing along the
# node/task/job identifiers.
logger = create_logger(
    client,
    node_id=node_id,
    task_id=task_id,
    job_id=job_id)

In [7]:
# Pick the compute device from the GPU flag in the parameters cell rather
# than unconditionally using the GPU.
ctx = mx.gpu() if GPU else mx.cpu()

In [8]:
# sys.__stdout__ = codecs.getwriter("utf-8")(sys.__stdout__.detach())

In [9]:
# nb_teminal_logger = NotebookLogger(sys.stdout.session, sys.stdout.pub_thread, sys.stdout.name, sys.__stdout__)

In [10]:
# rst_out = output_to(nb_teminal_logger)
# rst_err = error_to(nb_teminal_logger)

INFO:gpumon.influxdb_gpu_logger:Logging GPU to Database msdlvm.southcentralus.cloudapp.azure.com
INFO:gpumon.influxdb_gpu_logger:['influxdb_gpu_logger.py', 'msdlvm.southcentralus.cloudapp.azure.com', '8086', 'admin', 'password', 'gpudata', 'gpu', '--node_id=node', '--task_id=gluon', '--job_id=gluon']


In [11]:
# Report the runtime environment, one "label value" line per component.
for label, version in (("OS: ", sys.platform),
                       ("Python: ", sys.version),
                       ("MXNet: ", mx.__version__),
                       ("Numpy: ", np.__version__)):
    print(label, version)

OS:  linux
Python:  3.5.2 (default, Nov 23 2017, 16:37:01) 
[GCC 5.4.0 20160609]
MXNet:  1.0.0
Numpy:  1.13.3


In [12]:
# Root folder of the input dataset. When AZ_BATCHAI_INPUT_DATASET is not set,
# os.getenv would return None and path.join would fail with an opaque
# TypeError, so fall back to the current directory (the other environment
# lookups in this notebook also supply defaults).
dataset_root = os.getenv('AZ_BATCHAI_INPUT_DATASET', os.curdir)
data_path = path.join(dataset_root, 'cifar-10-batches-py')

In [13]:
def SymbolModule():
    """Assemble the convolutional classifier as a ``gluon.nn.Sequential``.

    Layout: two stages of (conv + relu, conv, 2x2 max-pool, relu,
    dropout 0.25) with 50 and then 100 channels, followed by flatten, a
    512-unit relu dense layer, dropout 0.25 and a final dense layer with
    ``N_CLASSES`` outputs.

    Returns:
        The assembled network; parameter initialisation is left to the caller.
    """
    def conv_stage(channels):
        # One convolutional stage; layers are listed in the order they are
        # appended to the network.
        return [
            gluon.nn.Conv2D(channels=channels, kernel_size=3, padding=1, activation='relu'),
            gluon.nn.Conv2D(channels=channels, kernel_size=3, padding=1),
            gluon.nn.MaxPool2D(pool_size=2, strides=2),
            # Equiv to gluon.nn.LeakyReLU(0)
            gluon.nn.Activation('relu'),
            gluon.nn.Dropout(0.25),
        ]

    net = gluon.nn.Sequential()
    # Create every layer inside the network's name scope, then append in order.
    with net.name_scope():
        classifier_head = [
            gluon.nn.Flatten(),
            gluon.nn.Dense(512, activation='relu'),
            gluon.nn.Dropout(0.25),
            gluon.nn.Dense(N_CLASSES),
        ]
        # The head is created before the conv stages; each layer's name is
        # numbered independently within its type, and Dense layers only appear
        # in the head, so naming and add-order match the original sequence.
        for layer in conv_stage(50) + conv_stage(100) + classifier_head:
            net.add(layer)
    return net

In [14]:
def init_model(m):
    """Create the optimiser and loss used to train ``m``.

    Args:
        m: Network whose parameters (``m.collect_params()``) will be updated.

    Returns:
        A ``(trainer, criterion)`` tuple: a ``gluon.Trainer`` running SGD with
        the notebook's ``LR`` and ``MOMENTUM`` settings, and a
        ``SoftmaxCrossEntropyLoss`` instance.
    """
    sgd_options = {'learning_rate': LR, 'momentum': MOMENTUM}
    optimizer = gluon.Trainer(m.collect_params(), 'sgd', sgd_options)
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    return optimizer, loss_fn

In [15]:
%%time
# Load the train/test arrays from data_path in channel-first layout.
x_train, x_test, y_train, y_test = cifar_for_library(data_path, channel_first=True)

# Show the shape and the dtype of each split, one line per property.
splits = (x_train, x_test, y_train, y_test)
print(*[arr.shape for arr in splits])
print(*[arr.dtype for arr in splits])

Preparing train set...
Preparing test set...
Done.
(50000, 3, 32, 32) (10000, 3, 32, 32) (50000,) (10000,)
float32 float32 int32 int32
CPU times: user 852 ms, sys: 768 ms, total: 1.62 s
Wall time: 1.62 s


In [16]:
%%time
# Build the network, then initialise its parameters on the selected device
# using the Xavier initialiser.
sym = SymbolModule()
xavier_init = mx.init.Xavier(magnitude=2.24)
sym.collect_params().initialize(xavier_init, ctx=ctx)

CPU times: user 392 ms, sys: 404 ms, total: 796 ms
Wall time: 1.37 s


In [None]:
%%time
# Create the SGD trainer and the loss function for the network (see init_model).
trainer, criterion = init_model(sym)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 268 µs


In [None]:
# Train for EPOCHS passes inside the GPU log context, timing the whole run.
with Timer() as t:
    with log_context(LOGGER_URL, LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB, LOGGER_SERIES,
                     node_id=node_id, task_id=task_id, job_id=job_id):
        for j in range(EPOCHS):
            train_loss = 0.0
            n_seen = 0  # samples actually processed during this epoch
            for data, target in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):
                # Move the mini-batch onto the training device
                data = nd.array(data).as_in_context(ctx)
                target = nd.array(target).as_in_context(ctx)
                # Record the forward pass so gradients can be computed
                with autograd.record():
                    # Forwards
                    output = sym(data)
                    # Loss
                    loss = criterion(output, target)
                # Back-prop
                loss.backward()
                batch_size = data.shape[0]
                trainer.step(batch_size)
                train_loss += nd.sum(loss).asscalar()
                n_seen += batch_size
            # Log the mean per-sample loss. Divide by the samples actually
            # visited rather than len(x_train): the batch generator is not
            # guaranteed to cover every sample (the evaluation cell assumes
            # only whole batches are produced). max() guards an empty epoch.
            print('Epoch %3d: loss: %5.4f'%(j, train_loss/max(n_seen, 1)))
print('Training took %.03f sec.' % t.interval)
logger('training duration', value=t.interval)

Epoch   0: loss: 1.8363
Epoch   1: loss: 1.3678
Epoch   2: loss: 1.1515
Epoch   3: loss: 0.9821
Epoch   4: loss: 0.8468
Epoch   5: loss: 0.7368


In [None]:
%%time
# Test model
# Buffers are sized to a whole number of batches, matching the slices that
# are written below.
n_samples = (y_test.shape[0]//BATCHSIZE)*BATCHSIZE
# Builtin int replaces the np.int alias, which is deprecated and absent from
# newer NumPy releases; the resulting dtype is the same.
y_guess = np.zeros(n_samples, dtype=int)
y_truth = y_test[:n_samples]
for batch_idx, (data, _target) in enumerate(yield_mb(x_test, y_test, BATCHSIZE)):
    # Move the mini-batch onto the device
    data = nd.array(data).as_in_context(ctx)
    # Forwards
    output = sym(data)
    pred = nd.argmax(output, axis=1)
    # Store this batch's predicted class indices in its slot of the buffer
    start = batch_idx*BATCHSIZE
    y_guess[start:start + BATCHSIZE] = pred.asnumpy()

In [None]:
# Fraction of scored test samples whose predicted label matches the truth.
# np.mean over the boolean array gives the same value as sum()/len() without
# iterating over the NumPy array element by element in Python.
acc = np.mean(y_guess == y_truth)
print("Accuracy: ", acc)
logger('accuracy', value=acc)