In [7]:
# do not forget to swapoff -a
import numpy as np       # linear algebra
import pylab as pl       # plots
import tensorflow as tf  # now we are actually using it 

In [8]:
# Pick the dataset to train on (swap the two lines below to use Fashion-MNIST).
# Both loaders return ((train images, train labels), (test images, test labels)).
dataset = tf.keras.datasets.mnist
#dataset = tf.keras.datasets.fashion_mnist
(train, label_train), (test, label_test) = dataset.load_data()

# The graph works in float32, so cast the image arrays up front.
train, test = (images.astype(np.float32) for images in (train, test))

# Report image/label array shapes for both splits.
for images, labels in ((train, label_train), (test, label_test)):
    print(images.shape, labels.shape)

((60000, 28, 28), (60000,))
((10000, 28, 28), (10000,))


In [9]:
# Optional subsampling: keep only the first num_train / num_test examples.
# The values below keep every example, as the printed shapes confirm.
num_train = 60000
num_test = 10000

train = train[:num_train]
label_train = label_train[:num_train]
test = test[:num_test]
label_test = label_test[:num_test]

# Report the shapes after slicing.
for images, labels in ((train, label_train), (test, label_test)):
    print(images.shape, labels.shape)

((60000, 28, 28), (60000,))
((10000, 28, 28), (10000,))


In [10]:
# Drop the 2-D image layout: every 28x28 image becomes one row of 784 pixels.
train, test = (images.reshape((-1, 784)) for images in (train, test))

In [11]:
# Define the session graph: a linear softmax classifier over flattened images.

# Inputs fed at run time.
X_tf = tf.placeholder(shape=[None, 784], dtype=tf.float32) # for each image we have 784 pixels
Y_tf = tf.placeholder(shape=[None],      dtype=tf.int64)   # for each image we have one scalar label

# Trainable parameters (zero-initialised) and the resulting logits F = X A + a.
A_tf = tf.Variable(np.zeros((784, 10)), dtype=tf.float32)
a_tf = tf.Variable(np.zeros((10)),      dtype=tf.float32)
F_tf = tf.add(tf.matmul(X_tf, A_tf), a_tf)

# map scalar labels onto one-hot encoded vectors
L_tf = tf.one_hot(Y_tf, 10, dtype=tf.float32)

# Compute softmax cross-entropy with logits via a numerically stable log-softmax.
# The shift must be the PER-ROW maximum: each row then contains an entry equal
# to exp(0) = 1, so the normaliser is >= 1 and its log is always finite.
# (A single batch-wide maximum can make every exp() of a low-valued row
# underflow to 0, which turns the log into -inf.)
M_tf = tf.expand_dims(tf.reduce_max(F_tf, axis=1), 1)
shifted_F_tf = F_tf - M_tf
norm_tf = tf.expand_dims(tf.reduce_sum(tf.exp(shifted_F_tf), axis=1), 1)
# No guard is needed around the log: norm_tf >= 1 by construction.
log_norm_tf = tf.log(norm_tf)
log_softmax_F_tf = shifted_F_tf - log_norm_tf
# Mean over the batch of the negative log-probability of the true class.
loss_tf = tf.reduce_mean(tf.reduce_sum(-L_tf * log_softmax_F_tf, axis=1))

# let us define the non-differentiable accuracy as metric
# Softmax is monotonic within a row, so the predicted class is simply the
# argmax of the raw logits; the integer labels are compared directly.
correctly_predicted_tf = tf.equal(Y_tf, tf.argmax(F_tf, axis=1))
metric_tf = tf.reduce_mean(tf.cast(correctly_predicted_tf, tf.float32))

# define the optimizer
# The step size is tiny because the pixel inputs are fed unscaled.
optimizer = tf.train.GradientDescentOptimizer(1E-6)
step_tf = optimizer.minimize(loss_tf)

In [12]:
# Train with mini-batch gradient descent, reporting the loss and accuracy on
# the full training set at regular intervals and on the test set at the end.
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    # Schedule: total number of updates, reporting interval, mini-batch size.
    num_iterations = 2**14
    print_every = 2**10
    batch_size = 2**5

    for iteration in range(num_iterations):

        # Draw a mini-batch of distinct training examples.
        indices = np.random.choice(train.shape[0], batch_size, replace=False)
        X = train[indices]
        Y = label_train[indices]

        # One gradient-descent update on this mini-batch.
        sess.run(step_tf, feed_dict={X_tf: X, Y_tf: Y})

        # Report after every print_every-th update.
        if (iteration + 1) % print_every == 0:
            loss, metric = sess.run([loss_tf, metric_tf],
                                    feed_dict={X_tf: train, Y_tf: label_train})
            print("train loss and metric:",loss, metric)

    # Final evaluation on the held-out test split.
    loss, metric = sess.run([loss_tf, metric_tf],
                            feed_dict={X_tf: test, Y_tf: label_test})
    print("test loss and metric:", loss, metric)

('train loss and metric:', 0.38203013, 0.89428335)
('train loss and metric:', 0.34408128, 0.90433335)
('train loss and metric:', 0.3276412, 0.9087)
('train loss and metric:', 0.31346628, 0.91118336)
('train loss and metric:', 0.30585456, 0.91595)
('train loss and metric:', 0.30383998, 0.91533333)
('train loss and metric:', 0.29677233, 0.91686666)
('train loss and metric:', 0.29354337, 0.91796666)
('train loss and metric:', 0.29127175, 0.91815)
('train loss and metric:', 0.28918633, 0.91898334)
('train loss and metric:', 0.2856266, 0.92141664)
('train loss and metric:', 0.28520247, 0.92018336)
('train loss and metric:', 0.28161505, 0.92223334)
('train loss and metric:', 0.28331536, 0.92111665)
('train loss and metric:', 0.27989212, 0.92286664)
('train loss and metric:', 0.27751896, 0.92356664)
('test loss and metric:', 0.28106183, 0.9215)
