In [1]:
import tensorflow as tf
from tensorflow import keras

In [14]:
tensor = tf.constant([
  [1.0, 2.0, 3.0],
  [4.0, 5.0, 6.0]
])

print(tensor.shape, tensor.dtype)
print(tensor[:, 1:])
print(tensor[..., 1, tf.newaxis])
print(tensor[..., 1, None]) # identical (tf.newaxis == None)

(2, 3) <dtype: 'float32'>
tf.Tensor(
[[2. 3.]
 [5. 6.]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[2.]
 [5.]], shape=(2, 1), dtype=float32)
tf.Tensor(
[[2.]
 [5.]], shape=(2, 1), dtype=float32)


In [17]:
# immutable tensors
t1 = tensor
t2 = tf.constant([
  [7.0, 7.0],
  [7.0, 7.0],
  [7.0, 7.0],
])

print(t1 @ t2) # matrix multiplication
print(tf.transpose(t1)) # transpose

# mutable tensor
v1 = tf.Variable([
  [1., 2., 3.],
  [4., 5., 6.],
  [7., 8., 9.],
])

v1.assign(tf.transpose(v1))
v1[1, 1].assign(42.)
v1.scatter_nd_update(indices=([0, 0], [2, 2]), updates=(100., 200.))
print(v1)

tf.Tensor(
[[ 42.  42.]
 [105. 105.]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[1. 4.]
 [2. 5.]
 [3. 6.]], shape=(3, 2), dtype=float32)
<tf.Variable 'Variable:0' shape=(3, 3) dtype=float32, numpy=
array([[100.,   4.,   7.],
       [  2.,  42.,   8.],
       [  3.,   6., 200.]], dtype=float32)>


In [25]:
# Load some data before getting on plane lol
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(60000, 28, 28) (10000, 28, 28) (60000,) (10000,)


In [30]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
housing = fetch_california_housing()
X, y = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)
print(X_train.shape, X_val.shape, X_test.shape)

(11610, 8) (3870, 8) (5160, 8)


In [42]:
def huber_loss(y_true, y_pred, delta=1.):
  errors = tf.abs(y_pred - y_true)
  low_error = errors < delta
  squared_error = delta * tf.square(errors) / 2
  linear_error = delta * (errors - 0.5)
  return tf.where(low_error, squared_error, linear_error)

class HuberMetric(keras.metrics.Metric):
  def __init__(self, delta=1.0, **kwargs):
    super().__init__(**kwargs)
    self.delta = delta
    self.count = tf.Variable(0.)
    self.sum = tf.Variable(0.)
  
  def update_state(self, y_true, y_pred, sample_weight=None):
    errors = huber_loss(y_true, y_pred, self.delta)
    self.sum.assign_add(tf.reduce_sum(errors))
    self.count.assign_add(tf.cast(tf.size(errors), tf.float32))
  
  def result(self):
    return self.sum / self.count
  
  def get_config(self):
    base_config = super().get_config()
    return {**base_config, 'delta': self.delta}

y_true = tf.constant([20., 30., 40., 50.])
y_pred = tf.constant([20.5, 35., 39.25, 53.])

print(huber_loss(y_true, y_pred))

metric = HuberMetric()
metric.update_state(y_true[:2], y_pred[:2])
print(metric.result())
metric.update_state(y_true[2:], y_pred[2:])
print(metric.result())

tf.Tensor([0.125   4.5     0.28125 2.5    ], shape=(4,), dtype=float32)
tf.Tensor(2.3125, shape=(), dtype=float32)
tf.Tensor(1.8515625, shape=(), dtype=float32)


In [43]:
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=X_train.shape[1:]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(30, activation='elu', kernel_initializer='he_normal'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(30, activation='elu', kernel_initializer='he_normal'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(30, activation='elu', kernel_initializer='he_normal'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(1))

model.compile(optimizer=keras.optimizers.SGD(0.035, 0.9, True), loss=huber_loss, metrics=[HuberMetric()])

model.fit(X_train, y_train, epochs=256, validation_data=(X_val, y_val), callbacks=[
  # keras.callbacks.TensorBoard(get_run_log_dir()),
  # keras.callbacks.ModelCheckpoint(save_dir, save_best_only=True),
  keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True),
  keras.callbacks.ReduceLROnPlateau(factor=0.8,patience=5),
])


Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256
Epoch 36/256
Epoch 37/256
Epoch 38/256
Epoch 39/256
Epoch 40/256
Epoch 41/256
Epoch 42/256


<keras.callbacks.History at 0x2929c7940>

In [47]:
from sklearn.metrics import mean_absolute_error

pred = model.predict(X_test)
print(mean_absolute_error(y_test, pred))
print(y_test[12], pred[12])

0.4242804723101253
0.788 [0.783056]


In [54]:
# Custom layer
class CustomDense(keras.layers.Layer):
  def __init__(self, units, activation=None, **kwargs):
    super().__init__(**kwargs)
    self.units = units
    self.activation = keras.activations.get(activation)
  
  def build(self, batch_input_shape):
    self.kernel = self.add_weight(
      name='kernel', shape=(batch_input_shape[-1], self.units),
      initializer='he_normal',
    )
    self.bias = self.add_weight(name='bias', shape=[self.units], initializer='zeros')
    super().build(batch_input_shape) # must be at the end
  
  def call(self, X):
    return self.activation((X @ self.kernel) + self.bias)
  
  def compute_output_shape(self, batch_input_shape):
    return tf.TensorShape(batch_input_shape.as_list()[:-1] + [self.units])
  
  def get_config(self):
    base = super().get_config()
    return {**base, 'units': self.units, 'activation': keras.activations.serialize(self.activation)}

In [99]:
hidden = 2
units = 100

model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=X_train.shape[1:]))
model.add(keras.layers.BatchNormalization())
for _ in range(hidden):
  model.add(CustomDense(units, activation='elu'))
  model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(1))

model.compile(optimizer=keras.optimizers.SGD(0.035, 0.9, True), loss=huber_loss, metrics=[HuberMetric()])

model.fit(X_train, y_train, epochs=256, validation_data=(X_val, y_val), callbacks=[
  # keras.callbacks.TensorBoard(get_run_log_dir()),
  # keras.callbacks.ModelCheckpoint(save_dir, save_best_only=True),
  keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True),
  keras.callbacks.ReduceLROnPlateau(factor=0.8,patience=5),
])

Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256


<keras.callbacks.History at 0x2cee75bd0>

In [94]:
pred = model.predict(X_test)
print(mean_absolute_error(y_test, pred))
for i in range(10):
  print(y_test[i], pred[i])

0.40852880441154066
2.405 [2.5110328]
1.975 [1.6943629]
3.151 [2.9167361]
1.875 [1.8150289]
1.893 [2.1726897]
1.265 [1.3826244]
1.5 [2.0641282]
1.335 [1.2278967]
0.875 [0.9539348]
0.863 [0.88381755]


In [136]:
class CustomModelWithCustomLoss(keras.models.Model):
  def __init__(self, input_units, n_hidden=5, units=30, **kwargs):
    super().__init__(**kwargs)
    self.hidden = [keras.layers.Dense(units, activation='selu', kernel_initializer='lecun_normal') for _ in range(n_hidden)]
    self.input_batch = keras.layers.BatchNormalization()
    self.batch = [keras.layers.BatchNormalization() for _ in range(n_hidden)]
    self.out = keras.layers.Dense(1)
    print(input_units)
    self.reconstruction = keras.layers.Dense(input_units)
  
  def call(self, inputs):
    Z = inputs
    Z = self.input_batch(Z)
    scaled_input = Z
    for i in range(len(self.hidden)):
      Z = self.hidden[i](Z)
      Z = self.batch[i](Z)
    recon = self.reconstruction(Z)
    self.add_loss(tf.reduce_mean(tf.square(recon - scaled_input)))

    return self.out(Z)

customModel = CustomModelWithCustomLoss(X_train.shape[-1], n_hidden=3, units=10)

customModel.compile(optimizer=keras.optimizers.SGD(0.035, 0.9, True), loss=keras.losses.mean_squared_error, metrics=[keras.metrics.mean_squared_error])

customModel.fit(X_train, y_train, epochs=256, validation_data=(X_val, y_val), callbacks=[
  keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
  keras.callbacks.ReduceLROnPlateau(factor=0.8,patience=10),
])

pred = customModel.predict(X_test)
print(mean_absolute_error(y_test, pred))


8
Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256
Epoch 36/256
Epoch 37/256
Epoch 38/256
Epoch 39/256
Epoch 40/256
Epoch 41/256
Epoch 42/256
Epoch 43/256
Epoch 44/256
Epoch 45/256
Epoch 46/256
Epoch 47/256
Epoch 48/256
Epoch 49/256
Epoch 50/256
Epoch 51/256
Epoch 52/256
Epoch 53/256
Epoch 54/256
Epoch 55/256
Epoch 56/256
Epoch 57/256
Epoch 58/256
Epoch 59/256
Epoch 60/256
0.49030395756424305


In [137]:
pred = model.predict(X_test)
print(mean_absolute_error(y_test, pred))
for i in range(10):
  print(y_test[i], pred[i])

0.44318227836202273
2.405 [2.5989137]
1.975 [1.5719242]
3.151 [3.117066]
1.875 [2.016601]
1.893 [1.997782]
1.265 [1.4025056]
1.5 [1.9752915]
1.335 [1.2818793]
0.875 [0.9925835]
0.863 [1.0024114]


In [139]:
def f(x, y):
  return (x**2) + (3*(y**3))

w1, w2 = tf.Variable(10.), tf.Variable(6.)

with tf.GradientTape() as tape:
  z = f(w1, w2)
  print(z)

gradients = tape.gradient(z, [w1, w2])
print(gradients)

tf.Tensor(748.0, shape=(), dtype=float32)
[<tf.Tensor: shape=(), dtype=float32, numpy=20.0>, <tf.Tensor: shape=(), dtype=float32, numpy=324.0>]


In [140]:
w1, w2 = tf.Variable(10.), tf.Variable(6.)

with tf.GradientTape() as tape:
  z = f(w1, w2)
  print(z)

gradients1 = tape.gradient(z, [w1, w2])
# should throw runtime 
gradients2 = tape.gradient(z, [w1, w2])
print(gradients)

tf.Tensor(748.0, shape=(), dtype=float32)


RuntimeError: A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)

In [142]:
w1, w2 = tf.Variable(10.), tf.Variable(6.)

with tf.GradientTape(persistent=True) as tape:
  z = f(w1, w2)
  print(z)

gradients1 = tape.gradient(z, [w1, w2])
# should throw runtime 
gradients2 = tape.gradient(z, [w1, w2])

print(gradients1)
print(gradients2)

del tape

tf.Tensor(748.0, shape=(), dtype=float32)
[<tf.Tensor: shape=(), dtype=float32, numpy=20.0>, <tf.Tensor: shape=(), dtype=float32, numpy=324.0>]
[<tf.Tensor: shape=(), dtype=float32, numpy=20.0>, <tf.Tensor: shape=(), dtype=float32, numpy=324.0>]


In [177]:
# Custom training loop time

import numpy as np

# start with a model
# (doesn't need to be compiled since we are defining our own training loop)
l2_reg = keras.regularizers.l2(0.05)
model = keras.models.Sequential([
  keras.layers.BatchNormalization(),
  keras.layers.Dense(
    30,
    activation='elu',
    kernel_initializer='he_normal',
    kernel_regularizer=l2_reg,
  ),
  keras.layers.BatchNormalization(),
  keras.layers.Dense(
    30,
    activation='elu',
    kernel_initializer='he_normal',
    kernel_regularizer=l2_reg,
  ),
  keras.layers.BatchNormalization(),
  keras.layers.Dense(1, kernel_regularizer=l2_reg),
])

# function to randomly sample a batch for training
def random_batch(X, y, batch_size=32):
  idx = np.random.randint(len(X), size=batch_size)
  return X[idx], y[idx]

# what to print during training epoch
def print_status_bar(iteration, total, loss, metrics=None):
  metrics = ' - '.join(["{}: {:.4f}".format(m.name, m.result()) for m in [loss] + (metrics or [])])
  end = '' if iteration < total else '\n'
  print('\r{}/{} - '.format(iteration, total) + metrics, end=end)

# Actual loop
n_epochs = 12
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = keras.optimizers.Nadam(learning_rate=0.01)
# optimizer = keras.optimizers.SGD(learning_rate=0.035, momentum=0.9, nesterov=True)
loss_fn = keras.losses.mean_squared_error
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.MeanAbsoluteError()]

for epoch in range(1, n_epochs + 1):
  print('Epoch {}/{}'.format(epoch, n_epochs))
  for step in range(1, n_steps + 1):
    X_batch, y_batch = random_batch(X_train, y_train, batch_size=batch_size)
    
    with tf.GradientTape() as tape:
      y_pred = model(X_batch)
      main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
      loss = tf.add_n([main_loss] + model.losses)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    mean_loss(loss)

    for metric in metrics:
      metric(y_batch, y_pred)
    print_status_bar(step * batch_size, len(y_train), mean_loss, metrics)
  print_status_bar(len(y_train), len(y_train), mean_loss, metrics)
  for metric in metrics:
    metric.reset_state()



Epoch 1/12
11610/11610 - mean: 41183.3438 - mean_absolute_error: 19.34435
Epoch 2/12
11610/11610 - mean: 20595.3906 - mean_absolute_error: 0.9142
Epoch 3/12
11610/11610 - mean: 13732.5566 - mean_absolute_error: 0.9188
Epoch 4/12
11610/11610 - mean: 10301.2842 - mean_absolute_error: 0.9411
Epoch 5/12
11610/11610 - mean: 8242.3613 - mean_absolute_error: 0.9175
Epoch 6/12
11610/11610 - mean: 6869.6895 - mean_absolute_error: 0.9223
Epoch 7/12
11610/11610 - mean: 5889.1738 - mean_absolute_error: 0.9165
Epoch 8/12
11610/11610 - mean: 5153.7622 - mean_absolute_error: 0.9142
Epoch 9/12
11610/11610 - mean: 4582.1865 - mean_absolute_error: 0.9263
Epoch 10/12
11610/11610 - mean: 4125.2012 - mean_absolute_error: 0.9168
Epoch 11/12
11610/11610 - mean: 3751.1396 - mean_absolute_error: 0.9040
Epoch 12/12
11610/11610 - mean: 3439.3057 - mean_absolute_error: 0.9155


In [178]:
pred = model.predict(X_test)
print(mean_absolute_error(y_test, pred))
for i in range(10):
  print(y_test[i], pred[i])

0.9132545991188019
2.405 [2.0760326]
1.975 [2.0759225]
3.151 [2.0759225]
1.875 [2.0759225]
1.893 [2.0759225]
1.265 [2.0759225]
1.5 [2.0759225]
1.335 [2.0759225]
0.875 [2.0849552]
0.863 [2.0759225]


In [179]:
def cube(x):
  return x**3

tf_cube = tf.function(cube)

print(tf_cube(10))


tf.Tensor(1000, shape=(), dtype=int32)


In [288]:
class LayerNorm(keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
  
  def build(self, batch_input_shape):
    shape = batch_input_shape[-1:]
    self.alpha = self.add_weight(name='alpha', shape=shape, initializer='ones')
    self.beta = self.add_weight(name='beta', shape=shape, initializer='zeros')
    super().build(batch_input_shape)

  def call(self, inputs):
    mean, var = tf.nn.moments(inputs, axes=0, keepdims=True)
    std = tf.sqrt(var)
    return tf.multiply(self.alpha, ((inputs - mean) / (std + 1e-6))) + self.beta


In [290]:
from sklearn.model_selection import train_test_split

(X_train, y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

In [298]:
model = keras.models.Sequential([
  keras.layers.Flatten(input_shape=X_train.shape[1:]),
  LayerNorm(),
  keras.layers.Dense(300, activation='elu', kernel_initializer='he_normal'),
  LayerNorm(),
  keras.layers.Dense(300, activation='elu', kernel_initializer='he_normal'),
  LayerNorm(),
  keras.layers.Dense(300, activation='elu', kernel_initializer='he_normal'),
  LayerNorm(),
  keras.layers.Dense(10, activation='softmax'),
])

model.compile(optimizer=keras.optimizers.SGD(0.035, momentum=0.9, nesterov=True), loss=keras.losses.sparse_categorical_crossentropy, metrics=[keras.metrics.sparse_categorical_crossentropy])

model.fit(X_train, y_train, epochs=256, validation_data=(X_val, y_val), callbacks=(
  keras.callbacks.EarlyStopping(patience=30, restore_best_weights=True),
  keras.callbacks.ReduceLROnPlateau(factor=0.8, patience=5),
))

Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256

KeyboardInterrupt: 

In [299]:
from sklearn.metrics import confusion_matrix

pred = model.predict(X_test)
print(confusion_matrix(y_test, np.argmax(pred, axis=1)))

[[830   1  19  22   5   4 110   0   9   0]
 [  7 976   1  10   2   0   3   0   1   0]
 [ 19   2 833  15  83   0  47   0   1   0]
 [ 25   6  14 900  41   1   9   0   4   0]
 [  4   1  85  27 845   0  34   0   4   0]
 [  0   0   0   1   0 958   1  27   0  13]
 [125   1  90  31  92   0 652   0   9   0]
 [  0   0   0   0   0   9   0 969   1  21]
 [  5   0   4   4   4   3   5   3 972   0]
 [  0   0   0   0   0  10   1  46   0 943]]


In [316]:
model = keras.models.Sequential([
  keras.layers.Flatten(input_shape=X_train.shape[1:]),
  LayerNorm(),
  keras.layers.Dense(300, activation='elu', kernel_initializer='he_normal'),
  LayerNorm(),
  keras.layers.Dense(300, activation='elu', kernel_initializer='he_normal'),
  LayerNorm(),
  keras.layers.Dense(10, activation='softmax'),
])

In [319]:
def random_batch(X, y, batch_size):
  idx = np.random.randint(len(X), size=batch_size)
  return X[idx], y[idx]

epochs = 12
batch_size = 64
n_steps = len(X_train) // batch_size
loss_fn = keras.losses.sparse_categorical_crossentropy
metric = keras.metrics.sparse_categorical_accuracy
mean = keras.metrics.Mean()
optimizer = keras.optimizers.SGD(learning_rate=0.00875, momentum=0.9, nesterov=True)

for epoch in range(1, epochs + 1):
  for step in range(1, n_steps + 1):
    X_batch, y_batch = random_batch(X_train, y_train, batch_size)
    with tf.GradientTape() as tape:
      pred = model(X_batch)
      loss = tf.reduce_mean(loss_fn(y_batch, pred))
    mean(loss)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    end = ''
    if step == n_steps:
      end = '\n'
    print('\r epoch: {} mean loss: {}, {}%'.format(epoch, mean.result(), step / n_steps * 100), end=end)
  print('validation loss: {:.4f}, validation accuracy: {:.4f}'.format(tf.reduce_mean(loss_fn(y_val, model(X_val))), tf.reduce_mean(metric(y_val, model(X_val)))))
  mean.reset_state()


 epoch: 1 mean loss: 0.04829869419336319, 100.0%75248933145%%%
validation loss: 0.4333, validation accuracy: 0.8931
 epoch: 2 mean loss: 0.040302641689777374, 100.0%5248933145%%%%
validation loss: 0.4460, validation accuracy: 0.8945
 epoch: 3 mean loss: 0.0411611869931221, 100.0%775248933145%%%
validation loss: 0.4432, validation accuracy: 0.8949
 epoch: 4 mean loss: 0.03848792612552643, 100.0%75248933145%%%%
validation loss: 0.4567, validation accuracy: 0.8937
 epoch: 5 mean loss: 0.03656065836548805, 100.0%75248933145%%%
validation loss: 0.4629, validation accuracy: 0.8960
 epoch: 6 mean loss: 0.031425293534994125, 100.0%248933145%%%%%
validation loss: 0.4843, validation accuracy: 0.8934
 epoch: 7 mean loss: 0.034133028239011765, 100.0%5248933145%%%
validation loss: 0.4930, validation accuracy: 0.8942
 epoch: 8 mean loss: 0.03184564411640167, 100.0%75248933145%%%%
validation loss: 0.4901, validation accuracy: 0.8923
 epoch: 9 mean loss: 0.029081635177135468, 100.0%248933145%%%%
valid

In [320]:
pred = model.predict(X_test)
print(confusion_matrix(y_test, np.argmax(pred, axis=1)))

[[822   3  14  25   9   1 116   0  10   0]
 [  6 975   1  12   2   1   2   0   1   0]
 [ 24   1 787  13  95   1  77   1   1   0]
 [ 24   8  11 888  32   3  28   0   6   0]
 [  3   3  78  35 829   1  48   0   3   0]
 [  1   0   0   1   0 958   2  19   3  16]
 [119   2  75  30  67   2 693   0  12   0]
 [  1   0   0   0   0  23   0 933   1  42]
 [  5   1   2   5   4   3  10   4 966   0]
 [  1   0   0   0   0  12   1  28   0 958]]
