# Introduction to TensorFlow: Multilayer Network
checked 20.02.24 GPaaß  

In [None]:
import os, sys; #sys.path.insert(0, os.path.abspath('..'));  import hlp
import numpy as np
import tensorflow as tf
from IPython.display import display, Markdown  # for formatting answers to questions
print("tensorflow version:", tf.__version__)    # check the version of tensorflow
print("python version =",sys.version_info)      # check the version of python
print("Tensorflow compute devices (CPU, GPU)")
for dv in tf.config.list_physical_devices():
    print(dv)
!nvidia-smi

The next cell defines a print function `print_mat` for tensors.

In [None]:
# @title
def print_mat(x, title="", prtDim=True, max_rows=10, max_columns=10, precision=3, doRound=True,index=None, rowNames=None, colNames=None ):
    """ use pandas display to print a tensor, array or dataframe as a table
        x: tf.Tensor, numpy array or pandas DataFrame to be shown
        title: to be printed above the table (nothing is printed if "")
        prtDim: True  print the shape of x after the title
        max_rows: number or None (None = no limit)
        max_columns: number or None (None = no limit)
        precision: number of decimals used for rounding and for display
        doRound: True  perform rounding (avoid E notation)
        index: None  row names (alias of rowNames, only used if rowNames is None)
        rowNames: None  row names
        colNames: None  column names
    """
    import pandas as pd
    import tensorflow as tf
    import numpy as np
    from IPython.display import display
    # `index` is documented as row names but was never used: treat it as an alias.
    if rowNames is None:
        rowNames = index
    # The display limits are changed only inside this block and restored afterwards.
    with pd.option_context('display.max_rows', max_rows, 'display.max_columns', max_columns, 'display.precision',precision):
        if tf.is_tensor(x):
            x = x.numpy()
        if doRound:
            x = np.round(x,decimals=precision)
        if title!="":
            if prtDim:
                print(title,x.shape)
            else:
                # prtDim=False: title only, without the shape
                print(title)
        display(pd.DataFrame(x,index=rowNames, columns=colNames))


## Data is MNIST or Fashion
Read data:
* if `use_MNIST = True` read the **MNIST** data of 28x28 grey-value images of digits in 10 classes.
* if `use_MNIST = False` read the **Fashion** data of 28x28 grey-value images of clothes in 10 classes.



**Target**: assign each $28\times28$ pixel image to one of the classes  0,....,9.

The data has the following form
* input tensor $x$ of shape 60000x28x28 with 60000 grey-value images of size 28x28.
* output vector $y$ of length 60000 with the observed class index (0,...,9) of each image.


<!--- training data *mnist.train* contains 55000 images of digits, the validation data *mnist.validation* of 5000 images and the test data *mnist.test* of 10000 images. It is retrieved by  --->

In [None]:
from tensorflow import keras
from keras import layers
from keras.layers import Dense, Flatten, Dropout
from keras.models import Sequential, clone_model
import matplotlib.pyplot as plt
import pandas as pd
use_MNIST = True

In [None]:
# Select the Keras dataset module together with the matching class labels.
if use_MNIST:
    dataset = keras.datasets.mnist
    class_names = ['null', 'eins', 'zwei', 'drei', 'vier','fünf', 'sechs', 'sieben', 'acht', 'neun']
else:
    dataset = keras.datasets.fashion_mnist
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Both datasets are delivered as a (train, test) pair of (images, labels) tuples.
(x_train, y_train), (x_test, y_test) = dataset.load_data()
print("x_train.shape",x_train.shape,"\ty_train.shape",y_train.shape)
print("x_test.shape",x_test.shape,"\ty_test.shape",y_test.shape)

# Keep a reference to the unscaled pixel values for inspection and plotting,
# then rescale the model inputs by 1/255.
xx_train = x_train
x_train = x_train / 255.0
x_test = x_test / 255.0

Select one training example (index `itm`) and show its class label and its raw pixel values.

In [None]:
itm = 4    # index of the training example to print
print("y_train["+str(itm)+"] =",class_names[y_train[itm]])
# Show the unscaled pixel matrix of the example as a table.
df = pd.DataFrame(xx_train[itm,])
# print_mat lifts the pandas row/column limits only for this one call
# (max_rows=None, max_columns=None), so the global display options stay untouched.
print_mat(df,"x_train["+str(itm)+"] =", max_columns=None, max_rows=None)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Class index of the selected example and its one-hot encoding.
iclass = y_train[itm]
yhot1 = np.eye(len(class_names))[iclass]

# Unscaled pixels of the example as a 28 x 28 matrix of 8-bit values.
x1 = xx_train[itm]
xx1 = np.array(x1, dtype='uint8').reshape((28, 28))

print("iclass[",itm,"]=",class_names[iclass])
print("yhot1[",itm,"]=",yhot1)
plt.imshow(xx1, cmap='gray')
plt.show()

In [None]:
# Show the first 25 training images in a 5 x 5 grid, each titled with its class name.
plt.figure(figsize=(18,18))
for position, (image, label) in enumerate(zip(xx_train[:25], y_train[:25]), start=1):
    plt.subplot(5, 5, position)          # subplot positions are 1-based
    plt.title(class_names[label])
    plt.imshow(image, cmap=plt.get_cmap('gray'))   # unscaled pixel data
plt.show()

## Multi-Layer Neural Network with Tensorflow
### The Data
The data is a tensor with dimensions (60000, 28, 28).
* The **first dimension** `x_train[i,*,*]` indexes the different examples of the training data; here each example is the pixel matrix of one image. <br>
This is the **convention for all models**.
* The second and the third dimension indicate the rows `x_train[i,j,*]` and columns `x_train[i,*,k]` of the `i`-th pixel matrix.



### The Model
<!---This is an adapted version of the tensorflow [tutorial](https://www.tensorflow.org/versions/r0.11/tutorials/mnist/beginners/index.html). --->

We always process batches of $batchsize$ input-output pairs in parallel (e.g. $batchsize=64$). The example index is always the first dimension.
The model  
- flattens the pixel matrix to a vector $v$ <br>
- transforms $v$ with a fully connected nonlinear layer to a hidden vector `hid1` <br>
$$ hid1 = \text{tanh}(v*W1 +b1) $$
- transforms `hid1` with a fully connected linear layer and then applies `softmax` to obtain a probability vector `py`
 of the different classes (=digits). <br>
$$ py = \text{softmax}(hid1*W3 +b3) $$



### Loss function

* $x_i$ is the observed image, $y_i$ is the class index of the observed digit and $py_i$ the vector with all predicted class probabilities for $x_i$.
* The **loss** for a single image - label pair $(x_i,y_i)$ is defined as
$$  -  \log(p(y_i|x_i,w)) >0 $$
The loss is larger than 0 as the probability is smaller than 1.0.
$$loss(w) = \frac{1}{n}\sum_{i=1}^n -  \log(p(y_i|x_i,w)) $$
* The loss is averaged over the $n$ image - label pairs of the minibatch in the training data.

### Define the Model Architecture


We use regularization by **L2-regularization** (weight decay):
* `regularizers.l2(0.001)` means that every coefficient $w_i$ in the weight matrix of the layer will add $0.001 * w_i^2$ to the total loss of the network. Hence this coefficient is pulled to 0.0.  

In [None]:
import random as python_random
def reset_seeds(num):
  """ re-seed all random number generators used in this notebook

      num: integer seed for Python's `random` module and for numpy;
           the TensorFlow global seed is set to num+1
  """
  python_random.seed(num)
  np.random.seed(num)
  tf.random.set_seed(num + 1)

In [None]:
reset_seeds(17)  # re-seed the random number generators before the layers are created
nhid1 = 20      # number of units of the hidden Dense layer (alternative to try: 200)
l2reg = 0.000   # L2 regularization factor passed to both Dense layers (0.0 = no penalty)
batch_size = 64 # number of randomly selected elements for gradient computation

model=Sequential()
# Alternative with the functional API (kept for reference, not executed):
#inpMatrix = keras.Input(x_train.shape[1:])    # A shape tuple (integers), not including the batch size.

# ------ LAYER 1: flatten ----------
model.add(Flatten(input_shape=(28, 28))) # convert each 28x28 matrix to a vector of 784 values

# ------ LAYER 2: hidden layer ----------
model.add(Dense(units=nhid1,                                 # fully connected layer with nhid1 units
                       activation='tanh',
                       kernel_regularizer=keras.regularizers.l2(l2reg),  # L2 penalty on the weight matrix (alternative to try: 0.002)
                       kernel_initializer='he_normal'))

# ------ LAYER 3: output layer ----------

model.add(Dense(units=10,                                    # one output unit per class
                       kernel_initializer='he_normal',
                       kernel_regularizer=keras.regularizers.l2(l2reg),    # L2 penalty on the weight matrix (alternative to try: 0.002)
                       activation='softmax'))                # softmax turns the outputs into class probabilities

# Final step of the functional-API alternative (kept for reference, not executed):
#model = keras.Model(inpMatrix, y_probs)


The **regularizer** aims to reduce the magnitude of parameters: lower value $\rightarrow$ less regularization. You can evaluate the effect of different values.

 <font color='blue'>**Task 1:**</font> What are the input and output dimensions of the different layers?

In [None]:
input_dim_layer1 = -1

In [None]:
output_dim_layer1 = -1

In [None]:
output_dim_layer2 = -1

In [None]:
output_dim_layer3 = -1

In [None]:
#@title Answer
display(Markdown(
   rf"""
   `input_dim_layer1`:  =  {[batch_size,x_train.shape[1],x_train.shape[2]]} <br>
   `output_dim_layer1`  =  {[batch_size,x_train.shape[1]*x_train.shape[2]]} <br>
   `output_dim_layer2`  =  {[batch_size,nhid1]} <br>
   `output_dim_layer3`  =  {[batch_size,10]} <br>
"""))



Now we select optimizer and accuracy as monitoring value and configure the model for training.

In [None]:
# Loss - \log(p(y_i|x_i,w)) for integer class labels.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
# Stochastic gradient descent with learning rate 0.1.
sgd = keras.optimizers.SGD(learning_rate=0.1)
# Accuracy is reported as an additional monitoring value during training and evaluation.
model.compile(loss=loss_fn, optimizer=sgd, metrics=['accuracy'])

for label, value in (("model.loss      = ", model.loss),
                     ("model.optimizer = ", model.optimizer),
                     ("model.metrics   = ", model.metrics)):
    print(label, value)
model.summary()

### Training
We use `model.fit`, which trains the model for a number of epochs.
To fit the model, all we have to do is declare the batch size and number of epochs to train for, then pass in our training data.

You can monitor the GPU activity in the terminal with `watch nvidia-smi`.


In [None]:
%%time
# Training options collected in one place.
fit_options = dict(
    batch_size=batch_size,              # number of examples per gradient computation
    validation_data=(x_test, y_test),   # data on which loss and accuracy are reported after each epoch
    epochs=20,                          # number of passes through the training data
    verbose=2,                          # amount of progress output
)
# x_train holds the scaled input images, y_train the corresponding class indices.
model.fit(x_train, y_train, **fit_options)

### Evaluation & Prediction

`plot_history` function for plotting the history: loss and accuracy for validation

In [None]:
# @title
def plot_history(hist):
    """ plot the loss and accuracy curves of a training run side by side

        hist: dict mapping a metric name to a list with one value per epoch,
              e.g. `model.history.history`.
              'loss' and 'accuracy' are required; 'val_loss' and 'val_accuracy'
              are plotted as well if present (they are missing if the model
              was fitted without validation data).
    """
    fig, ax = plt.subplots(1, 2,figsize=(8,3.3))

    def draw(axis, key, title):
        """ plot the training curve of `key` and, if recorded, its validation curve;
            return all plotted values """
        axis.title.set_text(title)
        axis.plot(hist[key], label='train ' + key)
        values = list(hist[key])
        if 'val_' + key in hist:
            axis.plot(hist['val_' + key], label='validation ' + key)
            values += list(hist['val_' + key])
        axis.legend()
        axis.set_xlabel('epoch')
        return values

    # loss axis starts at 0 and ends at the largest observed loss
    losses = draw(ax[0], 'loss', 'Loss')
    ax[0].set_ylim([0, max(losses)])
    # accuracy axis starts at the smallest observed accuracy and ends at 1.0
    accuracies = draw(ax[1], 'accuracy', 'Accuracy')
    ax[1].set_ylim([min(accuracies), 1.0])

In [None]:
plot_history(model.history.history)


In [None]:
score = model.evaluate(x_test, y_test, verbose=0)
print("vali loss, vali acc ",score)

In [None]:
y_pred = model.predict(x_test[:9])
print_mat(y_pred)
y_test[:9]


 <font color='blue'>**Task 2:**</font>
 Hyperparameter Search. <br>
 Evaluate different versions of the model to find the configuration with highest accuracy. Always reset random number generator, e.g. `reset_seeds(17)`!


 * Change number of layers: e.g. `nlayer = ` 3,4
 * Change number of hidden units, e.g. `nhid =` 20, 100, 400.
 * Change the regularization, e.g. `l2reg =`0, 0.001, 0.005
 * Change the batch size, e.g. `batch_size =` 64, 512, 2048

 Generate a new line for each configuration.

 What is your **best configuration**?

In [None]:
# MNIST dataset: fill the following matrix with one row per evaluated configuration.
# The meaning of the columns is given by `colnames`.
# NOTE(review): the rows with nlayer = 0 and val_loss = val_acc = 0.0 look like empty
# placeholders that are meant to be overwritten with measured results.
colnames = ['nlayer', 'nhid', 'l2reg', 'batch_size', 'val_loss', 'val_acc']
res= [
       [3, 20, 0.0, 64, 0.1436, 0.9552],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
     ]
# Collect the rows in a DataFrame and print it as a plain-text table.
df = pd.DataFrame(res, columns = colnames)
print(df)

<font color='blue'>**Task 3:**</font> Repeat the hyperparameter search for the `fashion` dataset: <br>
`use_MNIST = False`

In [None]:
# Fashion dataset: fill the following matrix with one row per evaluated configuration.
# The meaning of the columns is given by `colnames`.
# NOTE(review): the first row is identical to the first row of the MNIST table above -
# presumably a copied placeholder that has to be replaced by the Fashion result.
colnames = ['nlayer', 'nhid', 'l2reg', 'batch_size', 'val_loss', 'val_acc']
res= [
       [3, 20, 0.0, 64, 0.1436, 0.9552],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
       [0, 20, 0.0, 64, 0.0000, 0.0000],
     ]
# Collect the rows in a DataFrame and print it as a plain-text table.
df = pd.DataFrame(res, columns = colnames)
print(df)

## Advanced topics

### How to Halt Training at the Right Time With Early Stopping
Neural networks are challenging to train.

Too little training and the model is underfit; too much training and the model overfits the training dataset. Both cases result in a model that is less effective than it could be.

One approach to solving this problem is to use **early stopping**. This involves monitoring the loss on the training dataset and a validation dataset (a subset of the training set not used to fit the model). As soon as loss for the validation set starts to show signs of overfitting, the training process can be stopped.

In [None]:
nhid1 = 200
nhid2 = 50
def createModel(nhid1,nhid2,x_train,l2reg=0.0,nclasses=10):
    """ build (but do not compile) a classifier with two tanh hidden layers
        using the Keras functional API

        nhid1: number of units of the first hidden layer
        nhid2: number of units of the second hidden layer
        x_train: training inputs; only x_train.shape[1:] (shape of one example) is used
        l2reg: L2 regularization factor for the weight matrix of every Dense layer
               (default 0.0 = no penalty)
        nclasses: number of units of the softmax output layer (default 10)
        returns: the keras.Model; its summary is printed as a side effect
    """
    #Start defining the input tensor:
    inpMatrix = keras.Input(x_train.shape[1:])    # A shape tuple (integers), not including the batch size.

    v = Flatten()(inpMatrix)  # convert the input matrix of each example to a vector

    #create the layers and pass them the input tensor to get the output tensor:
    hid1 = Dense(units=nhid1,
                    activation='tanh',
                    kernel_regularizer=keras.regularizers.l2(l2reg),
                    kernel_initializer='he_normal')(v)
    hid2 = Dense(units=nhid2,
                    activation='tanh',
                    kernel_regularizer=keras.regularizers.l2(l2reg),
                    kernel_initializer='he_normal')(hid1)
    # output layer: softmax turns the outputs into class probabilities
    py = Dense(units=nclasses,
                       kernel_initializer='he_normal',
                       kernel_regularizer=keras.regularizers.l2(l2reg),
                       activation='softmax')(hid2)

    #define the model's input tensors and output tensors
    model = keras.Model(inpMatrix, py)
    model.summary()
    return model
model2 = createModel(nhid1,nhid2,x_train)
model2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=keras.optimizers.SGD(learning_rate=0.1), #r = learning rate
              metrics=['accuracy'])  # acc or accuracy


Callbacks are executed after each epoch (i.e. pass through the data). They can also be invoked at the start and the end of each batch via
`on_batch_begin` and  `on_batch_end`.
* `EarlyStopping` Stop training when a monitored quantity has stopped improving.
* `History` Callback that records events into a `History` object.
* `LambdaCallback` Callback for creating simple, custom callbacks on-the-fly.
* `LearningRateScheduler` Learning rate scheduler
* `ModelCheckpoint` Save the model after specified epochs.
* `ReduceLROnPlateau` Reduce learning rate when a metric has stopped improving.
* `TensorBoard` Enable visualizations for TensorBoard.

In [None]:
%%time
# configure early stopping on the validation loss
estop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                     patience=3,                 # Stop after this number of epochs with no improvement.
                                     restore_best_weights=True)  # Otherwise the model keeps the weights of the last
                                                                 # epoch, i.e. `patience` epochs after the best one.
# epochs=200 is only an upper bound; the callback ends training earlier.
model2.fit(x_train, y_train,
           validation_data=(x_test, y_test),
           batch_size=100,
           epochs=200,
           verbose=2,
           callbacks=[estop])

In [None]:
# plot history
plot_history(model2.history.history)


### Build your own layer in Keras

You can build your own Layers in Keras as explained [here](https://www.tensorflow.org/tutorials/customization/custom_layers).

The best way to implement your own layer is extending the `tf.keras.layers.Layer` class and implementing:

*    `__init__` , where you can do all input-independent initialization
*    `build`, where you know the shapes of the input tensors and can do the rest of the initialization
*    `call`, where you do the forward computation

Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in build is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified.

The main data structure you'll work with is the `Layer`. A layer encapsulates both a state (the layer's "weights") and a transformation from inputs to outputs (a "call", the layer's forward pass).

The following example is from [here](https://www.tensorflow.org/guide/keras/custom_layers_and_models). It has a state: the variables `w` and `b`.

In [None]:
from keras import layers


class Linear(layers.Layer):
    """ fully connected linear layer computing  inputs * w + b

        State of the layer:
        w: weight matrix of shape (input_dim, units), drawn from the 'random_normal' initializer
        b: bias vector of shape (units,), initialized with zeros
        Both are created with `add_weight`, so Keras registers them as
        trainable weights of the layer.
    """

    def __init__(self, units=32, input_dim=32, **kwargs):
        """ units: number of output features
            input_dim: number of input features (size of the last input dimension)
            kwargs: further arguments of the base Layer, e.g. name
        """
        super().__init__(**kwargs)
        # The variables are created here (not in `build`), therefore the
        # input dimension has to be given explicitly.
        self.w = self.add_weight(shape=(input_dim, units),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(units,),
                                 initializer='zeros',
                                 trainable=True)

    def call(self, inputs):
        """ forward computation for a batch of inputs with input_dim features each """
        return tf.matmul(inputs, self.w) + self.b



In [None]:
x = tf.ones((2, 2))
linear_layer = Linear(4, 2)
y = linear_layer(x)
print(y)


### Using TensorBoard for Measurements and Visualizations
[TensorBoard](https://www.tensorflow.org/tensorboard/get_started) is a tool for providing the measurements and visualizations needed during the machine learning workflow. It enables tracking experiment metrics like loss and accuracy, visualizing the model graph, projecting embeddings to a lower dimensional space, and much more.

In [None]:
# Clear any logs from previous runs
#!rm -rf ./logs/

Define a simple Model

In [None]:
(x_train, y_train),(x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

def create_model():
    """ return an uncompiled Sequential model for 28x28 inputs:
        Flatten -> Dense(512, relu) -> Dropout(0.2) -> Dense(10, softmax)
    """
    layer_stack = [
        Flatten(input_shape=(28, 28)),
        Dense(512, activation='relu'),
        Dropout(0.2),
        Dense(10, activation='softmax'),
    ]
    return tf.keras.models.Sequential(layer_stack)
model = create_model()
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
import datetime
def train_model():
  """ create, compile and fit a fresh model for 5 epochs with TensorBoard logging

      Every call logs into its own directory logs/<YYYYmmdd-HHMMSS>.
      Reads the notebook variables x_train, y_train, x_test, y_test; returns nothing.
  """
  # one log directory per run, named by the start time
  run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  logdir = os.path.join("logs", run_stamp)
  tb_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

  net = create_model()
  net.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
  net.fit(x=x_train,
          y=y_train,
          epochs=5,
          validation_data=(x_test, y_test),
          callbacks=[tb_callback])


In [None]:
train_model()

In [None]:
%tensorboard --logdir logs

Open a command window, activate the python environment and change to the current directory. Start TensorBoard through the command line or within a notebook experience.
```
tensorboard --logdir logs
```
The two interfaces are generally the same. In notebooks, use the `%tensorboard` line magic. On the command line, run the same command without "%".


**Flow Graph generated by Tensorboard**
![Flow Graph generated by Tensorboard](img/flowGraph.png)

**Distribution of parameter values generated by Tensorboard**
![Distribution](img/distribution.png)

**Histogram of parameter values at different epochs generated by Tensorboard**
![Histogram](img/histogram.png)