# Preliminaries

In [51]:
import tensorflow as tf
import numpy as np

In [92]:
config = {
    'learning_rate':0.01,
    'batch_size':256,
    'num_epochs':3
}

# Working with tensors

Note that tensors cannot be changed and variables (which are special types of tensors) can be changed. Therefore, the weights of a neural network are usually stored in some variables.

In [12]:
t = tf.constant([[1,2,3],[4,5,6]])
print(t)
print(f'This {t.shape} tensor is of data type {t.dtype} (but of type {type(t)}) and is stored on {t.device}')
print(f'The second column of this tensor is {t[:,1]}')

tf.Tensor(
[[1 2 3]
 [4 5 6]], shape=(2, 3), dtype=int32)
This (2, 3) tensor is of data type <dtype: 'int32'> (but of type <class 'tensorflow.python.framework.ops.EagerTensor'>) and is stored on /job:localhost/replica:0/task:0/device:CPU:0
The second column of this tensor is [2 5]


Converting to a numpy array

In [8]:
t.numpy()

array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)

# Working with variables

In [14]:
v = tf.Variable([[1,2,3],[4,5,6]])
print(v)
print(f'This {v.shape} variable is of data type {v.dtype} (but of type {type(v)}) and is stored on {v.device}')
print(f'The second column of this variable is {v[:,1]}')

<tf.Variable 'Variable:0' shape=(2, 3) dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)>
This (2, 3) variable is of data type <dtype: 'int32'> (but of type <class 'tensorflow.python.ops.resource_variable_ops.ResourceVariable'>) and is stored on /job:localhost/replica:0/task:0/device:CPU:0
The second column of this variable is [2 5]


The value of a variable can be changed

In [15]:
v.assign([[7,7,7],[8,8,8]])

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=int32, numpy=
array([[7, 7, 7],
       [8, 8, 8]], dtype=int32)>

In [16]:
print(v)

<tf.Variable 'Variable:0' shape=(2, 3) dtype=int32, numpy=
array([[7, 7, 7],
       [8, 8, 8]], dtype=int32)>


Notice that the new value should be compatible with the original shape of the corresponding variable! (Otherwise, you'll get a ValueError)

In [17]:
v.assign([[9,9]])

ValueError: ignored

Another way of changing a variable is to add something to it

In [18]:
v.assign_add([[1,1,1],[1,1,1]])
print(v)

<tf.Variable 'Variable:0' shape=(2, 3) dtype=int32, numpy=
array([[8, 8, 8],
       [9, 9, 9]], dtype=int32)>


In [19]:
v.assign_sub([[1,1,1],[1,1,1]])
print(v)

<tf.Variable 'Variable:0' shape=(2, 3) dtype=int32, numpy=
array([[7, 7, 7],
       [8, 8, 8]], dtype=int32)>


# Gradient Tape

In [27]:
x = tf.constant(2, dtype=tf.float32) # if the data type is int32, TF cannot compute the gradient

In [30]:
with tf.GradientTape(persistent=False) as tape: # we want to compute just one gradient, so persistent=False
    tape.watch(x)
    y = x**2 + 2*x

$\dfrac{\partial y}{\partial x}=\dfrac{\partial (x^2+2x)}{\partial x}=2x+2$<br><br>
$x=2 \implies 2x+2 = 6$

In [31]:
print(tape.gradient(y,x)) # dy/dx

tf.Tensor(6.0, shape=(), dtype=float32)


In [32]:
z = tf.constant(3, dtype=tf.float32)

In [37]:
with tf.GradientTape(persistent=True) as tape:
    tape.watch(x)
    tape.watch(z)
    y = x**2 + 2*x + z**3

Note that $\dfrac{\partial y}{\partial x}$ is the same.<br><br>
We have: $\dfrac{\partial y}{\partial z}=\dfrac{\partial(x^2+2x+z^3)}{\partial z}=3z^2$<br><br>
$z=3 \implies 3z^2 = 3(9)=27$

In [38]:
print(tape.gradient(y,x)) # dy/dx
print(tape.gradient(y,z)) # dy/dz

tf.Tensor(6.0, shape=(), dtype=float32)
tf.Tensor(27.0, shape=(), dtype=float32)


# Define a Model

## Glorot Uniform (Xavier) Initialization

Glorot Uniform: Draws samples from a uniform distribution within `[-limit, limit]`, where `limit = sqrt(6 / (fan_in + fan_out))` (`fan_in` is the number of input units in the weight tensor and `fan_out` is the number of output units). (Source: TF Website)

For example, each weight that connects an input node to a hidden node has `fan_in` of the number of input nodes and `fan_out` of the number of hidden nodes. (Source: [Here](https://jamesmccaffrey.wordpress.com/2017/06/21/neural-network-glorot-initialization/))

```for-each input-hidden weight```<br>
```  variance = 6.0 / (fan-in +fan-out)```<br>
```  stddev = sqrt(variance)```<br>
```  weight = Uniform(-stddev, stddev)```<br>
```end-for```

The goal of Xavier Initialization is to initialize the weights such that the variance of the activations are the same across every layer. This constant variance helps prevent the gradient from exploding or vanishing. (Source: [Here](https://datascience.stackexchange.com/questions/102733/what-is-glorot-uniform))

In [93]:
class MyModel(tf.keras.Model):
    def __init__(self):
        super(MyModel, self).__init__()
        initializer = tf.keras.initializers.GlorotUniform()
        self.w = tf.Variable(initializer(shape=(1,)))
        self.b = tf.Variable(initializer(shape=(1,)))
    def call(self, x: tf.Tensor) -> tf.Tensor:
        y = (self.w)*x + self.b
        return y

In [94]:
model = MyModel()

In [95]:
model(np.zeros((1,)))

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-1.5666912], dtype=float32)>

# Optimization

## Making a Dummy Dataset

In [96]:
features = np.random.rand(2000, 1)
features.shape

(2000, 1)

In [97]:
true_outputs = np.random.rand(2000,1)
true_outputs.shape

(2000, 1)

In [98]:
opt_dataset = tf.data.Dataset.from_tensor_slices((features, true_outputs))

## Training Loop

In [99]:
def optimize_given_a_batch(features, true_outputs, model, loss_fn, optimizer):

    with tf.GradientTape(persistent=False) as tape:
        model_outputs = model(features, training=True)
        loss = loss_fn(true_outputs, model_outputs)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return (model_outputs, loss)

In [100]:
def optimize_on_a_dataset(dataset, model, loss_fn, optimizer, epoch_number):
    num_batches = len(dataset)
    total_loss = 0

    for batch_num, (features, true_outputs) in enumerate(dataset):
        (model_outputs, batch_loss) = optimize_given_a_batch(features, true_outputs, model, loss_fn, optimizer)
        batch_loss = batch_loss.numpy()
        total_loss += batch_loss
        avg_loss = total_loss/((batch_num+1)*config['batch_size'])
        print(f'\r Epoch {epoch_number} - batch loss: {batch_loss} - avg loss: {avg_loss}', end=' ')

In [101]:
%%time
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.optimizers.SGD(config['learning_rate'])
for epoch_number in range(1, config['num_epochs']+1):
  print(f'\rEpoch {epoch_number + 1} - ', end=' ')
  optimize_on_a_dataset(opt_dataset, model, loss_fn, optimizer, epoch_number)

 Epoch 3 - batch loss: 0.1513349413871765 - avg loss: 0.00032449884124537034 CPU times: user 31 s, sys: 927 ms, total: 32 s
Wall time: 41.3 s


## Prediction

In [91]:
model(np.random.rand(5,1))

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[0.50818604],
       [0.5004108 ],
       [0.49030745],
       [0.5047164 ],
       [0.5082203 ]], dtype=float32)>

# Eager Execution vs. Graph Execution

The way the above code runs is the default (through Eager Execution). We usually use Eager Execution for the development phase, and when we are done (e.g., we have optimized our parameters), we use the Graph Execution which is a lot faster. <br>
In this way, Tensorflow makes a computational graph the first time it does the operations and uses this graph instead of the Python code for the next times.<br>
Notice that the computational graph can be used to employ the model on embedded devices (even those not supporting Tensorflow)!<br>
To execute a function using the Graph mode, we need to use a `@tf.function` decorator.


In [106]:
@tf.function
def optimize_given_a_batch(features, true_outputs, model, loss_fn, optimizer):

    with tf.GradientTape(persistent=False) as tape:
        model_outputs = model(features, training=True)
        loss = loss_fn(true_outputs, model_outputs)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return (model_outputs, loss)

In [107]:
model = MyModel()

Notice the difference in the training time!

In [108]:
%%time
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.optimizers.SGD(config['learning_rate'])
for epoch_number in range(1, config['num_epochs']+1):
  print(f'\rEpoch {epoch_number + 1} - ', end=' ')
  optimize_on_a_dataset(opt_dataset, model, loss_fn, optimizer, epoch_number)

 Epoch 3 - batch loss: 0.15135984122753143 - avg loss: 0.00032448865403175334 CPU times: user 9.93 s, sys: 1.06 s, total: 11 s
Wall time: 13.4 s
