# Chapter 9: Up and Running with Tensorflow

## Creating Your First Graph and Running It in a Session

In [0]:
# A computational graph with Tensorflow. Despite the code's appearance,
# this does not perform any computation.

import tensorflow as tf

x = tf.Variable(3, name='x')
y = tf.Variable(4, name='y')
f = (x * x * y) + y + 2

In [5]:
# Running the computational graph in a Tensorflow Session.

sess = tf.Session()
sess.run(x.initializer)
sess.run(y.initializer)
result = sess.run(f)
print(result)
sess.close()

42


In [6]:
# Another way to execute the code above. It automatically closes the Session.

with tf.Session() as sess:
  x.initializer.run() # Equivalent to tf.get_default_session().run(x.initial)
  y.initializer.run()
  result = f.eval()
  print(result)

42


In [9]:
# It is possible to initialize all variables automatically.

init = tf.global_variables_initializer()

with tf.Session() as sess:
  init.run()
  result = f.eval()
  print(result)

42


In [10]:
# InteractiveSession is useful for Jupyter notebooks because it sets itself
# as the default session automatically.

sess = tf.InteractiveSession()
init.run()
result = f.eval()
print(result)
sess.close()

42


## Managing Graphs

In [11]:
# New Variables are always added to the default graph automatically.

x1 = tf.Variable(1)
x1.graph is tf.get_default_graph()

True

In [12]:
# Below is the syntax for adding a variable to a graph that is not
# the default graph.

graph = tf.Graph()
with graph.as_default():
  x2 = tf.Variable(2)
x2.graph is graph

True

In [13]:
x2.graph is tf.get_default_graph()

False

## Lifecycle of a Node Value

In [15]:
# Tensorflow automatically detects the dependency chain between nodes of
# the computation graph.

w = tf.constant(3)
x = w + 2
y = x + 5
z = x * 3

with tf.Session() as sess:
  # detects that y dependes on x, which depends on w. So it evaluates
  # w, then x, then y.
  print(y.eval())
  print(z.eval())

10
15


## Linear Regression with Tensorflow

Tensorflow operates with multidimensional arrays called <i>tensors</i>. The Python API uses NumPy's `ndarray` class to represent tensors. The previous examples used a single scalar value for a tensor. Below is an example of a Tensorflow graph which operates on a 2D array performing linear regression. Recall that the optimal parameters for Linear Regression, $\hat{\theta}$ is given by

$$ \hat{\theta} = \left( \mathbf{X}^T \cdot \mathbf{X} \right)^{-1} \cdot \mathbf{X}^T \cdot \mathbf{y} $$

where $\mathbf{X}$, $\mathbf{y}$ is the training set.

In [29]:
import numpy as np
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

with tf.Session() as sess:
  theta_value = theta.eval()
  print(theta_value)

[[-3.71037292e+01]
 [ 4.36282694e-01]
 [ 9.40542948e-03]
 [-1.06901854e-01]
 [ 6.43611908e-01]
 [-4.06625077e-06]
 [-3.78273334e-03]
 [-4.23094332e-01]
 [-4.36462164e-01]]


## Implementing Gradient Descent

### Manually Computing the Gradients

Below is an implementation of Batch Gradient Descent where we manually compute the gradients.

In [33]:
from sklearn.preprocessing import StandardScaler

n_epochs = 1000
learning_rate = 0.01

scaled_housing_data_plus_bias = \
  StandardScaler().fit_transform(housing_data_plus_bias)

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')
theta = tf.Variable(tf.random_uniform([n+1, 1], -1., 1.), name='theta')
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')
gradients = (2 / m) * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - (learning_rate * gradients))

init = tf.global_variables_initializer()

with tf.Session() as sess:
  sess.run(init)
  for epoch in range(n_epochs):
    if epoch % 100 == 0:
      print('Epoch: {} MSE: {}'.format(epoch, mse.eval()))
    sess.run(training_op)

  best_theta = theta.eval()
  print(best_theta)

Epoch: 0 MSE: 7.857343673706055
Epoch: 100 MSE: 4.889010429382324
Epoch: 200 MSE: 4.844444751739502
Epoch: 300 MSE: 4.832682132720947
Epoch: 400 MSE: 4.824571132659912
Epoch: 500 MSE: 4.818718910217285
Epoch: 600 MSE: 4.814482688903809
Epoch: 700 MSE: 4.811415672302246
Epoch: 800 MSE: 4.809193134307861
Epoch: 900 MSE: 4.807580947875977
[[ 0.16158223]
 [ 0.8162949 ]
 [ 0.13743171]
 [-0.20074166]
 [ 0.23470476]
 [ 0.00246572]
 [-0.04062242]
 [-0.77622837]
 [-0.7434166 ]]


### Using autodiff

Below is an implementation of the same Batch Gradient Descent which uses Tensorflow's `gradients()` function to automatically compute the gradient of the cost function, MSE. This is useful when the function you are computing the gradient of is not a nice, analytic function like Linear Regression.

In [31]:
theta = tf.Variable(tf.random_uniform([n+1, 1], -1., 1.), name='theta')
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')
gradients = tf.gradients(mse, [theta])[0]
training_op = tf.assign(theta, theta - (learning_rate * gradients))

init = tf.global_variables_initializer()

with tf.Session() as sess:
  sess.run(init)
  for epoch in range(n_epochs):
    if epoch % 100 == 0:
      print('Epoch: {} MSE: {}'.format(epoch, mse.eval()))
    sess.run(training_op)

  best_theta = theta.eval()
  print(best_theta)

Epoch: 0 MSE: 6.112096309661865
Epoch: 100 MSE: 5.016570568084717
Epoch: 200 MSE: 4.953281879425049
Epoch: 300 MSE: 4.912069320678711
Epoch: 400 MSE: 4.88232946395874
Epoch: 500 MSE: 4.860785484313965
Epoch: 600 MSE: 4.845162868499756
Epoch: 700 MSE: 4.833826541900635
Epoch: 800 MSE: 4.825592041015625
Epoch: 900 MSE: 4.819604873657227
[[ 0.860028  ]
 [ 0.81243414]
 [ 0.15590912]
 [-0.15753306]
 [ 0.1831805 ]
 [ 0.00923728]
 [-0.04210306]
 [-0.6451549 ]
 [-0.6101564 ]]


Tensorflow computes the gradients using <i>reverse-mode autodiff</i>, which is good for when there are a large number of inputs and a small number of outputs, which is generally the case for neural networks.

### Using an Optimizer

Below is an example of using an out-of-the-box Gradient Descent optimizer for the same Linear Regression task.

In [34]:
theta = tf.Variable(tf.random_uniform([n+1, 1], -1., 1.), name='theta')
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

with tf.Session() as sess:
  sess.run(init)
  for epoch in range(n_epochs):
    if epoch % 100 == 0:
      print('Epoch: {} MSE: {}'.format(epoch, mse.eval()))
    sess.run(training_op)

  best_theta = theta.eval()
  print(best_theta)

Epoch: 0 MSE: 6.642418384552002
Epoch: 100 MSE: 4.987947940826416
Epoch: 200 MSE: 4.93607234954834
Epoch: 300 MSE: 4.908919811248779
Epoch: 400 MSE: 4.887845039367676
Epoch: 500 MSE: 4.871248722076416
Epoch: 600 MSE: 4.8581132888793945
Epoch: 700 MSE: 4.847670078277588
Epoch: 800 MSE: 4.839329242706299
Epoch: 900 MSE: 4.832639217376709
[[ 0.55270267]
 [ 0.96036196]
 [ 0.16497257]
 [-0.4719435 ]
 [ 0.4585294 ]
 [ 0.01061443]
 [-0.04602391]
 [-0.44017678]
 [-0.4240848 ]]


You can also use the `tf.train.MomentumOptimizer` which converges much faster than the Gradient Descent optimizer. It takes an extra hyperparameter, `momentum`.