# Optimization and Deep Learning



## Goal of Optimization


In [None]:
%matplotlib inline
import numpy as np
import tensorflow as tf
from mpl_toolkits import mplot3d
from dl import tensorflow as dl


In [None]:
def f(x):
    """Return x * cos(pi * x), computed elementwise with TensorFlow.

    This is the curve plotted as the risk in this notebook.
    """
    angle = np.pi * x
    return x * tf.cos(angle)

def g(x):
    """Return f(x) plus a cosine ripple, 0.2 * cos(5 * pi * x).

    The ripple has amplitude 0.2 and five times the angular frequency of the
    cosine inside f; this notebook plots the result next to f as the
    empirical risk.
    """
    base = f(x)
    ripple = 0.2 * tf.cos(5 * np.pi * x)
    return base + ripple

In [None]:
def annotate(text, xy, xytext):  #@save
    """Label the point `xy` on the current axes with an arrow.

    `text` is placed at `xytext` and a '->' arrow is drawn from it to `xy`.
    The axes are whatever `dl.plt.gca()` returns at call time.
    """
    axes = dl.plt.gca()
    arrow_style = dict(arrowstyle='->')
    axes.annotate(text, xy=xy, xytext=xytext, arrowprops=arrow_style)

# Plot f and g together for x from 0.5 up to 1.5 in steps of 0.01.
x = tf.range(0.5, 1.5, 0.01)
dl.set_figsize((4.5, 2.5))
curves = [f(x), g(x)]
dl.plot(x, curves, 'x', 'risk')

# Arrow targets: g reaches about -1.2 at x = 1.0, f about -1.05 at x = 1.1.
for label, target, text_pos in (
    ('min of\nempirical risk', (1.0, -1.2), (0.5, -1.1)),
    ('min of risk', (1.1, -1.05), (0.95, -0.5)),
):
    annotate(label, target, text_pos)

In [None]:
# Plot f for x from -1.0 up to 2.0, where the curve shows two distinct dips.
x = tf.range(-1.0, 2.0, 0.01)
dl.plot(x, [f(x)], 'x', 'f(x)')

# Mark the shallow dip near x = -0.3 and the deeper one near x = 1.1.
for label, target, text_pos in (
    ('local minimum', (-0.3, -0.25), (-0.77, -1.0)),
    ('global minimum', (1.1, -0.95), (0.6, 0.8)),
):
    annotate(label, target, text_pos)

In [None]:
# x**3 has zero slope at x = 0 without being a minimum or a maximum there.
x = tf.range(-2.0, 2.0, 0.01)
cubic = x**3
dl.plot(x, [cubic], 'x', 'f(x)')
# The arrow tip sits slightly below the curve at x = 0.
annotate('saddle point', (0, -0.2), (-0.52, -5.0))

In [None]:
# Evaluate z = x^2 - y^2 on a 101 x 101 grid covering [-1, 1] in both axes.
grid = tf.linspace(-1.0, 1.0, 101)
x, y = tf.meshgrid(grid, grid)
z = x**2 - y**2

# Draw the surface as a wireframe, using a stride of 10 along rows and columns.
ax = dl.plt.figure().add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, rstride=10, cstride=10)
# 'rx' marks the origin, the saddle point of this surface.
ax.plot([0], [0], [0], 'rx')

# Use the same three tick positions on every axis.
ticks = [-1, 0, 1]
dl.plt.xticks(ticks)
dl.plt.yticks(ticks)
ax.set_zticks(ticks)
dl.plt.xlabel('x')
# The trailing semicolon keeps the notebook from echoing the return value.
dl.plt.ylabel('y');

In [None]:
# tanh flattens out towards 1 for large x, so its slope is close to zero there.
x = tf.range(-2.0, 5.0, 0.01)
tanh_curve = tf.tanh(x)
dl.plot(x, [tanh_curve], 'x', 'f(x)')
# Point at the flat part of the curve around x = 4.
annotate('vanishing gradient', (4, 1), (2, 0.0))

## Exercises

1. What other challenges involved in deep learning optimization can you think of?
1. Assume that you want to balance a (real) ball on a (real) saddle.
    1. Why is this hard?
    1. Can you also exploit this effect for optimization algorithms?