In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.metrics import mean_absolute_error

In [None]:
# Objective: an elongated quadratic bowl, f(x, y) = 10*x^2 + 0.005*y^2,
# together with its analytic partial derivatives.
#
# To make the gradients stochastic, swap in the noisy variant:
#     def noise_epsilon(): return np.random.randn()/3


def noise_epsilon():
    """Return the perturbation added to every gradient evaluation (none here)."""
    return 0


def ff(x, y):
    """Evaluate the objective at (x, y)."""
    return 10 * x**2 + 0.005 * y**2


def df_dx(x, y):
    """Partial derivative of the objective with respect to x, plus noise."""
    return 20*x + noise_epsilon()


def df_dy(x, y):
    """Partial derivative of the objective with respect to y, plus noise."""
    return 0.01*y + noise_epsilon()


# Starting point and hyper-parameters read by the optimizer runs.
iterations, learning_rate = 300, 0.0333
x_start, y_start = -5.0, -50

In [None]:
# Objective: a saddle surface, f(x, y) = x^2 - y^2, together with its
# analytic partial derivatives. Running this cell rebinds ff, df_dx, df_dy
# and the run settings, replacing whatever values they held before.
#
# To make the gradients stochastic, swap in the noisy variant:
#     def noise_epsilon(): return np.random.randn()/3


def noise_epsilon():
    """Return the perturbation added to every gradient evaluation (none here)."""
    return 0


def ff(x, y):
    """Evaluate the objective at (x, y)."""
    return x**2  -  y**2


def df_dx(x, y):
    """Partial derivative of the objective with respect to x, plus noise."""
    return 2*x + noise_epsilon()


def df_dy(x, y):
    """Partial derivative of the objective with respect to y, plus noise."""
    return -2*y + noise_epsilon()


# Starting point and hyper-parameters read by the optimizer runs.
# y starts just off the y = 0 axis.
iterations, learning_rate = 300, 0.0333
x_start, y_start = -5.0, 0.0002

In [None]:
# Render the objective surface over the square [-10, 10) x [-10, 10).
fig = plt.figure(figsize=[20,12])
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot is the supported way to request a 3D projection.
ax = fig.add_subplot(111, projection='3d')
x = np.arange(-10.0, 10, 0.1)
y = np.arange(-10, 10, 0.1)
X, Y = np.meshgrid(x, y)
zs = np.array([ff(xi, yi) for xi, yi in zip(np.ravel(X), np.ravel(Y))])
Z = zs.reshape(X.shape)
# With meshgrid's default 'xy' indexing, axis 0 of Z runs along y and axis 1
# along x; np.gradient returns one array per axis in that order.
Gy, Gx = np.gradient(Z)
G = (Gx**2.0+Gy**2.0)**.5  # gradient magnitude (per grid step)
N = G/G.max()  # normalize 0..1

ax.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm, antialiased=False, shade=False,
                rstride=5, cstride=1, linewidth=0)
ax.view_init(elev=30., azim=70)



# Stochastic Gradient Descent

In [None]:
class SGD():
    """Plain gradient descent on the two parameters (x, y).

    Gradients are obtained from the notebook-level callables ``df_dx`` and
    ``df_dy``.
    """

    def __init__(self, lr=0.001, x=None, y=None):
        """Store the default learning rate and the starting point.

        If either ``x`` or ``y`` is omitted, both coordinates are drawn
        uniformly from [-5, 5).
        """
        self.lr = lr
        # Identity test: `x == None` compares element-wise for NumPy inputs
        # and then fails when used as a condition.
        if x is None or y is None:
            self.x = np.random.rand()*10-5
            self.y = np.random.rand()*10-5
        else:
            self.x = x
            self.y = y

    def step(self, lr=None):
        """Take one descent step and return ``[x, y, dx, dy]``.

        ``lr`` overrides the stored learning rate for this step only; an
        explicit ``0`` is honoured (only ``None`` selects the default).
        ``dx`` and ``dy`` are the gradients evaluated before the move.
        """
        if lr is None:
            lr = self.lr
        dx = df_dx(self.x, self.y)
        dy = df_dy(self.x, self.y)
        self.x = self.x - lr*dx
        self.y = self.y - lr*dy
        return [self.x, self.y, dx, dy]
        

In [None]:
# Run plain gradient descent from the shared starting point, recording the
# position, gradient and objective value after every step.
np.random.seed(655324)  # seed NumPy's global RNG so the run is repeatable
sns.set_context("talk")

opt = SGD(x=x_start, y=y_start)
errors = [ff(x_start, y_start)]
xs, ys, dxs, dys = [x_start], [y_start], [], []
for epochs in range(iterations):
    x, y, dx, dy = opt.step(lr=learning_rate)
    xs.append(x)
    ys.append(y)
    dxs.append(dx)
    dys.append(dy)
    errors.append(ff(x, y))

plt.figure(figsize=[18,6])
plt.plot(errors)
# errors also contains the starting value, so the step count is len(errors) - 1.
plt.title("Error evolution over time. Minimum error obtained in {0} iterations: {1}".format(len(errors) - 1, min(errors)))
plt.xlabel("time (iterations)")
plt.ylabel("error")
plt.show()

# Keep this run's history under optimizer-specific names for later comparison.
errors_sgd = errors
xs_sgd = xs
ys_sgd = ys

### Spatial evolution

In [None]:
# Parameter trajectories: x and y per iteration, then the path in the (x, y) plane.
spatial_fig, (ax_x, ax_y, ax_xy) = plt.subplots(1, 3, figsize=[18,6])
ax_x.plot(xs)
ax_x.set(title="X parameter evolution", xlabel="iterations", ylabel="x")
ax_y.plot(ys)
ax_y.set(title="Y parameter evolution", xlabel="iterations", ylabel="y")
ax_xy.plot(xs, ys)
ax_xy.set(title="x/y evolution", xlabel="x", ylabel="y")
spatial_fig.tight_layout()  # keep the added y labels from colliding with neighbours
plt.show()

### Dynamic evolution

In [None]:
# Per-step displacement of each parameter (difference between consecutive
# positions), shown for the first 100 steps.
vel_xs=np.diff(xs)
vel_ys=np.diff(ys)
plt.figure(figsize=[18,6])
plt.subplot(131)
plt.plot(np.abs(vel_xs[0:100]))
plt.title("X parameter velocity (momentum; $v_x$)")
plt.xlabel("iterations")
plt.subplot(132)
plt.plot(np.abs(vel_ys[0:100]))
plt.title("Y parameter velocity (momentum; $v_y$)")
plt.xlabel("iterations")
plt.subplot(133)
plt.plot(np.sqrt(np.array(vel_xs[0:100])**2 + np.array(vel_ys[0:100])**2))
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.title(r"Absolute velocity ($\sqrt{v_x^2 + v_y^2}$)")
plt.xlabel("iterations")  # the horizontal axis is the step index, not x
plt.show()

# Stochastic Gradient Descent with Momentum

In [None]:
class SGD_momentum():
    """Gradient descent with classical momentum on the two parameters (x, y).

    Gradients are obtained from the notebook-level callables ``df_dx`` and
    ``df_dy``. The velocity starts at zero.
    """

    def __init__(self, lr=0.001, beta=0.9, x=None, y=None):
        """Store the default learning rate, momentum factor and starting point.

        If either ``x`` or ``y`` is omitted, both coordinates are drawn
        uniformly from [-5, 5).
        """
        self.lr = lr
        # Identity test: `x == None` compares element-wise for NumPy inputs
        # and then fails when used as a condition.
        if x is None or y is None:
            self.x = np.random.rand()*10-5
            self.y = np.random.rand()*10-5
        else:
            self.x = x
            self.y = y
        self.beta = beta
        self.vx = 0
        self.vy = 0

    def step(self, lr=None, beta=None):
        """Take one momentum step and return ``[x, y, dx, dy, vx, vy]``.

        ``lr`` and ``beta`` override the stored values for this step only.
        ``dx`` and ``dy`` are the gradients evaluated before the move;
        ``vx`` and ``vy`` are the updated velocities that were subtracted
        from the position.
        """
        if lr is None:
            lr = self.lr
        if beta is None:
            beta = self.beta
        dx = df_dx(self.x, self.y)
        dy = df_dy(self.x, self.y)

        # Velocity is a decaying sum of past learning-rate-scaled gradients.
        self.vx = beta * self.vx + lr * dx
        self.vy = beta * self.vy + lr * dy
        self.x += - self.vx
        self.y += - self.vy

        return [self.x, self.y, dx, dy, self.vx, self.vy]



In [None]:
# Run gradient descent with momentum from the shared starting point, recording
# the position, gradient, velocity and objective value after every step.
np.random.seed(655324)  # seed NumPy's global RNG so the run is repeatable
xs, ys, vel_xs, vel_ys, dxs, dys = [x_start], [y_start], [], [], [], []

opt = SGD_momentum(x=x_start, y=y_start)
errors = [ff(x_start, y_start)]
for epochs in range(iterations):
    x, y, dx, dy, vel_x, vel_y = opt.step(lr=learning_rate, beta=.9)
    vel_xs.append(vel_x)
    vel_ys.append(vel_y)
    xs.append(x)
    ys.append(y)
    dxs.append(dx)
    dys.append(dy)
    errors.append(ff(x, y))

plt.figure(figsize=[18,6])
plt.plot(errors)
# errors also contains the starting value, so the step count is len(errors) - 1.
plt.title("Error evolution over time. Minimum error obtained in {0} iterations: {1}".format(len(errors) - 1, min(errors)))
plt.xlabel("time (iterations)")
plt.ylabel("error")
plt.show()

# Keep this run's history under optimizer-specific names for later comparison.
errors_momentum = errors
xs_momentum = xs
ys_momentum = ys

### Spatial evolution 

In [None]:
# Parameter trajectories over the first 100 recorded positions: x and y per
# iteration, then the path in the (x, y) plane.
spatial_fig, (ax_x, ax_y, ax_xy) = plt.subplots(1, 3, figsize=[18,6])
ax_x.plot(xs[0:100])
ax_x.set(title="X parameter evolution", xlabel="iterations", ylabel="x")
ax_y.plot(ys[0:100])
ax_y.set(title="Y parameter evolution", xlabel="iterations", ylabel="y")
ax_xy.plot(xs[0:100], ys[0:100])
ax_xy.set(title="x/y evolution", xlabel="x", ylabel="y")
spatial_fig.tight_layout()  # keep the added y labels from colliding with neighbours
plt.show()

### Dynamic evolution 

In [None]:
# Magnitude of the recorded velocities for the first 100 steps.
plt.figure(figsize=[18,6])
plt.subplot(131)
plt.plot(np.abs(vel_xs[0:100]))
plt.title("X parameter velocity ($v_x$)")
plt.xlabel("iterations")
plt.subplot(132)
plt.plot(np.abs(vel_ys[0:100]))
plt.title("Y parameter velocity ($v_y$)")
plt.xlabel("iterations")
plt.subplot(133)
plt.plot(np.sqrt(np.array(vel_xs[0:100])**2 + np.array(vel_ys[0:100])**2))
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.title(r"Absolute velocity ($\sqrt{v_x^2 + v_y^2}$)")
plt.xlabel("iterations")  # the horizontal axis is the step index, not x
plt.show()

# Stochastic Gradient Descent with Nesterov Momentum

In [None]:
class SGD_nestrov_momentum():
    """Gradient descent with Nesterov momentum on the two parameters (x, y).

    Gradients are obtained from the notebook-level callables ``df_dx`` and
    ``df_dy``. The velocity is unset (``None``) until the first step.
    """

    def __init__(self, lr=0.001, beta=0.9, x=None, y=None):
        """Store the default learning rate, momentum factor and starting point.

        If either ``x`` or ``y`` is omitted, both coordinates are drawn
        uniformly from [-5, 5).
        """
        self.lr = lr
        # Identity test: `x == None` compares element-wise for NumPy inputs
        # and then fails when used as a condition.
        if x is None or y is None:
            self.x = np.random.rand()*10-5
            self.y = np.random.rand()*10-5
        else:
            self.x = x
            self.y = y
        self.beta = beta
        self.vx = None
        self.vy = None

    def step(self, lr=None, beta=None):
        """Take one Nesterov step and return ``[x, y, dx, dy, vx, vy]``.

        ``lr`` and ``beta`` override the stored values for this step only.
        ``dx`` and ``dy`` are the gradients at the position before the move
        (not at the look-ahead point); ``vx`` and ``vy`` are the updated
        velocities that were subtracted from the position.
        """
        if lr is None:
            lr = self.lr
        if beta is None:
            beta = self.beta
        dx = df_dx(self.x, self.y)
        dy = df_dy(self.x, self.y)

        if self.vx is None or self.vy is None:
            # First step: no velocity yet, so this is a plain gradient step.
            self.vx = lr * dx
            self.vy = lr * dy
        else:
            # Evaluate the gradient where the decayed velocity alone would
            # carry the parameters: theta - beta * v.
            look_x = self.x - beta * self.vx
            look_y = self.y - beta * self.vy
            dx_ahead = df_dx(look_x, look_y)
            dy_ahead = df_dy(look_x, look_y)
            self.vx = beta * self.vx + lr * dx_ahead
            self.vy = beta * self.vy + lr * dy_ahead
        self.x += - self.vx
        self.y += - self.vy

        return [self.x, self.y, dx, dy, self.vx, self.vy]


In [None]:
# Run gradient descent with Nesterov momentum from the shared starting point,
# recording the position, gradient, velocity and objective value per step.
np.random.seed(655324)  # seed NumPy's global RNG so the run is repeatable
xs, ys, vel_xs, vel_ys, dxs, dys = [x_start], [y_start], [], [], [], []

opt = SGD_nestrov_momentum(x=x_start, y=y_start)
errors = [ff(x_start, y_start)]
for epochs in range(iterations):
    x, y, dx, dy, vel_x, vel_y = opt.step(lr=learning_rate, beta=.9)
    vel_xs.append(vel_x)
    vel_ys.append(vel_y)
    xs.append(x)
    ys.append(y)
    dxs.append(dx)
    dys.append(dy)
    errors.append(ff(x, y))

plt.figure(figsize=[18,6])
plt.plot(errors)
# errors also contains the starting value, so the step count is len(errors) - 1.
plt.title("Error evolution over time. Minimum error obtained in {0} iterations: {1}".format(len(errors) - 1, min(errors)))
plt.xlabel("time (iterations)")
plt.ylabel("error")
plt.show()

# Keep this run's history under optimizer-specific names for later comparison.
errors_nestrov_momentum = errors
xs_nestrov_momentum = xs
ys_nestrov_momentum = ys

### Spatial evolution

In [None]:
# Parameter trajectories over the first 100 recorded positions: x and y per
# iteration, then the path in the (x, y) plane.
spatial_fig, (ax_x, ax_y, ax_xy) = plt.subplots(1, 3, figsize=[18,6])
ax_x.plot(xs[0:100])
ax_x.set(title="X parameter evolution", xlabel="iterations", ylabel="x")
ax_y.plot(ys[0:100])
ax_y.set(title="Y parameter evolution", xlabel="iterations", ylabel="y")
ax_xy.plot(xs[0:100], ys[0:100])
ax_xy.set(title="x/y evolution", xlabel="x", ylabel="y")
spatial_fig.tight_layout()  # keep the added y labels from colliding with neighbours
plt.show()

### Dynamic evolution

In [None]:
# Magnitude of the recorded velocities for the first 100 steps.
plt.figure(figsize=[18,6])
plt.subplot(131)
plt.plot(np.abs(vel_xs[0:100]))
plt.title("X parameter velocity ($v_x$)")
plt.xlabel("iterations")
plt.subplot(132)
plt.plot(np.abs(vel_ys[0:100]))
plt.title("Y parameter velocity ($v_y$)")
plt.xlabel("iterations")
plt.subplot(133)
plt.plot(np.sqrt(np.array(vel_xs[0:100])**2 + np.array(vel_ys[0:100])**2))
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.title(r"Absolute velocity ($\sqrt{v_x^2 + v_y^2}$)")
plt.xlabel("iterations")  # the horizontal axis is the step index, not x
plt.show()

# AdaGrad

In [None]:
class AdaGrad():
    """AdaGrad on the two parameters (x, y).

    Each parameter's step is divided by the square root of the running sum of
    its squared gradients. Gradients are obtained from the notebook-level
    callables ``df_dx`` and ``df_dy``.
    """

    def __init__(self, lr=0.001, x=None, y=None):
        """Store the default learning rate and the starting point.

        If either ``x`` or ``y`` is omitted, both coordinates are drawn
        uniformly from [-5, 5).
        """
        self.lr = lr
        # Identity test: `x == None` compares element-wise for NumPy inputs
        # and then fails when used as a condition.
        if x is None or y is None:
            self.x = np.random.rand()*10-5
            self.y = np.random.rand()*10-5
        else:
            self.x = x
            self.y = y
        self.sumsq_dx = 0
        self.sumsq_dy = 0

    def step(self, lr=None):
        """Take one AdaGrad step and return ``[x, y, dx, dy]``.

        ``lr`` overrides the stored learning rate for this step only; an
        explicit ``0`` is honoured (only ``None`` selects the default).
        ``dx`` and ``dy`` are the gradients evaluated before the move.
        """
        epsilon = 1e-8  # guards against division by zero
        if lr is None:
            lr = self.lr
        dx = df_dx(self.x, self.y)
        dy = df_dy(self.x, self.y)
        self.sumsq_dx += dx**2
        self.sumsq_dy += dy**2
        # Both parameters use the same denominator form, epsilon + sqrt(sum),
        # so identical gradient histories give identical updates.
        self.x = self.x - (lr/(epsilon + np.sqrt(self.sumsq_dx)))*dx
        self.y = self.y - (lr/(epsilon + np.sqrt(self.sumsq_dy)))*dy

        return [self.x, self.y, dx, dy]
        

In [None]:
# Run AdaGrad from the shared starting point, recording the position,
# gradient, applied displacement and objective value after every step.
np.random.seed(655324)  # seed NumPy's global RNG so the run is repeatable
xs, ys, vel_xs, vel_ys, dxs, dys = [x_start], [y_start], [], [], [], []

opt = AdaGrad(x=x_start, y=y_start)
errors = [ff(x_start, y_start)]
for epochs in range(iterations):
    x, y, dx, dy = opt.step(lr=learning_rate)
    # AdaGrad.step returns no velocity, so record the displacement actually
    # applied in this step (previous position minus new position) instead of
    # reading names this cell never assigns.
    vel_xs.append(xs[-1] - x)
    vel_ys.append(ys[-1] - y)
    xs.append(x)
    ys.append(y)
    dxs.append(dx)
    dys.append(dy)
    errors.append(ff(x, y))

plt.figure(figsize=[18,6])
plt.plot(errors)
# errors also contains the starting value, so the step count is len(errors) - 1.
plt.title("Error evolution over time. Minimum error obtained in {0} iterations: {1}".format(len(errors) - 1, min(errors)))
plt.xlabel("time (iterations)")
plt.ylabel("error")
plt.show()

# Keep this run's history under optimizer-specific names for later comparison.
errors_adagrad = errors
xs_adagrad = xs
ys_adagrad = ys

### Spatial evolution

In [None]:
# Parameter trajectories over the first 100 recorded positions: x and y per
# iteration, then the path in the (x, y) plane.
spatial_fig, (ax_x, ax_y, ax_xy) = plt.subplots(1, 3, figsize=[18,6])
ax_x.plot(xs[0:100])
ax_x.set(title="X parameter evolution", xlabel="iterations", ylabel="x")
ax_y.plot(ys[0:100])
ax_y.set(title="Y parameter evolution", xlabel="iterations", ylabel="y")
ax_xy.plot(xs[0:100], ys[0:100])
ax_xy.set(title="x/y evolution", xlabel="x", ylabel="y")
spatial_fig.tight_layout()  # keep the added y labels from colliding with neighbours
plt.show()

### Dynamic evolution

In [None]:
# Per-step displacement of each parameter (difference between consecutive
# positions), shown for the first 100 steps.
vel_xs=np.diff(xs)
vel_ys=np.diff(ys)
plt.figure(figsize=[18,6])
plt.subplot(131)
plt.plot(np.abs(vel_xs[0:100]))
plt.title("X parameter velocity (momentum; $v_x$)")
plt.xlabel("iterations")
plt.subplot(132)
plt.plot(np.abs(vel_ys[0:100]))
plt.title("Y parameter velocity (momentum; $v_y$)")
plt.xlabel("iterations")
plt.subplot(133)
plt.plot(np.sqrt(np.array(vel_xs[0:100])**2 + np.array(vel_ys[0:100])**2))
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.title(r"Absolute velocity ($\sqrt{v_x^2 + v_y^2}$)")
plt.xlabel("iterations")  # the horizontal axis is the step index, not x
plt.show()

# RMSprop

In [None]:
class RMSProp():
    """RMSProp on the two parameters (x, y).

    Each parameter's step is divided by the square root of an exponentially
    decaying average of its squared gradients. Gradients are obtained from the
    notebook-level callables ``df_dx`` and ``df_dy``.
    """

    def __init__(self, lr=0.001, decay_rate=0.9, x=None, y=None):
        """Store the default learning rate, decay rate and starting point.

        If either ``x`` or ``y`` is omitted, both coordinates are drawn
        uniformly from [-5, 5).
        """
        self.lr = lr
        self.decay_rate = decay_rate
        # Identity test: `x == None` compares element-wise for NumPy inputs
        # and then fails when used as a condition.
        if x is None or y is None:
            self.x = np.random.rand()*10-5
            self.y = np.random.rand()*10-5
        else:
            self.x = x
            self.y = y
        self.decay_x = 0
        self.decay_y = 0

    def step(self, lr=None, decay_rate=None):
        """Take one RMSProp step and return ``[x, y, dx, dy]``.

        ``lr`` and ``decay_rate`` override the stored values for this step
        only; explicit zeros are honoured (only ``None`` selects the default).
        ``dx`` and ``dy`` are the gradients evaluated before the move.
        """
        epsilon = 1e-8  # guards against division by zero
        if lr is None:
            lr = self.lr
        if decay_rate is None:
            decay_rate = self.decay_rate
        dx = df_dx(self.x, self.y)
        dy = df_dy(self.x, self.y)
        # Exponentially decaying averages of the squared gradients.
        self.decay_x = decay_rate * (self.decay_x) + (1-decay_rate)*dx**2
        self.decay_y = decay_rate * (self.decay_y) + (1-decay_rate)*dy**2
        self.x = self.x - (lr/(epsilon + np.sqrt(self.decay_x)))*dx
        self.y = self.y - (lr/(epsilon + np.sqrt(self.decay_y)))*dy

        return [self.x, self.y, dx, dy]
        

In [None]:
# Run RMSProp from the shared starting point, recording the position,
# gradient, applied displacement and objective value after every step.
np.random.seed(655324)  # seed NumPy's global RNG so the run is repeatable
xs, ys, vel_xs, vel_ys, dxs, dys = [x_start], [y_start], [], [], [], []

opt = RMSProp(x=x_start, y=y_start)
errors = [ff(x_start, y_start)]
for epochs in range(iterations):
    x, y, dx, dy = opt.step(lr=learning_rate, decay_rate=0.99)
    # RMSProp.step returns no velocity, so record the displacement actually
    # applied in this step (previous position minus new position) instead of
    # reading names this cell never assigns.
    vel_xs.append(xs[-1] - x)
    vel_ys.append(ys[-1] - y)
    xs.append(x)
    ys.append(y)
    dxs.append(dx)
    dys.append(dy)
    errors.append(ff(x, y))

plt.figure(figsize=[18,6])
plt.plot(errors)
# errors also contains the starting value, so the step count is len(errors) - 1.
plt.title("Error evolution over time. Minimum error obtained in {0} iterations: {1}".format(len(errors) - 1, min(errors)))
plt.xlabel("time (iterations)")
plt.ylabel("error")
plt.show()

# Keep this run's history under optimizer-specific names for later comparison.
errors_rmsprop = errors
xs_rmsprop = xs
ys_rmsprop = ys

### Spatial evolution

In [None]:
# Parameter trajectories over the first 100 recorded positions: x and y per
# iteration, then the path in the (x, y) plane.
spatial_fig, (ax_x, ax_y, ax_xy) = plt.subplots(1, 3, figsize=[18,6])
ax_x.plot(xs[0:100])
ax_x.set(title="X parameter evolution", xlabel="iterations", ylabel="x")
ax_y.plot(ys[0:100])
ax_y.set(title="Y parameter evolution", xlabel="iterations", ylabel="y")
ax_xy.plot(xs[0:100], ys[0:100])
ax_xy.set(title="x/y evolution", xlabel="x", ylabel="y")
spatial_fig.tight_layout()  # keep the added y labels from colliding with neighbours
plt.show()

### Dynamic evolution

In [None]:
# Per-step displacement of each parameter (difference between consecutive
# positions), shown for the first 100 steps.
vel_xs=np.diff(xs)
vel_ys=np.diff(ys)
plt.figure(figsize=[18,6])
plt.subplot(131)
plt.plot(np.abs(vel_xs[0:100]))
plt.title("X parameter velocity (momentum; $v_x$)")
plt.xlabel("iterations")
plt.subplot(132)
plt.plot(np.abs(vel_ys[0:100]))
plt.title("Y parameter velocity (momentum; $v_y$)")
plt.xlabel("iterations")
plt.subplot(133)
plt.plot(np.sqrt(np.array(vel_xs[0:100])**2 + np.array(vel_ys[0:100])**2))
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.title(r"Absolute velocity ($\sqrt{v_x^2 + v_y^2}$)")
plt.xlabel("iterations")  # the horizontal axis is the step index, not x
plt.show()

# AdaDelta

In [None]:
class AdaDelta():
    """AdaDelta-style optimizer on the two parameters (x, y).

    Keeps decaying averages of the squared gradients and of the squared
    updates, scales each gradient by the ratio of their square roots, and then
    applies the learning rate. Gradients are obtained from the notebook-level
    callables ``df_dx`` and ``df_dy``.
    """

    def __init__(self, lr=0.001, decay_rate=0.9, x=None, y=None):
        """Store the default learning rate, decay rate and starting point.

        If either ``x`` or ``y`` is omitted, both coordinates are drawn
        uniformly from [-5, 5).
        """
        self.lr = lr
        self.decay_rate = decay_rate
        # Identity test: `x == None` compares element-wise for NumPy inputs
        # and then fails when used as a condition.
        if x is None or y is None:
            self.x = np.random.rand()*10-5
            self.y = np.random.rand()*10-5
        else:
            self.x = x
            self.y = y
        # Decaying averages of squared gradients ...
        self.decay_x = 0
        self.decay_y = 0
        # ... and of squared updates, which start at 1 rather than 0.
        self.decay_dx = 1
        self.decay_dy = 1

    def step(self, lr=None, decay_rate=None):
        """Take one step and return ``[x, y, dx, dy]``.

        ``lr`` and ``decay_rate`` override the stored values for this step
        only; explicit zeros are honoured (only ``None`` selects the default).
        ``dx`` and ``dy`` are the gradients evaluated before the move.
        """
        epsilon = 1e-8  # guards against division by zero
        if lr is None:
            lr = self.lr
        if decay_rate is None:
            decay_rate = self.decay_rate
        dx = df_dx(self.x, self.y)
        dy = df_dy(self.x, self.y)
        # Refresh the squared-gradient averages first; the update uses them.
        self.decay_x = decay_rate * (self.decay_x) + (1-decay_rate)*dx**2
        self.decay_y = decay_rate * (self.decay_y) + (1-decay_rate)*dy**2

        # The numerator uses the squared-update averages from the previous step.
        update_x = dx*((epsilon + np.sqrt(self.decay_dx))/(epsilon + np.sqrt(self.decay_x)))
        update_y = dy*((epsilon + np.sqrt(self.decay_dy))/(epsilon + np.sqrt(self.decay_y)))

        self.x = self.x - (lr*update_x)
        self.y = self.y - (lr*update_y)

        # Fold this step's (unscaled) update into the squared-update averages.
        self.decay_dx = decay_rate * (self.decay_dx) + (1-decay_rate)*update_x**2
        self.decay_dy = decay_rate * (self.decay_dy) + (1-decay_rate)*update_y**2

        return [self.x, self.y, dx, dy]
        
        

In [None]:
# Run AdaDelta from the shared starting point, recording the position,
# gradient, applied displacement and objective value after every step.
np.random.seed(655324)  # seed NumPy's global RNG so the run is repeatable
xs, ys, vel_xs, vel_ys, dxs, dys = [x_start], [y_start], [], [], [], []

opt = AdaDelta(x=x_start, y=y_start)
errors = [ff(x_start, y_start)]
for epochs in range(iterations):
    x, y, dx, dy = opt.step(lr=learning_rate, decay_rate=0.99)
    # AdaDelta.step returns no velocity, so record the displacement actually
    # applied in this step (previous position minus new position) instead of
    # reading names this cell never assigns.
    vel_xs.append(xs[-1] - x)
    vel_ys.append(ys[-1] - y)
    xs.append(x)
    ys.append(y)
    dxs.append(dx)
    dys.append(dy)
    errors.append(ff(x, y))

plt.figure(figsize=[18,6])
plt.plot(errors)
# errors also contains the starting value, so the step count is len(errors) - 1.
plt.title("Error evolution over time. Minimum error obtained in {0} iterations: {1}".format(len(errors) - 1, min(errors)))
plt.xlabel("time (iterations)")
plt.ylabel("error")
plt.show()

# Keep this run's history under optimizer-specific names for later comparison.
errors_adadelta = errors
xs_adadelta = xs
ys_adadelta = ys

### Spatial evolution

In [None]:
# Parameter trajectories over the first 100 recorded positions: x and y per
# iteration, then the path in the (x, y) plane.
spatial_fig, (ax_x, ax_y, ax_xy) = plt.subplots(1, 3, figsize=[18,6])
ax_x.plot(xs[0:100])
ax_x.set(title="X parameter evolution", xlabel="iterations", ylabel="x")
ax_y.plot(ys[0:100])
ax_y.set(title="Y parameter evolution", xlabel="iterations", ylabel="y")
ax_xy.plot(xs[0:100], ys[0:100])
ax_xy.set(title="x/y evolution", xlabel="x", ylabel="y")
spatial_fig.tight_layout()  # keep the added y labels from colliding with neighbours
plt.show()

### Dynamic evolution

In [None]:
# Per-step displacement of each parameter (difference between consecutive
# positions), shown for the first 100 steps.
vel_xs=np.diff(xs)
vel_ys=np.diff(ys)
plt.figure(figsize=[18,6])
plt.subplot(131)
plt.plot(np.abs(vel_xs[0:100]))
plt.title("X parameter velocity (momentum; $v_x$)")
plt.xlabel("iterations")
plt.subplot(132)
plt.plot(np.abs(vel_ys[0:100]))
plt.title("Y parameter velocity (momentum; $v_y$)")
plt.xlabel("iterations")
plt.subplot(133)
plt.plot(np.sqrt(np.array(vel_xs[0:100])**2 + np.array(vel_ys[0:100])**2))
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.title(r"Absolute velocity ($\sqrt{v_x^2 + v_y^2}$)")
plt.xlabel("iterations")  # the horizontal axis is the step index, not x
plt.show()

# ADAM

In [None]:
class ADAM():
    """Adam on the two parameters (x, y).

    Keeps bias-corrected decaying averages of the gradient (first moment) and
    of the squared gradient (second moment). Gradients are obtained from the
    notebook-level callables ``df_dx`` and ``df_dy``.
    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, x=None, y=None):
        """Store the default learning rate, the two decay rates and the start.

        If either ``x`` or ``y`` is omitted, both coordinates are drawn
        uniformly from [-5, 5).
        """
        self.lr = lr
        # Identity test: `x == None` compares element-wise for NumPy inputs
        # and then fails when used as a condition.
        if x is None or y is None:
            self.x = np.random.rand()*10-5
            self.y = np.random.rand()*10-5
        else:
            self.x = x
            self.y = y
        # First-moment (m) and second-moment (v) estimates per parameter.
        self.m_x = 0
        self.m_y = 0
        self.v_x = 0
        self.v_y = 0
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.t = 0  # number of steps taken, used for bias correction

    def step(self, lr=None):
        """Take one Adam step and return ``[x, y, dx, dy]``.

        ``lr`` overrides the stored learning rate for this step only; an
        explicit ``0`` is honoured (only ``None`` selects the default).
        ``dx`` and ``dy`` are the gradients evaluated before the move.
        """
        self.t += 1
        epsilon = 1e-8  # guards against division by zero
        if lr is None:
            lr = self.lr
        dx = df_dx(self.x, self.y)
        dy = df_dy(self.x, self.y)
        self.m_x = self.beta_1*self.m_x + (1-self.beta_1)*dx
        self.m_y = self.beta_1*self.m_y + (1-self.beta_1)*dy
        self.v_x = self.beta_2*self.v_x + (1-self.beta_2)*dx**2
        self.v_y = self.beta_2*self.v_y + (1-self.beta_2)*dy**2
        # Bias correction: both averages start at zero.
        m_x_hat = self.m_x/(1-self.beta_1**self.t)
        m_y_hat = self.m_y/(1-self.beta_1**self.t)
        v_x_hat = self.v_x/(1-self.beta_2**self.t)
        v_y_hat = self.v_y/(1-self.beta_2**self.t)

        # Only the step is divided by sqrt(v_hat) + epsilon; the parameter
        # itself must not be rescaled.
        self.x = self.x - lr*m_x_hat/(np.sqrt(v_x_hat)+epsilon)
        self.y = self.y - lr*m_y_hat/(np.sqrt(v_y_hat)+epsilon)
        return [self.x, self.y, dx, dy]
        

In [None]:
# Run ADAM from the shared starting point, recording the position, gradient
# and objective value after every step.
np.random.seed(655324)  # seed NumPy's global RNG so the run is repeatable
sns.set_context("talk")

opt = ADAM(x=x_start, y=y_start)
errors = [ff(x_start, y_start)]
xs, ys, dxs, dys = [x_start], [y_start], [], []
for epochs in range(iterations):
    x, y, dx, dy = opt.step(lr=learning_rate)
    xs.append(x)
    ys.append(y)
    dxs.append(dx)
    dys.append(dy)
    errors.append(ff(x, y))

plt.figure(figsize=[18,6])
plt.plot(errors)
# errors also contains the starting value, so the step count is len(errors) - 1.
plt.title("Error evolution over time. Minimum error obtained in {0} iterations: {1}".format(len(errors) - 1, min(errors)))
plt.xlabel("time (iterations)")
plt.ylabel("error")
plt.show()

# Keep this run's history under optimizer-specific names for later comparison.
errors_adam = errors
xs_adam = xs
ys_adam = ys

### Spatial evolution

In [None]:
# Parameter trajectories: x and y per iteration, then the path in the (x, y) plane.
spatial_fig, (ax_x, ax_y, ax_xy) = plt.subplots(1, 3, figsize=[18,6])
ax_x.plot(xs)
ax_x.set(title="X parameter evolution", xlabel="iterations", ylabel="x")
ax_y.plot(ys)
ax_y.set(title="Y parameter evolution", xlabel="iterations", ylabel="y")
ax_xy.plot(xs, ys)
ax_xy.set(title="x/y evolution", xlabel="x", ylabel="y")
spatial_fig.tight_layout()  # keep the added y labels from colliding with neighbours
plt.show()

### Dynamic evolution

In [None]:
# Per-step displacement of each parameter (difference between consecutive
# positions), shown for the first 100 steps.
vel_xs=np.diff(xs)
vel_ys=np.diff(ys)
plt.figure(figsize=[18,6])
plt.subplot(131)
plt.plot(np.abs(vel_xs[0:100]))
plt.title("X parameter velocity (momentum; $v_x$)")
plt.xlabel("iterations")
plt.subplot(132)
plt.plot(np.abs(vel_ys[0:100]))
plt.title("Y parameter velocity (momentum; $v_y$)")
plt.xlabel("iterations")
plt.subplot(133)
plt.plot(np.sqrt(np.array(vel_xs[0:100])**2 + np.array(vel_ys[0:100])**2))
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.title(r"Absolute velocity ($\sqrt{v_x^2 + v_y^2}$)")
plt.xlabel("iterations")  # the horizontal axis is the step index, not x
plt.show()

# Global comparison

In [None]:
# Overlay every optimizer's error history on a single axis.
plt.figure(figsize=[18,6])
error_histories = [
    errors_sgd,
    errors_momentum,
    errors_nestrov_momentum,
    errors_adagrad,
    errors_rmsprop,
    errors_adadelta,
]
for history in error_histories:
    plt.plot(history)
plt.plot(errors_adam, color="k")  # ADAM is the only curve with a fixed colour
plt.title("Error comparison among the optimizers shown above")
plt.ylabel("error")
plt.xlabel("time (iterations)")
# Labels are matched to the curves by plotting order.
plt.legend(labels=["SGD", "SGD + Momentum", "SGD + Nesterov Momentum", "AdaGrad", "RMSProp", "AdaDelta", "ADAM"])
plt.show()

In [None]:
# Draw the objective surface and overlay each optimizer's recorded path on it.
fig = plt.figure(figsize=[20,12])
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot is the supported way to request a 3D projection.
ax = fig.add_subplot(111, projection='3d')
x = np.arange(-5.0, 5.0000001, 0.1)
y = np.arange(-60, 2, 0.1)
X, Y = np.meshgrid(x, y)
zs = np.array([ff(xi, yi) for xi, yi in zip(np.ravel(X), np.ravel(Y))])
Z = zs.reshape(X.shape)
# With meshgrid's default 'xy' indexing, axis 0 of Z runs along y and axis 1
# along x; np.gradient returns one array per axis in that order.
Gy, Gx = np.gradient(Z)
G = (Gx**2.0+Gy**2.0)**.5  # gradient magnitude (per grid step)
N = G/G.max()  # normalize 0..1

ax.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm, antialiased=False, shade=False,
                rstride=5, cstride=1, linewidth=0)
ax.view_init(elev=40., azim=80)
#ax.set_aspect(0.3)

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')

# Each path carries its own label, so the legend cannot drift out of step with
# the drawing order (and the surface itself is not given a legend entry).
trajectories = [
    ("SGD", xs_sgd, ys_sgd, errors_sgd, 'b'),
    ("SGD + Momentum", xs_momentum, ys_momentum, errors_momentum, 'g'),
    ("SGD + Nesterov Momentum", xs_nestrov_momentum, ys_nestrov_momentum, errors_nestrov_momentum, 'r'),
    ("RMSProp", xs_rmsprop, ys_rmsprop, errors_rmsprop, 'y'),
    ("AdaDelta", xs_adadelta, ys_adadelta, errors_adadelta, 'turquoise'),
    ("AdaGrad", xs_adagrad, ys_adagrad, errors_adagrad, 'purple'),
    ("ADAM", xs_adam, ys_adam, errors_adam, 'k'),
]
for optimizer_name, path_x, path_y, path_z, path_color in trajectories:
    ax.plot3D(path_x, path_y, path_z, color=path_color, label=optimizer_name)
ax.legend()

plt.show()