In [None]:
import tensorflow.keras.activations as activations
import tensorflow.keras.models as models
import tensorflow.keras.layers as layers

import numpy as np

from tensorflow.keras.datasets import fashion_mnist
# Load the Fashion-MNIST train/test splits and append a trailing channel axis
# so each image matches the (28, 28, 1) input shape declared on the first layer.
# NOTE(review): pixel values are passed to the network as loaded (no rescaling).
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
train_images = train_images[..., np.newaxis]
test_images = test_images[..., np.newaxis]

# Three convolution stages (the first two followed by batch norm and 2x2 max
# pooling), then a bias-free dense layer with batch norm + ReLU, and a 10-way
# softmax output.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, use_bias=False),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dense(10, activation='softmax'),
])

# Labels are used as given with the sparse categorical cross-entropy loss.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(train_images, train_labels, epochs=5)

# Last expression of the cell: the evaluation result on the test split.
model.evaluate(test_images, test_labels)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def f(v):
    """Evaluate the test objective x**4/100 + 100*y**2 at the point v = (x, y)."""
    x, y = v
    quartic_term = x**4 / 100
    quadratic_term = 100 * y**2
    return quartic_term + quadratic_term
    
def gradf(v):
    """Return the gradient of f at v = (x, y) as a NumPy array.

    The components are d/dx = 4*x**3/100 and d/dy = 200*y.
    """
    x, y = v
    # Use ** instead of np.pow: the np.pow alias only exists in NumPy >= 2.0
    # and raises AttributeError on older releases.
    return np.array([4*x**3/100, 200*y])

In [None]:
def GradDescent(gradf, v0, n = 100, η = 0.001):
    """Run n steps of plain gradient descent and return every iterate.

    gradf -- callable returning the gradient at a point
    v0    -- starting point (converted with np.array)
    n     -- number of update steps
    η     -- step size

    Returns a list of n + 1 points: the start followed by one entry per step.
    """
    current = np.array(v0)
    path = [current]
    for _ in range(n):
        # Move against the gradient, scaled by the step size.
        current = current - η*gradf(current)
        path.append(current)
    return path

In [None]:
def Momentum(gradf, v0, n = 100, η = 0.001, α = 0.9, nest_acc = False):
    """Gradient descent with an exponentially averaged momentum term.

    gradf    -- callable returning the gradient at a point
    v0       -- starting point (converted with np.array)
    n        -- number of update steps
    η        -- step size
    α        -- averaging factor: m = α*m + (1-α)*gradient
    nest_acc -- when true, the gradient is evaluated at the look-ahead point
                v - η*α/(1-α)*m instead of at the current iterate

    Returns a list of n + 1 points: the start followed by one entry per step.
    """
    v = [np.array(v0)]
    # Size the momentum buffer from the start point; a hard-coded 2-vector
    # breaks broadcasting for problems of any other dimension.
    m = np.zeros(v[0].shape)
    for _ in range(n):
        # nest_acc acts as a 0/1 multiplier, so the shift vanishes when False.
        lookahead = v[-1] - nest_acc*η*α/(1-α)*m
        m = α*m + (1-α)*gradf(lookahead)
        v.append(v[-1] - η*m)
    return v

In [None]:
def AdaGrad(gradf, v0, n = 100, η = 0.001, α = 0.9):
    """AdaGrad: scale each step by the root of the summed squared gradients.

    gradf -- callable returning the gradient at a point
    v0    -- starting point (converted with np.array)
    n     -- number of update steps
    η     -- step size
    α     -- not used by this function; kept so existing callers still work

    Returns a list of n + 1 points: the start followed by one entry per step.
    """
    ϵ = 1e-5  # added under the square root to avoid division by zero
    v = [np.array(v0)]
    s = 0
    for _ in range(n):
        # Evaluate the gradient once per step and reuse it for both the
        # accumulator and the update.
        g = gradf(v[-1])
        s = s + g**2
        v.append(v[-1] - η*g/np.sqrt(s+ϵ))
    return v

In [None]:
def RMSProp(gradf, v0, n = 100, η = 0.001, α = 0.9):
    """RMSProp: scale each step by a moving average of squared gradients.

    gradf -- callable returning the gradient at a point
    v0    -- starting point (converted with np.array)
    n     -- number of update steps
    η     -- step size
    α     -- decay factor of the average: s = α*s + (1-α)*gradient**2

    Returns a list of n + 1 points: the start followed by one entry per step.
    """
    ϵ = 1e-5  # added under the square root to avoid division by zero
    v = [np.array(v0)]
    s = 0
    for _ in range(n):
        # Evaluate the gradient once per step and reuse it for both the
        # running average and the update.
        g = gradf(v[-1])
        s = α*s + (1-α)*g**2
        v.append(v[-1] - η*g/np.sqrt(s+ϵ))
    return v

In [None]:
def Adam(gradf, v0, n = 100, η = 0.001, α = 0.9):
    """Adam-style update using one decay factor for both moment estimates.

    gradf -- callable returning the gradient at a point
    v0    -- starting point (converted with np.array)
    n     -- number of update steps
    η     -- step size
    α     -- decay factor shared by the first moment m and second moment s

    Returns a list of n + 1 points: the start followed by one entry per step.
    """
    ϵ = 1e-5  # added under the square root to avoid division by zero
    v = [np.array(v0)]
    s = 0
    # Size the first-moment buffer from the start point rather than assuming
    # a two-component problem.
    m = np.zeros(v[0].shape)
    # Running power α**t. It starts at 1 so that after the in-loop multiply
    # step t is corrected by 1 - α**t; starting at α would use 1 - α**(t+1).
    αt = 1.0
    for _ in range(n):
        # Evaluate the gradient once per step and reuse it for both moments.
        g = gradf(v[-1])
        m = α*m + (1-α)*g
        s = α*s + (1-α)*g**2
        αt *= α
        # Bias-corrected moment estimates.
        m_hat = m/(1-αt)
        s_hat = s/(1-αt)
        v.append(v[-1] - η*m_hat/np.sqrt(s_hat+ϵ))
    return v

In [None]:
def plot_iterates(v):
    """Draw the iterates in v over filled contours of f on [-2, 2] x [-2, 2].

    v -- sequence of 2-component points; stacked row-wise with np.vstack
    """
    grid = np.linspace(-2,2,100)
    # Rows of the height table follow the y grid, columns follow the x grid.
    heights = np.array([[f([x,y]) for x in grid] for y in grid])
    plt.contourf(grid,grid,heights)
    points = np.vstack(v)
    plt.plot(points[:,0],points[:,1],'.')
    # Mark the origin.
    plt.plot([0],[0],'.')
    plt.xlim(-2,2)
    plt.ylim(-2,2)
    plt.show()
    
# GradDescent takes no α parameter (it has no momentum/decay term), so passing
# one raises TypeError; only the step size and iteration count are supplied.
plot_iterates(GradDescent(gradf, [1,1], η = 1e-3, n = 10000))