## Семинар 7: "Методы оптимизации"

ФИО:

In [1]:
import numpy as np
from torch import functional as F
from torch.autograd import Variable
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader
%matplotlib notebook
import matplotlib.pyplot as plt
from IPython.display import clear_output
from mpl_toolkits.mplot3d import Axes3D

На этом семинаре мы попробуем сравнить различные методы оптимизации: GD, Momentum, NAG, Adagrad, Adadelta, Adam.

### Часть 1: Реализация методов

Полезная функция: plt.contour
Для всех экспериментов подберите параметры так, чтобы метод сошелся к ближайшему локальному минимуму. Все методы следует запускать из одной и той же точки.

<i> 1.1 Реализуйте методы GD, Momentum, NAG, Adagrad, Adadelta, Adam.</i>

In [33]:
class Optimizer():
    def __init__(self, variables):
        self.variables = variables
        
    def step(self, loss):
        pass
    
class GD(Optimizer):
    def __init__(self, variables, learning_rate):
        super().__init__(variables)
        self.lr = learning_rate
        
    def step(self, J):
        loss = J()
        loss.backward()
        
        with torch.no_grad():
            for v in self.variables:
                v -= self.lr * v.grad
                
                v.grad.data.zero_()
                
class Momentum(Optimizer):
    def __init__(self, variables, learning_rate, gamma=0.9):
        super().__init__(variables)
        self.lr = learning_rate
        self.g = gamma
        
        self.moments = []        
        for v in self.variables:
            self.moments.append(torch.zeros_like(v.data))
        
    def step(self, J):
        loss = J()
        loss.backward()
        
        with torch.no_grad():
            for v, moment in zip(self.variables, self.moments):
                moment *= self.g
                moment += self.lr * v.grad
                
                v -= moment
                
                v.grad.data.zero_()
                
class NAG(Optimizer):
    def __init__(self, variables, learning_rate, gamma=0.9):
        super().__init__(variables)
        self.lr = learning_rate
        self.g = gamma
        
        self.moments = []        
        for v in self.variables:
            self.moments.append(torch.zeros_like(v.data))
        
    def step(self, J):
        with torch.no_grad():
            for v, moment in zip(self.variables, self.moments):
                moment *= self.g                
                v -= moment
        
        loss = J()
        loss.backward()
        
        with torch.no_grad():
            for v, moment in zip(self.variables, self.moments):
                v += moment
                
                moment += self.lr * v.grad
                
                v -= moment
                
                v.grad.data.zero_()
                
class Adagrad(Optimizer):
    def __init__(self, variables, learning_rate, eps=10**-8):
        super().__init__(variables)
        self.lr = learning_rate
        self.eps = eps
        
        self.g_sq = []
        for v in self.variables:
            self.g_sq.append(torch.zeros_like(v.data))
        
    def step(self, J):
        loss = J()
        loss.backward()
        
        with torch.no_grad():
            for v, g_sq in zip(self.variables, self.g_sq):      
                g_sq += v.grad**2
                v -= self.lr / (g_sq + self.eps).sqrt() * v.grad
                
                v.grad.data.zero_()

                
                
class Adadelta(Optimizer):
    def __init__(self, variables, eps=10**-8, gamma=0.9):
        super().__init__(variables)
        self.eps = eps
        self.gamma=gamma
        self.E_g=0
        self.E_dr=0
        self.g_sq = []
        for v in self.variables:
            self.g_sq.append(torch.zeros_like(v.data))
        
    def step(self, J):
        loss = J()
        loss.backward()
        
        with torch.no_grad():
            for v, g_sq in zip(self.variables, self.g_sq):      
                g_sq += v.grad**2
                self.E_g=self.gamma*self.E_g+(1-self.gamma)*(g_sq)
                v -= ((self.E_dr+self.eps) / (self.E_g + self.eps)).sqrt() * v.grad
                self.E_dr=self.gamma*self.E_dr+(1-self.gamma)*(v.grad**2)
                v.grad.data.zero_()
                
                
class Adam(Optimizer):
    def __init__(self, variables, learning_rate, eps=10**-8, b1=0.9, b2=0.999):
        super().__init__(variables)
        self.lr = learning_rate
        self.eps = eps
        self.b1=b1
        self.b2=b2
        self.m=0
        self.nu=0
        self.i=0
        self.g_sq = []
        for v in self.variables:
            self.g_sq.append(torch.zeros_like(v.data))
        
    def step(self,J,prediction,y):
        loss = J(prediction,y)
        loss.backward()
        
        self.i+=1
        with torch.no_grad():
            for v, g_sq in zip(self.variables, self.g_sq):      
                g_sq += v.grad**2
                self.m=self.b1*self.m+(1-self.b1)*v.grad
                m=self.m/(1-self.b1**self.i)
                self.nu=self.b2*self.nu+(1-self.b2)*(g_sq)
                nu=self.nu/(1-self.b2**self.i)
                v -= self.lr / (np.sqrt(nu + self.eps))* m
                
                v.grad.data.zero_()

<i> 1.2 Сравните эти методы на функции $J(x, y) = x^2+y^2$</i>

In [3]:
def apply_opt(optim, J, n_iters=1000):    
    points = []

    for i in range(n_iters):
        points.append((X.item(), Y.item(), J().item()))
        optim.step(J)

    points = np.array(points)
    return points

In [4]:
optims = [lambda v: GD(v, 0.1), lambda v: Momentum(v, 0.1),
          lambda v: NAG(v, 0.1), lambda v: Adagrad(v, 0.1),
         lambda v: Adadelta(v), lambda v: Adam(v,0.1)]

def J(X, Y):
    return X**2 + Y**2
    

fig = plt.figure()
ax = fig.add_subplot(111)

R = 10
X = np.linspace(-R, R, 1000)
Y = np.linspace(-R, R, 1000)

ax.set_xlim(-R, +R)
ax.set_ylim(-R, +R)

xx, yy = np.meshgrid(X, Y)

xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)

Z = J(xx, yy)
#ax.plot(xx.numpy().flatten(), yy.numpy().flatten(), Z.numpy().flatten())
#ax.plot_surface(xx.numpy(), yy.numpy(), Z.numpy())
con=ax.contour(xx.numpy(), yy.numpy(), Z.numpy())
plt.clabel(con)
names=["GD","Momentum", "NAG", "Adagrad", "Adadelta", "Adam"]
for optim,name  in zip(optims,names):
    X = Variable(torch.tensor(float(8)), requires_grad=True)
    Y = Variable(torch.tensor(float(8)), requires_grad=True)    
    points = apply_opt(optim([X, Y]), lambda X=X, Y=Y: J(X, Y), 500)

    ax.plot(points[:, 0], points[:, 1],label=name)

plt.legend()

plt.show()

<IPython.core.display.Javascript object>

In [5]:
optims = [lambda v: GD(v, 0.1), lambda v: Momentum(v, 0.1),
          lambda v: NAG(v, 0.1), lambda v: Adagrad(v, 0.1),
         lambda v: Adadelta(v), lambda v: Adam(v,0.1)]

def J(X, Y):
    return X**2/6 + Y**2/6
    #return X**2/10 * X.sin() - Y**2/10 * Y.cos()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

R = 10
X = np.linspace(-R, R, 10)
Y = np.linspace(-R, R, 10)

ax.set_xlim3d(-R, +R)
ax.set_ylim3d(-R, +R)
ax.set_zlim3d(-R, +R)

xx, yy = np.meshgrid(X, Y)

xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)

Z = J(xx, yy)

#ax.plot(xx.numpy().flatten(), yy.numpy().flatten(), Z.numpy().flatten())
#ax.plot_surface(xx.numpy(), yy.numpy(), Z.numpy())
ax.plot_wireframe(xx.numpy(), yy.numpy(), Z.numpy())
names=["GD","Momentum", "NAG", "Adagrad", "Adadelta", "Adam"]
for optim,name  in zip(optims,names):
    X = Variable(torch.tensor(float(8)), requires_grad=True)
    Y = Variable(torch.tensor(float(8)), requires_grad=True)    
    points = apply_opt(optim([X, Y]), lambda X=X, Y=Y: J(X, Y), 500)

    ax.plot(points[:, 0], points[:, 1], points[:, 2],label=name)
plt.legend()
#plt.axis('equal')
plt.show()

<IPython.core.display.Javascript object>

In [6]:
optims =[ lambda v: Adadelta(v)]

def J(X, Y):
    return X**2/6 + Y**2/6
    #return X**2/10 * X.sin() - Y**2/10 * Y.cos()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

R = 10
X = np.linspace(-R, R, 10)
Y = np.linspace(-R, R, 10)

ax.set_xlim3d(-R, +R)
ax.set_ylim3d(-R, +R)
ax.set_zlim3d(-R, +R)

xx, yy = np.meshgrid(X, Y)

xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)

Z = J(xx, yy)

#ax.plot(xx.numpy().flatten(), yy.numpy().flatten(), Z.numpy().flatten())
#ax.plot_surface(xx.numpy(), yy.numpy(), Z.numpy())
ax.plot_wireframe(xx.numpy(), yy.numpy(), Z.numpy())
names=["Adadelta"]
for optim,name  in zip(optims,names):
    X = Variable(torch.tensor(float(8)), requires_grad=True)
    Y = Variable(torch.tensor(float(8)), requires_grad=True)    
    points = apply_opt(optim([X, Y]), lambda X=X, Y=Y: J(X, Y), 500)

    ax.plot(points[:, 0], points[:, 1], points[:, 2],label=name,c='red')
plt.legend()
#plt.axis('equal')
plt.show()

<IPython.core.display.Javascript object>

<i>1.3 Сравните эти методы на функции $J(x, y) = x^2sin(x)+y^2sin(y)$</i>

In [7]:
optims = [lambda v: GD(v, 0.1), lambda v: Momentum(v, 0.1),
          lambda v: NAG(v, 0.1), lambda v: Adagrad(v, 0.1),
         lambda v: Adadelta(v), lambda v: Adam(v,0.1)]

def J(X, Y):
    return X**2*X.sin()/6 + Y**2*Y.sin()/6
    

fig = plt.figure()
ax = fig.add_subplot(111)

R = 10
X = np.linspace(-R, R, 1000)
Y = np.linspace(-R, R, 1000)

ax.set_xlim(-R, +R)
ax.set_ylim(-R, +R)

xx, yy = np.meshgrid(X, Y)

xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)

Z = J(xx, yy)
#ax.plot(xx.numpy().flatten(), yy.numpy().flatten(), Z.numpy().flatten())
#ax.plot_surface(xx.numpy(), yy.numpy(), Z.numpy())
con=ax.contour(xx.numpy(), yy.numpy(), Z.numpy())
plt.clabel(con)
names=["GD","Momentum", "NAG", "Adagrad", "Adadelta", "Adam"]
for optim,name  in zip(optims,names):
    X = Variable(torch.tensor(float(8)), requires_grad=True)
    Y = Variable(torch.tensor(float(8)), requires_grad=True)    
    points = apply_opt(optim([X, Y]), lambda X=X, Y=Y: J(X, Y), 500)

    ax.plot(points[:, 0], points[:, 1],label=name)

plt.legend()

plt.show()

<IPython.core.display.Javascript object>

In [8]:
optims = [lambda v: GD(v, 0.1), lambda v: Momentum(v, 0.1),
          lambda v: NAG(v, 0.1), lambda v: Adagrad(v, 0.1),
         lambda v: Adadelta(v), lambda v: Adam(v,0.1)]

def J(X, Y):
    return X**2*X.sin()/6 + Y**2*Y.sin()/6

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

R = 10
X = np.linspace(-R, R, 10)
Y = np.linspace(-R, R, 10)

ax.set_xlim3d(-R, +R)
ax.set_ylim3d(-R, +R)
ax.set_zlim3d(-R, +R)

xx, yy = np.meshgrid(X, Y)

xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)

Z = J(xx, yy)

#ax.plot(xx.numpy().flatten(), yy.numpy().flatten(), Z.numpy().flatten())
#ax.plot_surface(xx.numpy(), yy.numpy(), Z.numpy())
ax.plot_wireframe(xx.numpy(), yy.numpy(), Z.numpy())
names=["GD","Momentum", "NAG", "Adagrad", "Adadelta", "Adam"]
for optim,name  in zip(optims,names):
    X = Variable(torch.tensor(float(8)), requires_grad=True)
    Y = Variable(torch.tensor(float(8)), requires_grad=True)    
    points = apply_opt(optim([X, Y]), lambda X=X, Y=Y: J(X, Y), 5000)

    ax.plot(points[:, 0], points[:, 1], points[:, 2],label=name)
plt.legend()
#plt.axis('equal')
plt.show()

<IPython.core.display.Javascript object>

<i>1.3 Сравните эти методы на функции $J(x,y)=x^2sin(x^2)+y^2sin(y^2)$</i>

In [9]:
optims = [lambda v: GD(v, 0.1), lambda v: Momentum(v, 0.1),
          lambda v: NAG(v, 0.1), lambda v: Adagrad(v, 0.1),
         lambda v: Adadelta(v), lambda v: Adam(v,0.1)]

def J(X, Y):
    return X**2*(X**2).sin() + Y**2*(Y**2).sin()
    

fig = plt.figure()
ax = fig.add_subplot(111)

R = 30
X = np.linspace(-R, R, 100)
Y = np.linspace(-R, R, 100)

ax.set_xlim(-R, +R)
ax.set_ylim(-R, +R)

xx, yy = np.meshgrid(X, Y)

xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)

Z = J(xx, yy)
#ax.plot(xx.numpy().flatten(), yy.numpy().flatten(), Z.numpy().flatten())
#ax.plot_surface(xx.numpy(), yy.numpy(), Z.numpy())
con=ax.contour(xx.numpy(), yy.numpy(), Z.numpy())
plt.clabel(con)
names=["GD","Momentum", "NAG", "Adagrad", "Adadelta", "Adam"]
for optim,name  in zip(optims,names):
    X = Variable(torch.tensor(float(8)), requires_grad=True)
    Y = Variable(torch.tensor(float(8)), requires_grad=True)    
    points = apply_opt(optim([X, Y]), lambda X=X, Y=Y: J(X, Y), 500)

    ax.plot(points[:, 0], points[:, 1],label=name)

plt.legend()

plt.show()

<IPython.core.display.Javascript object>

In [10]:
optims = [lambda v: GD(v, 0.1), lambda v: Momentum(v, 0.1),
          lambda v: NAG(v, 0.1), lambda v: Adagrad(v, 0.1),
         lambda v: Adadelta(v), lambda v: Adam(v,0.1)]

def J(X, Y):
    return X**2*(X**2).sin() + Y**2*(Y**2).sin()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

R = 10
X = np.linspace(-R, R, 10)
Y = np.linspace(-R, R, 10)

ax.set_xlim3d(-R, +R)
ax.set_ylim3d(-R, +R)
ax.set_zlim3d(-R, +R)

xx, yy = np.meshgrid(X, Y)

xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)

Z = J(xx, yy)

#ax.plot(xx.numpy().flatten(), yy.numpy().flatten(), Z.numpy().flatten())
#ax.plot_surface(xx.numpy(), yy.numpy(), Z.numpy())
ax.plot_wireframe(xx.numpy(), yy.numpy(), Z.numpy())
names=["GD","Momentum", "NAG", "Adagrad", "Adadelta", "Adam"]
for optim,name  in zip(optims,names):
    X = Variable(torch.tensor(float(8)), requires_grad=True)
    Y = Variable(torch.tensor(float(8)), requires_grad=True)    
    points = apply_opt(optim([X, Y]), lambda X=X, Y=Y: J(X, Y), 5000)

    ax.plot(points[:, 0], points[:, 1], points[:, 2],label=name)
plt.legend()
#plt.axis('equal')
plt.show()

<IPython.core.display.Javascript object>

###  Часть 2: Обучение нейронной сети

<i> 2.1 Сравните графики обучения для полносвязной нейросети на методах Adam, Adagrad, AdaDelta и SGD (на MNIST). </i>

In [None]:
def predict():
   net.forward(x)
   loss = ...
   return loss


In [11]:
transform = transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                    ])
train_dataset = MNIST('.', train=True, download=True, transform=transform)
test_dataset = MNIST('.', train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [35]:
def train(network, epochs, learning_rate, optim, ravel_init=True):
    loss = nn.NLLLoss()
    optimizer = optim(network.parameters(), learning_rate)
    train_loss_epochs = []
    test_loss_epochs = []
    train_accuracy_epochs = []
    test_accuracy_epochs = []
    try:
        for epoch in range(epochs):
            losses = []
            accuracies = []
            for X, y in train_loader:
                if ravel_init:
                    X = X.view(X.size(0), -1)
                prediction = network(X)
                network.forward(X)
                loss_batch = loss(prediction, y)
                losses.append(loss_batch.item())
                optimizer.step(loss, prediction=prediction,y=y)
                accuracies.append((np.argmax(prediction.data.numpy(), 1)==y.data.numpy()).mean())
            train_loss_epochs.append(np.mean(losses))
            train_accuracy_epochs.append(np.mean(accuracies))
            losses = []
            accuracies = []    
            for X, y in test_loader:
                if ravel_init:
                    X = X.view(X.size(0), -1)
                prediction = network(X)
                loss_batch = loss(prediction, y)
                losses.append(loss_batch.item())
                accuracies.append((np.argmax(prediction.data.numpy(), 1)==y.data.numpy()).mean())
            test_loss_epochs.append(np.mean(losses))
            test_accuracy_epochs.append(np.mean(accuracies))
            print('\rEpoch {0}... (Train/Test) NLL: {1:.3f}/{2:.3f}\tAccuracy: {3:.3f}/{4:.3f}'.format(
                        epoch, train_loss_epochs[-1], test_loss_epochs[-1],
                        train_accuracy_epochs[-1], test_accuracy_epochs[-1]))
            plt.figure(figsize=(12, 5))
            plt.subplot(1, 2, 1)
            plt.plot(train_loss_epochs, label='Train')
            plt.plot(test_loss_epochs, label='Test')
            plt.xlabel('Epochs', fontsize=16)
            plt.ylabel('Loss', fontsize=16)
            plt.legend(loc=0, fontsize=16)
            plt.grid()
            plt.subplot(1, 2, 2)
            plt.plot(train_accuracy_epochs, label='Train accuracy')
            plt.plot(test_accuracy_epochs, label='Test accuracy')
            plt.xlabel('Epochs', fontsize=16)
            plt.ylabel('Loss', fontsize=16)
            plt.legend(loc=0, fontsize=16)
            plt.grid()
            plt.show()
    except KeyboardInterrupt:
        pass

In [36]:
network = nn.Sequential(nn.Linear(784, 100),
                        nn.ReLU(),
                        nn.Linear(100, 100),
                        nn.ReLU(),
                        nn.Linear(100, 32),
                        nn.LogSoftmax())

train(network, 10, 0.001, optim=Adam)

  input = module(input)


Epoch 0... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 1... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 2... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 3... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 4... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 5... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 6... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.012


<IPython.core.display.Javascript object>

Epoch 7... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 8... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

Epoch 9... (Train/Test) NLL: 3.481/3.480	Accuracy: 0.010/0.011


<IPython.core.display.Javascript object>

In [37]:
network = nn.Sequential(nn.Linear(784, 100),
                        nn.ReLU(),
                        nn.Linear(100, 100),
                        nn.ReLU(),
                        nn.Linear(100, 10),
                        nn.LogSoftmax())

train(network, 10, 0.001, optim=torch.optim.Adagrad)

Epoch 9... (Train/Test) NLL: 0.230/0.226	Accuracy: 0.934/0.935


<IPython.core.display.Javascript object>

In [38]:
network = nn.Sequential(nn.Linear(784, 100),
                        nn.ReLU(),
                        nn.Linear(100, 100),
                        nn.ReLU(),
                        nn.Linear(100, 10),
                        nn.LogSoftmax())

train(network, 10, 0.001, optim=torch.optim.Adadelta)

Epoch 9... (Train/Test) NLL: 0.514/0.478	Accuracy: 0.872/0.884


<IPython.core.display.Javascript object>

<i> 2.2 Сравните графики обучения для сверточной нейросети на методах Adam, Adagrad, AdaDelta и SGD. </i>

In [30]:
image_size = 28
channels = 1
class ConvClassifier(nn.Module):
    def __init__(self, image_size):
        super(ConvClassifier, self).__init__()
        self.conv_layers = nn.Sequential(nn.Conv2d(channels, 8, 3, padding=1),
                                         nn.ReLU(),
                                         nn.MaxPool2d(2),
                                         nn.Conv2d(8, 16, 3, padding=1),
                                         nn.ReLU(),)
        self.linear_layers = nn.Sequential(nn.Linear(image_size // 2 * image_size // 2 * 16, 10),
                                           nn.LogSoftmax(dim=1))
    
    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x

In [39]:
network = ConvClassifier(image_size=28)
train(network, 10, 0.001, ravel_init=False, optim=torch.optim.Adam)

Epoch 9... (Train/Test) NLL: 0.010/0.046	Accuracy: 0.997/0.989


<IPython.core.display.Javascript object>

In [34]:
network = ConvClassifier(image_size=28)
train(network, 10, 0.01, ravel_init=False, optim=torch.optim.Adagrad)

Epoch 9... (Train/Test) NLL: 0.031/0.045	Accuracy: 0.991/0.986


<IPython.core.display.Javascript object>

In [35]:
network = ConvClassifier(image_size=28)
train(network, 10, 0.001, ravel_init=False, optim=torch.optim.Adadelta)

Epoch 9... (Train/Test) NLL: 0.412/0.387	Accuracy: 0.880/0.889


<IPython.core.display.Javascript object>

AdaDelta странная штука, пользоваться которой я не буду. Где-то нормально сходится, где то нет.

#### Feedback (опционально)

Здесь вы можете оставить список опечаток из лекции или семинара:

Здесь вы можете оставить комментарии по лекции или семинару: