In [1]:
import torch

In [2]:
# (1) Some Pre-built Options in torch

In [3]:
# dir() lists every public and private attribute, so besides the optimizer
# classes the output also shows helpers such as `lr_scheduler`, `swa_utils`
# and the module dunder names.
print(dir(torch.optim))  # torch.optim contains the common optimizers

['ASGD', 'Adadelta', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'LBFGS', 'NAdam', 'Optimizer', 'RAdam', 'RMSprop', 'Rprop', 'SGD', 'SparseAdam', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_functional', '_multi_tensor', 'lr_scheduler', 'swa_utils']


In [4]:
# torch.nn contains the common loss functions (class names ending in "Loss")
print([name for name in dir(torch.nn) if name.endswith("Loss")])

['AdaptiveLogSoftmaxWithLoss', 'BCELoss', 'BCEWithLogitsLoss', 'CTCLoss', 'CosineEmbeddingLoss', 'CrossEntropyLoss', 'GaussianNLLLoss', 'HingeEmbeddingLoss', 'HuberLoss', 'KLDivLoss', 'L1Loss', 'MSELoss', 'MarginRankingLoss', 'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'NLLLoss', 'PoissonNLLLoss', 'SmoothL1Loss', 'SoftMarginLoss', 'TripletMarginLoss', 'TripletMarginWithDistanceLoss']


In [5]:
# torch.nn.init contains the common init functions; keep only the
# uniform / normal families (private helpers show up as well)
print([name for name in dir(torch.nn.init) if "uniform" in name or "normal" in name])

['_no_grad_normal_', '_no_grad_trunc_normal_', '_no_grad_uniform_', 'kaiming_normal', 'kaiming_normal_', 'kaiming_uniform', 'kaiming_uniform_', 'normal', 'normal_', 'trunc_normal_', 'uniform', 'uniform_', 'xavier_normal', 'xavier_normal_', 'xavier_uniform', 'xavier_uniform_']


In [6]:
# (2) Details of Training Options

In [7]:
# create a sample network for later sections
import numpy as np

class SampleNetwork(object):
    """Small fully connected network reused by the optimizer demos.

    Layout:
        input=2
        tanh_layer input=2, output=4
        tanh_layer input=4, output=1

    ``self.optimizer`` is left as ``None`` on purpose: each demo attaches its
    own ``torch.optim`` optimizer before calling ``train_by_step``.
    """
    def __init__(self,
                 lr=5e-2,
                 epoch=10,):
        """Build the network and the loss function.

        Args:
            lr: learning rate, stored as ``self.lr`` so callers can hand it to
                the optimizer they create (it is not applied automatically).
            epoch: nominal epoch count; only used in the progress message
                printed by ``train_by_step``.
        """
        self.network = torch.nn.Sequential(
            torch.nn.Linear(2, 4),
            torch.nn.Tanh(),
            torch.nn.Linear(4, 1),
            # NOTE(review): this final Tanh bounds the value handed to
            # BCEWithLogitsLoss to [-1, 1]; kept because it is the documented layout.
            torch.nn.Tanh(),
        )

        self.lr = lr
        self.epoch = epoch

        self.optimizer = None
        self.loss_func = torch.nn.BCEWithLogitsLoss()

    @staticmethod
    def make_dataset(sample_count, input_dim=2, label_classes=2):
        """Generate a noisy 2-D toy dataset with one curve segment per class.

        Class ``c`` is sampled along the angle range ``[3.12*c, 3.12*(c+1)]``
        (plus Gaussian noise) with radius ``4*sin(4*t)`` (plus Gaussian noise).

        Args:
            sample_count: total number of rows to generate.
            input_dim: number of feature columns; the generator produces
                2-D points, so only ``2`` is supported.
            label_classes: number of classes; rows are split as evenly as
                possible, the first classes receiving one extra row when
                ``sample_count`` is not divisible by ``label_classes``.

        Returns:
            ``(x, y)`` float tensors of shape ``(sample_count, input_dim)`` and
            ``(sample_count, 1)``; ``y`` holds the class index of every row.

        Raises:
            ValueError: if ``input_dim`` is not 2.
        """
        if input_dim != 2:
            raise ValueError("make_dataset only generates 2-D points (input_dim must be 2)")
        x = np.zeros((sample_count, input_dim))
        y = np.zeros((sample_count, 1))
        # array_split assigns every row to a class, so no all-zero padding rows
        # remain when sample_count is not a multiple of label_classes.
        for c, ix in enumerate(np.array_split(np.arange(sample_count), label_classes)):
            n = len(ix)
            if n == 0:
                continue
            t = np.linspace(c * 3.12, (c + 1) * 3.12, n) + np.random.randn(n) * 0.2
            r = 4 * np.sin(4 * t) + np.random.randn(n) * 0.2
            x[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
            y[ix] = c
        return torch.Tensor(x), torch.Tensor(y)

    def train_by_step(self, amount=10):
        """Run exactly one optimisation step on a freshly generated dataset.

        Args:
            amount: number of samples to generate for this step.

        Raises:
            RuntimeError: if no optimizer has been assigned to ``self.optimizer``.
        """
        if self.optimizer is None:
            raise RuntimeError(
                "SampleNetwork.optimizer is not set; assign a torch.optim "
                "optimizer before calling train_by_step()"
            )
        x_train, y_train = SampleNetwork.make_dataset(amount)
        e = 0  # a single step is performed, reported as epoch 0
        y_estimate = self.network(x_train)
        loss = self.loss_func(y_estimate, y_train)  # attention: do not swap the position!
        loss.backward()
        self.optimizer.step()
        print("epoch{}/{}, loss={}".format(e, self.epoch, loss))
        # clear the gradients so the next call starts from zero
        self.optimizer.zero_grad()





In [8]:
# Instantiate the sample network and print the shape of every tensor returned
# by network.parameters(). A named loop variable is used instead of `_`,
# because rebinding `_` at the top level of a notebook hides IPython's
# "last output" variable.
s = SampleNetwork()
for param in s.network.parameters():
    print(param.shape)

torch.Size([4, 2])
torch.Size([4])
torch.Size([1, 4])
torch.Size([1])


In [9]:
# (2.1) Optimizers

In [10]:
# In this section, SGD / MomentumSGD / Nesterov, Adagrad, RMSProp, Adam are introduced with usage and pseudo-code

In [11]:
# (2.1.1)
# SGD / MomentumSGD / Nesterov
# Attach an SGD optimizer with momentum and Nesterov enabled to a fresh network.
s = SampleNetwork()
s.optimizer = torch.optim.SGD(s.network.parameters(), lr=1e-2, momentum=0.9, nesterov=True)

# SGD optimizer internal states
print(s.optimizer.state_dict())

# 'state' contains the momentum states (velocity); it is empty until the first step
# 'param_groups' contains lr, momentum factor, nesterov flag

{'state': {}, 'param_groups': [{'lr': 0.01, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': True, 'params': [0, 1, 2, 3]}]}


In [12]:
# Run one optimisation step; afterwards 'state' holds one 'momentum_buffer'
# per parameter tensor (keys 0..3).
s.train_by_step()
print(s.optimizer.state_dict())

epoch0/10, loss=0.6975148916244507
{'state': {0: {'momentum_buffer': tensor([[-0.0024,  0.0248],
        [ 0.0096, -0.0514],
        [-0.0011, -0.0366],
        [ 0.0034, -0.0025]])}, 1: {'momentum_buffer': tensor([-0.0104,  0.0036,  0.0122,  0.0003])}, 2: {'momentum_buffer': tensor([[ 0.0965, -0.1078,  0.1613, -0.0337]])}, 3: {'momentum_buffer': tensor([0.0682])}}, 'param_groups': [{'lr': 0.01, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': True, 'params': [0, 1, 2, 3]}]}


In [13]:
# pseudo-code of the sgd / momentum sgd / nesterov update
# (same formulation as torch.optim.SGD with dampening=0 and weight_decay=0):
#
# v = m*v + grad                      (very first step: v = grad)
# if nesterov: w -= lr * (grad + m*v)
# else:        w -= lr * v

def sgd_step(network_parameters, optimizer_state_dict=None):
    """Apply one (momentum / Nesterov) SGD update to ``network_parameters``.

    Args:
        network_parameters: iterable of tensors whose ``.grad`` is populated;
            the i-th tensor uses ``optimizer_state_dict['state'][i]``.
        optimizer_state_dict: dict with the layout printed by
            ``optimizer.state_dict()``: ``'state'`` maps the parameter index to
            ``{'momentum_buffer': tensor}`` and ``'param_groups'`` is a list
            whose first entry provides ``'lr'``, ``'momentum'`` and
            ``'nesterov'``. When omitted, the pseudo-code hook
            ``get_state_dict()`` is called (presumably returning that dict).

    The state dict and the parameters are updated in place. Dampening and
    weight decay are not modelled.
    """
    if optimizer_state_dict is None:
        optimizer_state_dict = get_state_dict()
    # 'param_groups' is a list of dicts; this sketch handles the first group only
    params_of_current_step = optimizer_state_dict['param_groups'][0]
    lr = params_of_current_step['lr']
    momentum = params_of_current_step['momentum']
    nesterov = params_of_current_step['nesterov']

    # parameters are leaf tensors that require grad, so the in-place update
    # must not be recorded by autograd
    with torch.no_grad():
        for layerIndex, layerParam in enumerate(network_parameters):
            grad = layerParam.grad
            if grad is None:  # parameter did not take part in the backward pass
                continue
            # 'state' is empty before the first step, so create the entry lazily
            state_of_current_layer = optimizer_state_dict['state'].setdefault(layerIndex, {})

            if 'momentum_buffer' in state_of_current_layer:
                velocity = momentum * state_of_current_layer['momentum_buffer'] + grad
            else:
                velocity = grad.clone()  # first step: the buffer starts as the gradient
            state_of_current_layer['momentum_buffer'] = velocity

            # Nesterov looks ahead along the freshly updated velocity
            update = grad + momentum * velocity if nesterov else velocity
            layerParam -= lr * update



In [14]:
# (2.1.2)
# Adagrad
# Attach an Adagrad optimizer to a fresh network.
s = SampleNetwork()
s.optimizer = torch.optim.Adagrad(s.network.parameters(), lr=1e-2)

# adagrad optimizer internal states
print(s.optimizer.state_dict())

# 'state' contains the step counter and the sum of squares of previous grads
# (already present, filled with zeros, before any step is taken)
# 'param_groups' contains lr, eps

{'state': {0: {'step': 0, 'sum': tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])}, 1: {'step': 0, 'sum': tensor([0., 0., 0., 0.])}, 2: {'step': 0, 'sum': tensor([[0., 0., 0., 0.]])}, 3: {'step': 0, 'sum': tensor([0.])}}, 'param_groups': [{'lr': 0.01, 'lr_decay': 0, 'eps': 1e-10, 'weight_decay': 0, 'initial_accumulator_value': 0, 'params': [0, 1, 2, 3]}]}


In [15]:
# First Adagrad step: 'step' becomes 1 and 'sum' is no longer all zeros.
s.train_by_step()
print(s.optimizer.state_dict())

epoch0/10, loss=0.6869338154792786
{'state': {0: {'step': 1, 'sum': tensor([[0.0013, 0.0011],
        [0.0007, 0.0004],
        [0.0003, 0.0017],
        [0.0010, 0.0004]])}, 1: {'step': 1, 'sum': tensor([2.8363e-05, 9.4177e-04, 3.8914e-05, 2.6537e-04])}, 2: {'step': 1, 'sum': tensor([[0.0026, 0.0003, 0.0038, 0.0017]])}, 3: {'step': 1, 'sum': tensor([0.0064])}}, 'param_groups': [{'lr': 0.01, 'lr_decay': 0, 'eps': 1e-10, 'weight_decay': 0, 'initial_accumulator_value': 0, 'params': [0, 1, 2, 3]}]}


In [16]:
# Second Adagrad step: 'step' becomes 2 and the 'sum' entries keep growing.
s.train_by_step()
print(s.optimizer.state_dict())

epoch0/10, loss=0.66921466588974
{'state': {0: {'step': 2, 'sum': tensor([[0.0017, 0.0015],
        [0.0009, 0.0044],
        [0.0003, 0.0023],
        [0.0017, 0.0006]])}, 1: {'step': 2, 'sum': tensor([0.0003, 0.0015, 0.0001, 0.0007])}, 2: {'step': 2, 'sum': tensor([[0.0026, 0.0005, 0.0039, 0.0040]])}, 3: {'step': 2, 'sum': tensor([0.0118])}}, 'param_groups': [{'lr': 0.01, 'lr_decay': 0, 'eps': 1e-10, 'weight_decay': 0, 'initial_accumulator_value': 0, 'params': [0, 1, 2, 3]}]}


In [17]:
# pseudo-code of the adagrad update
# (same formulation as torch.optim.Adagrad with lr_decay=0 and weight_decay=0):
#
# s += grad^2
# w -= grad * lr/(sqrt(s)+eps)

def adagrad_step(network_parameters, optimizer_state_dict=None):
    """Apply one Adagrad update to ``network_parameters``.

    Args:
        network_parameters: iterable of tensors whose ``.grad`` is populated;
            the i-th tensor uses ``optimizer_state_dict['state'][i]``.
        optimizer_state_dict: dict with the layout printed by
            ``optimizer.state_dict()``: ``'state'`` maps the parameter index to
            ``{'step': int, 'sum': tensor}`` and ``'param_groups'`` is a list
            whose first entry provides ``'lr'`` and ``'eps'``. When omitted,
            the pseudo-code hook ``get_state_dict()`` is called (presumably
            returning that dict).

    The state dict and the parameters are updated in place. Learning-rate
    decay and weight decay are not modelled.
    """
    if optimizer_state_dict is None:
        optimizer_state_dict = get_state_dict()
    # 'param_groups' is a list of dicts; this sketch handles the first group only
    params_of_current_step = optimizer_state_dict['param_groups'][0]
    lr = params_of_current_step['lr']
    eps = params_of_current_step['eps']

    # parameters are leaf tensors that require grad, so the in-place update
    # must not be recorded by autograd
    with torch.no_grad():
        for layerIndex, layerParam in enumerate(network_parameters):
            grad = layerParam.grad
            if grad is None:  # parameter did not take part in the backward pass
                continue
            all_states = optimizer_state_dict['state']
            if layerIndex not in all_states:
                all_states[layerIndex] = {'step': 0, 'sum': torch.zeros_like(layerParam)}
            state_of_current_layer = all_states[layerIndex]

            state_of_current_layer['step'] += 1
            state_of_current_layer['sum'] += grad ** 2

            # eps is added after the square root, outside of it
            scaled_lr = lr / (torch.sqrt(state_of_current_layer['sum']) + eps)
            layerParam -= scaled_lr * grad
        
        


In [18]:
# (2.1.3)
# RMSProp
# Attach an RMSprop optimizer to a fresh network.
s = SampleNetwork()
s.optimizer = torch.optim.RMSprop(s.network.parameters(), lr=1e-2, alpha=0.9)

# RMSprop optimizer internal states
print(s.optimizer.state_dict())

# 'state' contains the Exponential Moving Average (EMA) of the squared grads;
# it is empty until the first step
# 'param_groups' contains lr, eps, alpha

{'state': {}, 'param_groups': [{'lr': 0.01, 'momentum': 0, 'alpha': 0.9, 'eps': 1e-08, 'centered': False, 'weight_decay': 0, 'params': [0, 1, 2, 3]}]}


In [19]:
# First RMSprop step: 'state' now holds 'step' and 'square_avg' per parameter.
s.train_by_step()
print(s.optimizer.state_dict())

epoch0/10, loss=0.6510215997695923
{'state': {0: {'step': 1, 'square_avg': tensor([[4.5999e-05, 2.3468e-05],
        [6.1505e-05, 2.4501e-05],
        [1.3273e-06, 2.5026e-06],
        [1.7386e-06, 9.7919e-07]])}, 1: {'step': 1, 'square_avg': tensor([1.4462e-05, 1.5376e-05, 5.0655e-07, 4.1954e-07])}, 2: {'step': 1, 'square_avg': tensor([[8.0399e-04, 1.5824e-04, 1.6765e-04, 1.5830e-05]])}, 3: {'step': 1, 'square_avg': tensor([0.0007])}}, 'param_groups': [{'lr': 0.01, 'momentum': 0, 'alpha': 0.9, 'eps': 1e-08, 'centered': False, 'weight_decay': 0, 'params': [0, 1, 2, 3]}]}


In [20]:
# pseudo-code of the rmsprop update
# (same formulation as torch.optim.RMSprop with momentum=0, centered=False):
#
# ema = ema*alpha + grad^2*(1-alpha)  # this step is the only difference from Adagrad
# w -= grad * lr/(sqrt(ema)+eps)

def rmsprop_step(network_parameters, optimizer_state_dict=None):
    """Apply one RMSprop update to ``network_parameters``.

    Args:
        network_parameters: iterable of tensors whose ``.grad`` is populated;
            the i-th tensor uses ``optimizer_state_dict['state'][i]``.
        optimizer_state_dict: dict with the layout printed by
            ``optimizer.state_dict()``: ``'state'`` maps the parameter index to
            ``{'step': int, 'square_avg': tensor}`` and ``'param_groups'`` is a
            list whose first entry provides ``'lr'``, ``'alpha'`` and
            ``'eps'``. When omitted, the pseudo-code hook ``get_state_dict()``
            is called (presumably returning that dict).

    The state dict and the parameters are updated in place. Momentum, the
    centered variant and weight decay are not modelled.
    """
    if optimizer_state_dict is None:
        optimizer_state_dict = get_state_dict()
    # 'param_groups' is a list of dicts; this sketch handles the first group only
    params_of_current_step = optimizer_state_dict['param_groups'][0]
    lr = params_of_current_step['lr']
    alpha = params_of_current_step['alpha']
    eps = params_of_current_step['eps']

    # parameters are leaf tensors that require grad, so the in-place update
    # must not be recorded by autograd
    with torch.no_grad():
        for layerIndex, layerParam in enumerate(network_parameters):
            grad = layerParam.grad
            if grad is None:  # parameter did not take part in the backward pass
                continue
            all_states = optimizer_state_dict['state']
            if layerIndex not in all_states:
                # 'state' is empty before the first step, so create the entry lazily
                all_states[layerIndex] = {'step': 0, 'square_avg': torch.zeros_like(layerParam)}
            state_of_current_layer = all_states[layerIndex]

            state_of_current_layer['step'] += 1
            state_of_current_layer['square_avg'] = (
                alpha * state_of_current_layer['square_avg'] + (1 - alpha) * grad ** 2
            )

            # the running average lives under 'square_avg'; eps is added after the sqrt
            scaled_lr = lr / (torch.sqrt(state_of_current_layer['square_avg']) + eps)
            layerParam -= scaled_lr * grad
        
        


In [21]:
# (2.1.4)
# Adam
# Attach an Adam optimizer to a fresh network.
s = SampleNetwork()
s.optimizer = torch.optim.Adam(s.network.parameters(), lr=1e-2, betas=[0.9, 0.999])

# Adam optimizer internal states
print(s.optimizer.state_dict())

# 'state' contains the Exponential Moving Average (EMA) of the grads and the
# EMA of the squared grads; it is empty until the first step
# 'param_groups' contains lr, eps, betas(beta1, beta2)

{'state': {}, 'param_groups': [{'lr': 0.01, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'params': [0, 1, 2, 3]}]}


In [22]:
# First Adam step: 'state' now holds 'step', 'exp_avg' and 'exp_avg_sq' per parameter.
s.train_by_step()
print(s.optimizer.state_dict())

epoch0/10, loss=0.7266576886177063
{'state': {0: {'step': 1, 'exp_avg': tensor([[-0.0007, -0.0016],
        [-0.0072, -0.0058],
        [-0.0036, -0.0059],
        [ 0.0066,  0.0045]]), 'exp_avg_sq': tensor([[4.3446e-08, 2.4981e-07],
        [5.2516e-06, 3.3817e-06],
        [1.3294e-06, 3.4928e-06],
        [4.3454e-06, 1.9908e-06]])}, 1: {'step': 1, 'exp_avg': tensor([-0.0002, -0.0057, -0.0029,  0.0017]), 'exp_avg_sq': tensor([4.6351e-09, 3.2998e-06, 8.1780e-07, 2.7935e-07])}, 2: {'step': 1, 'exp_avg': tensor([[ 0.0131, -0.0003, -0.0064, -0.0078]]), 'exp_avg_sq': tensor([[1.7275e-05, 6.5885e-09, 4.1488e-06, 6.1445e-06]])}, 3: {'step': 1, 'exp_avg': tensor([-0.0065]), 'exp_avg_sq': tensor([4.2628e-06])}}, 'param_groups': [{'lr': 0.01, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'params': [0, 1, 2, 3]}]}


In [23]:
# pseudo-code of the adam update (t = number of steps taken so far)

# ema = ema*beta1 + grad*(1-beta1)
# ema_sqr = ema_sqr*beta2 + grad**2 * (1-beta2)
# w -= lr * (ema/(1-beta1**t)) / (sqrt(ema_sqr/(1-beta2**t)) + eps)

# Adam is essentially momentum + rmsprop

# treating the bias correction as the constant (1-beta1):
# lr * ema_new/(1-beta1) = ema_old*[lr*beta1/(1-beta1)] + grad*lr
# which has the same form as the classical momentum step "v = m*v + lr*grad"

# the ema_sqr update in Adam is the same as the RMSProp step: "ema = ema*alpha + grad^2*(1-alpha)"


In [24]:
# sgd -- momentum -----\
#                       ---Adam
# adagrad -- rmsprop --|    

In [25]:
# (2.2) Loss Functions
# In this section, BCE / BCEWithLogits, CE, NLL, L1, L2/MSE and SmoothL1 losses are introduced with usage and pseudo-code

In [26]:
import math
# (inputs, targets) pair with four scalar samples, used by the element-wise losses
single_class_data = (
    torch.tensor([0.8, 0.7, 0.5, 0.3]),
    torch.tensor([1.0, 0.0, 1.0, 0.0]),
)
# (scores, class indices) pair with three samples over three classes
multi_class_data = (
    torch.tensor([[0.8, 0.1, 0.1], [0.2, 0.5, 0.3], [0.1, 0.2, 0.7]]),
    torch.tensor([0, 1, 1], dtype=torch.long),
)

In [27]:
# (2.2.1) BCE loss (Single Class)
loss = torch.nn.BCELoss(reduction='none') # set reduction='mean' in real use
# BCE(x,y) = -( ylog(x) + (1-y)log(1-x) )
# For binary classification, i.e. when the output activation is a sigmoid,
# cross-entropy is commonly used as the loss function. It avoids the
# vanishing-gradient problem caused by the sigmoid
# (MSE suffers from this problem, BCE resolves it)

print(loss(*single_class_data))

tensor([0.2231, 1.2040, 0.6931, 0.3567])


In [28]:
def bce_loss(x, y):
    """Element-wise binary cross-entropy, mirroring ``torch.nn.BCELoss(reduction='none')``.

    Args:
        x: tensor of predicted probabilities in ``[0, 1]``.
        y: tensor of targets, broadcastable against ``x``.

    Returns:
        Tensor holding ``-(y*log(x) + (1-y)*log(1-x))`` per element.
    """
    # Clamp the logs at -100 (as torch.nn.BCELoss does) so that x == 0 or
    # x == 1 gives a finite loss instead of inf, or nan from 0 * -inf.
    log_x = torch.clamp(torch.log(x), min=-100.0)
    log_one_minus_x = torch.clamp(torch.log(1 - x), min=-100.0)
    return -(y * log_x + (1 - y) * log_one_minus_x)

# Apply the hand-written BCE to the same (inputs, targets) sample pair.
print(bce_loss(*single_class_data))

tensor([0.2231, 1.2040, 0.6931, 0.3567])


In [29]:
# (2.2.2) BCE with Logit Loss (= sigmoid + BCE)
loss = torch.nn.BCEWithLogitsLoss(reduction='none' , pos_weight=torch.Tensor([0.3, 0.2, 0.4, 0.1]))
# BCELL(x,y) = - (pos_weight*ylog(sigmoid(x)) + (1-y)log(1-sigmoid(x)))
# pos_weight: scales only the positive (y==1) term; samples with y==0 are unaffected
# NOTE(review): pos_weight is given one entry per sample here — confirm this is intended
print(loss(*single_class_data))

tensor([0.1113, 1.1032, 0.1896, 0.8544])


In [30]:
# (2.2.3) CE Loss (Multiple Classes) (=NLL + LogSoftmax)
loss = torch.nn.CrossEntropyLoss(reduction='none')
# CE(x,class) = -log( exp(x[class])/sum(exp(x[i]) for i) ) 
# i.e. x is treated as unnormalised scores: the softmax is applied inside the loss

print(loss(*multi_class_data))

tensor([0.6897, 0.9398, 1.2679])


In [31]:
def ce_loss(x, y):
    """Yield the cross-entropy ``-log(softmax(x_i)[y_i])`` for every sample.

    Args:
        x: iterable of per-sample score rows (unnormalised scores).
        y: iterable of target class indices, one per row of ``x``.

    Yields:
        One Python float per sample.

    Raises:
        ValueError: if a target is not a valid class index for its row.
    """
    for xi, yi in zip(x, y):
        scores = [float(xij) for xij in xi]
        target = next((j for j in range(len(scores)) if yi == j), None)
        if target is None:
            raise ValueError(
                "target class {} is not a valid index for {} scores".format(yi, len(scores))
            )
        # Shift by the largest score before exponentiating (log-sum-exp trick)
        # so that large scores cannot overflow math.exp.
        peak = max(scores)
        log_normaliser = peak + math.log(sum(math.exp(v - peak) for v in scores))
        yield log_normaliser - scores[target]

# Materialise the generator so that the per-sample losses are printed.
print(list(ce_loss(*multi_class_data)))

[0.6897266357726802, 0.939831065520222, 1.267949541610605]


In [32]:
# (2.2.4) NLL Loss (Multiple Classes)
loss = torch.nn.NLLLoss(reduction='none')
# NLLLoss(x,class) = -x[class]
# No log or softmax is applied inside the loss, so feeding the raw sample
# scores (instead of log-probabilities) gives the negative values printed below.

print(loss(*multi_class_data))

tensor([-0.8000, -0.5000, -0.2000])


In [33]:
def nll_loss(x, y):
    """Yield, for every sample, the negated score stored at its target class.

    Args:
        x: iterable of per-sample score rows.
        y: iterable of target class indices, one per row of ``x``.
    """
    yield from (-sample_scores[target_class] for sample_scores, target_class in zip(x, y))
        
# Materialise the generator; each entry is a 0-dim tensor.
print(list(nll_loss(*multi_class_data)))

[tensor(-0.8000), tensor(-0.5000), tensor(-0.2000)]


In [34]:
# (2.2.5) L1Norm Loss (Regression)
loss = torch.nn.L1Loss(reduction='none')
# L1(x, y)= |x-y|
# (demonstrated on the same (inputs, targets) sample pair as the BCE examples)
print(loss(*single_class_data))

tensor([0.2000, 0.7000, 0.5000, 0.3000])


In [35]:
# (2.2.6) L2Norm Loss / MSE (Regression)
loss = torch.nn.MSELoss(reduction='none')
# L2(x, y) = (x-y)**2
# (demonstrated on the same (inputs, targets) sample pair as the BCE examples)
print(loss(*single_class_data))

tensor([0.0400, 0.4900, 0.2500, 0.0900])


In [36]:
# (2.2.7) SmoothL1 (Regression)
loss = torch.nn.SmoothL1Loss(reduction='none')
# if |x-y|<1, SmoothL1(x, y) = 0.5*L2(x, y), else SmoothL1(x, y) = L1(x, y)-0.5
# every sample below has |x-y|<1, so the printed values are 0.5*(x-y)**2
print(loss(*single_class_data))

tensor([0.0200, 0.2450, 0.1250, 0.0450])


In [37]:
# The most usual usage:

# Single class: BCELogit(=sigmoid+BCE)
# Multiple classes: CE(=log+softmax+NLL)