In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F 


In [12]:
# Build a 3x4 grid holding 0..11 and cast it to double precision in one chain.
a = torch.arange(12).view(3, 4).double()
print(a)

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]], dtype=torch.float64)


In [13]:
# Check the element type of `a` after the cast to "torch.DoubleTensor".
a.dtype

torch.float64

In [14]:
# dim=-1 applies softmax along the last axis, i.e. across each row of `a`.
b = F.softmax(a, dim=-1)

In [15]:
# Display the softmax result; with dim=-1 every row sums to 1.
b

tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439]], dtype=torch.float64)

In [16]:
# dim=0 normalizes down each column instead, so every column sums to 1.
b = F.softmax(a, dim=0)
b

tensor([[3.2932e-04, 3.2932e-04, 3.2932e-04, 3.2932e-04],
        [1.7980e-02, 1.7980e-02, 1.7980e-02, 1.7980e-02],
        [9.8169e-01, 9.8169e-01, 9.8169e-01, 9.8169e-01]], dtype=torch.float64)

In [18]:
# For this 2-D tensor dim=1 is the last axis, so the result matches dim=-1.
b = F.softmax(a, dim=1)
b

tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439]], dtype=torch.float64)

In [None]:
import torch
import numpy as np
import torch.nn as nn
from torch.autograd import Variable


def _concat(xs):
  return torch.cat([x.view(-1) for x in xs])


class Architect(object):
  """Optimizes a model's architecture parameters (alpha).

  Besides the usual ``nn.Module`` API, the wrapped ``model`` must provide
  ``arch_parameters()``, ``_loss(input, target)`` and ``new()``; these are the
  only model-specific methods this class calls.
  """

  def __init__(self, model, args):
    """Store the model and build the Adam optimizer for its arch parameters.

    Args:
      model: network exposing ``arch_parameters()``, ``_loss()`` and ``new()``.
      args: object providing ``momentum``, ``weight_decay``,
        ``arch_learning_rate`` and ``arch_weight_decay``.
    """
    self.network_momentum = args.momentum
    self.network_weight_decay = args.weight_decay
    self.model = model
    self.optimizer = torch.optim.Adam(self.model.arch_parameters(),
        lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)

  def _compute_unrolled_model(self, input, target, eta, network_optimizer):
    """Return a copy of the model after one virtual weight-update step.

    The new weights are ``theta - eta * (moment + dtheta)`` where ``moment`` is
    the optimizer's momentum buffer scaled by ``network_momentum`` and
    ``dtheta`` is the loss gradient plus ``network_weight_decay * theta``.

    Args:
      input, target: batch passed to ``model._loss``.
      eta: step size of the virtual update.
      network_optimizer: optimizer whose ``state[p]['momentum_buffer']``
        entries are read, when present.
    """
    loss = self.model._loss(input, target)
    theta = _concat(self.model.parameters()).data

    # Only a missing (or None) momentum buffer means "no momentum yet"; any
    # other failure should surface instead of being swallowed by a bare except.
    buffers = [network_optimizer.state.get(p, {}).get('momentum_buffer')
               for p in self.model.parameters()]
    if any(buf is None for buf in buffers):
      moment = torch.zeros_like(theta)
    else:
      moment = _concat(buffers).mul_(self.network_momentum)

    dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data + self.network_weight_decay*theta

    # `alpha` is passed by keyword: the positional form sub(alpha, other) is a
    # deprecated overload.
    unrolled_model = self._construct_model_from_theta(theta.sub(moment + dtheta, alpha=eta))
    return unrolled_model

  def step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled):
    """Run one optimizer step on the architecture parameters.

    Example call site:
      architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=args.unrolled)

    Args:
      input_train, target_train: training batch (used only when ``unrolled``).
      input_valid, target_valid: batch the architecture loss is computed on.
      eta: step size of the virtual weight update (used only when ``unrolled``).
      network_optimizer: optimizer of the network weights (used only when
        ``unrolled``).
      unrolled: if true, backpropagate through the unrolled model; otherwise
        use the plain gradient of the loss on the validation batch.
    """
    self.optimizer.zero_grad()
    if unrolled:
      self._backward_step_unrolled(input_train, target_train, input_valid, target_valid, eta, network_optimizer)
    else:
      self._backward_step(input_valid, target_valid)
    self.optimizer.step()

  def _backward_step(self, input_valid, target_valid):
    """Backpropagate the model loss on the validation batch."""
    loss = self.model._loss(input_valid, target_valid)
    loss.backward()

  def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer):
    """Write the unrolled architecture gradient into ``model.arch_parameters()``.

    The gradient is the unrolled model's arch gradient on the validation batch
    minus ``eta`` times the finite-difference Hessian-vector product computed
    on the training batch.
    """
    unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer)
    unrolled_loss = unrolled_model._loss(input_valid, target_valid)

    unrolled_loss.backward()
    dalpha = [v.grad for v in unrolled_model.arch_parameters()]
    vector = [v.grad.data for v in unrolled_model.parameters()]
    implicit_grads = self._hessian_vector_product(vector, input_train, target_train)

    for g, ig in zip(dalpha, implicit_grads):
      g.data.sub_(ig.data, alpha=eta)

    # Copy the corrected gradients onto the real model's arch parameters so
    # that self.optimizer.step() sees them.
    for v, g in zip(self.model.arch_parameters(), dalpha):
      if v.grad is None:
        # detach() replaces the deprecated Variable(...) wrapper; the tensor
        # still shares storage with `g`, exactly as before.
        v.grad = g.detach()
      else:
        v.grad.data.copy_(g.data)

  def _construct_model_from_theta(self, theta):
    """Build a fresh model whose parameters are taken from the flat ``theta``.

    Args:
      theta: 1-D tensor holding all parameters, laid out in
        ``model.named_parameters()`` order.

    Returns:
      ``model.new()`` loaded with the current state dict, with every
      parameter overwritten from ``theta``, placed on ``theta``'s device.
    """
    model_new = self.model.new()
    model_dict = self.model.state_dict()

    params, offset = {}, 0
    for k, v in self.model.named_parameters():
      # numel() is always a Python int; np.prod(v.size()) yields float 1.0 for
      # a 0-dim parameter, which cannot be used as a slice bound.
      v_length = v.numel()
      params[k] = theta[offset: offset+v_length].view(v.size())
      offset += v_length

    assert offset == len(theta)
    model_dict.update(params)
    model_new.load_state_dict(model_dict)
    # Follow theta's device instead of hard-coding .cuda(), so this also works
    # on CPU-only machines and keeps the copy next to the source weights.
    return model_new.to(theta.device)

  def _hessian_vector_product(self, vector, input, target, r=1e-2):
    """Finite-difference estimate of the arch-gradient change along ``vector``.

    The weights are shifted to ``w + R*v`` and ``w - R*v`` with
    ``R = r / ||vector||``; the arch gradients of the loss at both points are
    differenced and divided by ``2*R``. The weights are restored afterwards.

    Args:
      vector: list of tensors, one per model parameter.
      input, target: batch passed to ``model._loss``.
      r: numerator of the perturbation scale.

    Returns:
      List of tensors, one per architecture parameter.
    """
    # NOTE(review): if every entry of `vector` is zero the norm is 0 and R
    # becomes inf; there is no guard for that case here.
    R = r / _concat(vector).norm()
    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(v, alpha=R)
    loss = self.model._loss(input, target)
    grads_p = torch.autograd.grad(loss, self.model.arch_parameters())

    # Move from w + R*v to w - R*v in a single step.
    for p, v in zip(self.model.parameters(), vector):
      p.data.sub_(v, alpha=2*R)
    loss = self.model._loss(input, target)
    grads_n = torch.autograd.grad(loss, self.model.arch_parameters())

    # Restore the original weights.
    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(v, alpha=R)

    return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)]

In [21]:
# Out-of-place subtraction: the difference is returned and `a` stays unchanged.
a = torch.tensor([1, 3.5])
b = torch.tensor([4., 8.])
c = torch.tensor([2., 10])
a - b

tensor([-3.0000, -4.5000])

In [31]:
# Random 4-D tensor of shape (2, 2, 3, 4).
x = torch.randn(2, 2, 3, 4)

In [32]:
# Display the random tensor so its layout can be compared with the flattened copy.
x

tensor([[[[ 0.2644,  0.7137,  0.5636, -1.2527],
          [-1.7035,  0.6612,  0.4075, -0.4136],
          [-3.2694, -0.3403,  0.7133, -0.8275]],

         [[-0.5982, -0.7934, -0.8426,  0.6049],
          [ 1.1027, -2.1297,  1.1801, -0.6827],
          [-0.5265,  0.7953, -0.7965,  1.5846]]],


        [[[-0.4935,  1.3858,  0.5467,  1.0349],
          [-0.3063,  0.9090, -0.8970,  0.7574],
          [-0.9727, -1.2309,  0.8087, -1.5497]],

         [[ 0.8815,  0.6950, -0.9820,  1.2215],
          [ 0.0970,  1.7433,  0.6720,  1.4634],
          [ 1.3636, -0.2068, -1.2228,  1.0571]]]])

In [33]:
# Iterating a tensor yields its slices along dim 0; flatten each slice and
# join them end to end into one 1-D tensor.
t = torch.cat(tuple(piece.view(-1) for piece in x))

In [35]:
# Display the flattened tensor; values keep the order in which `x` prints them.
t

tensor([ 0.2644,  0.7137,  0.5636, -1.2527, -1.7035,  0.6612,  0.4075, -0.4136,
        -3.2694, -0.3403,  0.7133, -0.8275, -0.5982, -0.7934, -0.8426,  0.6049,
         1.1027, -2.1297,  1.1801, -0.6827, -0.5265,  0.7953, -0.7965,  1.5846,
        -0.4935,  1.3858,  0.5467,  1.0349, -0.3063,  0.9090, -0.8970,  0.7574,
        -0.9727, -1.2309,  0.8087, -1.5497,  0.8815,  0.6950, -0.9820,  1.2215,
         0.0970,  1.7433,  0.6720,  1.4634,  1.3636, -0.2068, -1.2228,  1.0571])

In [34]:
# 2 * 2 * 3 * 4 = 48 elements in a single dimension.
t.shape

torch.Size([48])

In [37]:
# Computes t - 0.95 * noise. `alpha` is passed by keyword because the
# positional form sub(alpha, other) is a deprecated overload.
# The (1, 48) operand broadcasts against the 1-D `t`, so the result is 2-D.
t.sub(torch.randn(48).view(1, 48), alpha=0.95)

tensor([[ 0.7225,  0.7061,  0.7824, -0.3263, -1.4461,  2.9128,  0.5856, -1.2731,
         -3.8618, -0.0942, -0.5922, -0.6358, -0.9077,  0.0956, -1.4553,  0.3969,
          3.1033, -1.5001, -0.8582, -2.4838, -0.6689, -0.0579, -0.0465,  2.8432,
         -0.4976,  2.4133, -0.4184,  4.0144, -0.3208,  1.2433, -2.0195,  2.3475,
         -2.4071, -2.2237,  2.0568, -1.3999, -0.2721,  0.3594, -1.7325,  1.5531,
          0.2143,  0.2022,  2.4121,  2.5769,  2.6890,  2.1729, -1.3128, -0.0271]])

In [40]:
# Double-precision operands: 1..7 and a constant vector of 4s.
a = torch.arange(1, 8, dtype=torch.float64)
b = torch.full((7,), 4, dtype=torch.float64)


In [41]:
# c = a - 0.6 * b. `alpha` is passed by keyword because the positional form
# sub(alpha, other) is a deprecated overload.
c = a.sub(b, alpha=0.6)

In [42]:
# Each element is a - 0.6 * b, e.g. 1 - 0.6 * 4 = -1.4 for the first entry.
c

tensor([-1.4000, -0.4000,  0.6000,  1.6000,  2.6000,  3.6000,  4.6000],
       dtype=torch.float64)

In [None]:
# Scratch note (kept as a comment so the cell does not raise a SyntaxError):
# -2  -1  0 1 2 3 4