In [1]:
import math
from collections import OrderedDict
import torch
from torch import Tensor
torch.manual_seed(0)
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fd3380a1470>

In [2]:
class Module(object):
    """ Base class """
    def __init__(self, name):
        self.name = name
        self._parameters = OrderedDict()
        self._children = OrderedDict()
        self.training = True
        
    def __call__(self, *input, **kwargs):
        return self.forward(*input, **kwargs)
        
    def forward(self, *input):
        raise NotImplementedError
        
    def backward(self, *grad_output):
        """ backward receives as input a pointer to a tensor or a tuple of tensors containing
        the gradient of the loss (or the function of interest) wrt the module's output, accumulates
        the gradient wrt the parameters, and returns a tensor or a tuple of tensors containing the 
        gradient of the loss wrt the module's input (Application of the chain rule)"""
        raise NotImplementedError
        
    def add_children(self, module):
        print("adding child = ", module)
        assert isinstance(module, Module) and module is not None, "Not a Module."
        assert module.name not in self._children, "Module {} already exists".format(module.name)
        self._children[module.name] = module
        
    def add_parameter(self, name, param):
        assert isinstance(param, Parameter), "Not a Parameter."
        # check if parameter key is not already in the OrderedDict of parameters
        assert name not in self._parameters, "Parameter {} already exists".format(name)
        self._parameters[name] = param
        
    def param(self, recurse=True, verbose=False):
        """ param returns a dict of Parameters, each composed of a parameter tensor, 
        and a gradient tensor of same size. This list is empty for parameterless modules. """
        
        if recurse == False or isEmpty(self._children):
            if verbose: print("Parameters of module ", self.name)
            yield self._parameters
        else:
            
            for key_mod, module in self._children.items():
                yield module.param(recurse, verbose)
                
                    
    def param_per_module(self):
        if self._parameters:
            yield self._parameters
        else:
            yield None
            
    def param_tree(self, verbose):
        print("In param_tree of module {}._children = {}".format(self.name, self._children))
        for key_mod, module in self._children.items():
                print("params of module = ", key_mod)
                yield module.param_per_module(verbose)
            
def isEmpty(dict):
    if dict: return False
    else: return True

In [3]:
class Sequential(Module):
    def __init__(self, *args):
        super(Sequential, self).__init__('seq_nn')
        for index, module in enumerate(args):
            print("Adding module = {} to children".format(module.name))
            self.add_children(module)
            
    def forward(self, input):
        self.save_for_backward = input
        for key, module in self._children.items():
            #print("Applying module = {}, with key = {}".format(module.name, key))
            input = module(input)
        return input
    
    def backward(self, *grad_output):
        for key, module in reversed(self._children.items()):
            print("In Seq backward, grad_output = ", grad_output)
            grad_output = module.backward(grad_output[0])
            print("In Seq backward, grad_output = ", grad_output)
        return grad_output      

In [4]:
# Each Module may have tensor parameters, for each of which it should also have a 
# similar sized tensor gradient to accumulate the gradient during the backward pass
class Parameter(object):
    def __init__(self, tensor=None, grad=None, requires_grad=True):
        assert tensor is None or isinstance(tensor, torch.Tensor), "Not a tensor"
        self.data = tensor
        self.grad = torch.empty(tensor.size())
        self.requires_grad = requires_grad
    
    def set_data(self, tensor):
        assert tensor is None or isinstance(tensor, torch.Tensor), "Not a tensor"
        self.data = tensor  
    
    def set_grad_zero(self):
        self.grad = torch.zeros(self.grad.size())    

In [5]:
class Linear(Module):
    """ Implements a R^C -> R^D fully-connected layer:
        Input: (N x C) tensor
        Ouput: (N x D) tensor """
    def __init__(self, name, in_features, out_features, bias=True):
        assert name is not None, "Module that have parameters must have a unique name"
        super(Linear, self).__init__(name)
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        self.reset_parameters()
        self.add_parameter('weight', self.weight)
        self.add_parameter('bias', self.bias)
              
    def forward(self, input):
        self.save_for_backward = input
        output = torch.matmul(input, self.weight.data)
        if self.bias: 
            output += self.bias.data
            
        #print("Applying module {}, input = {}, output = {}".format(self.name, input, output))
        return output
              
    def backward(self, grad_output):
        input = self.save_for_backward 
        print("In Linear backward, grad_ouput = ", grad_output)
        print("self.weight.data.t() = ", self.weight.data.t())
        print(grad_output.shape)
        grad_input = torch.matmul(grad_output, self.weight.data.t())
        grad_weight = torch.matmul(input.t(), grad_output)
        self.weight.grad += grad_weight
        if self.bias: 
            grad_bias = grad_output.sum(0).squeeze(0)
            self.bias.grad += grad_bias          

        #print("weight.grad = ", self.weight.grad.t())
        return grad_input 
    
    def reset_parameters(self):
        gain = calculate_gain('linear')
        stdv = gain / math.sqrt(self.in_features)
        bound = math.sqrt(3.0) * stdv
        self.weight.data.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.data.uniform_(-bound, bound)

In [6]:
class ReLU(Module):
    def __init__(self, name=None):
        if name is None: name = 'relu'
        super(ReLU, self).__init__(name)
    
    def forward(self, input):
        self.save_for_backward = input
        #print("Applying ReLU, output = ", input.clamp(min=0))
        return input.clamp(min=0)
    
    def backward(self, grad_output):
        input = self.save_for_backward
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input
    
class Tanh(Module):
    def __init__(self, name=None):
        if name is None: name = 'tanh'
        super(Tanh, self).__init__(name)
        
    def forward(self, input):
        self.save_for_backward = input
        return torch.tanh(input)
    
    def backward(self, grad_output):
        input = self.save_for_backward
        grad_input = 1 - torch.tanh(input)**2
        return grad_input 

In [7]:
class MSELoss(Module):
    def __init__(self, name=None):
        if name is None: name = 'mse'
        super(MSELoss, self).__init__(name)
    
    def forward(self, input, target):
        assert(input.size() == target.size()), "Input size different to target size."
        self.save_for_backward_input = input
        self.save_for_backward_target = target
        se = (input - target)**2
        return torch.mean(se)

    def backward(self, grad_output=None):
        input = self.save_for_backward_input
        target = self.save_for_backward_target
        grad_se = 2*(input - target) / len(input)
        return grad_se

In [8]:
def calculate_gain(nonlinearity='relu'):
    linear_fns = ['linear', 'conv1d']
    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    else:
        raise ValueEroor("Specified non-linearity is not implemented")

In [9]:
import types
class Optimizer(object):
    def __init__(self, model, defaults):
        self.defaults = defaults
        self.model = model
    
    def zero_grad(self):
        for param_dict in self.model.param():
            if isinstance(param_dict, types.GeneratorType): 
                print("IN ZERO_GRAD, param_dict = ", param_dict)
                param_dict = next(param_dict)
                print(param_dict)
            for key, p in param_dict.items():
                if p.grad is not None:
                    p.set_grad_zero()
                
    def step(self, closure):
        raise NotImplementedError

In [10]:
def get_initial_parameters(model):
    i = 0
    for p in model_seq.param(verbose=True): # loop over generator object of each module
        for param_dict in p: # get parameters OrderedDict()
            if param_dict is not None:
                for key, param in param_dict.items(): # loop over the parameters OrderedDict()
                    if key == 'weight':
                        if i == 0: init_weight1 = param.data.t().clone()
                        if i == 1: init_weight2 = param.data.t().clone()    
                    if key == 'bias':
                        if i == 0: 
                            init_bias1 = param.data.clone()
                            i += 1
                        if i == 1: init_bias2 = param.data.clone()
    return init_weight1, init_bias1, init_weight2, init_bias2

In [11]:
class SGD(Optimizer):
    def __init__(self, model, lr=0.01):
        defaults = dict(lr=lr)
        self.lr = lr
        super(SGD, self).__init__(model, defaults)
        
    def step(self, closure):
        loss= None
        if closure is not None:
            loss = closure
    
        for p_group in self.model.param():
            for key, p in p_group.items():
                if p.grad is None:
                    continue
                d_p = p.grad
                p.data -= self.lr*d_p
        
        return loss       

In [12]:
import itertools
model = Linear('fc1', 3, 1)
for p in model.param(verbose=True):
    print("p = ", p)
    for key, param in p.items():
        print("Parameter containing:")
        if key == 'weight': print(param.data.t())
        if key == 'bias': print(param.data)
 

print()
print()
model_seq = Sequential(
    Linear('fc1', 3, 4),
    Linear('fc2', 4, 1),
    ReLU('relu1')
    )

print()
print()
for p in model_seq.param(verbose=True):
    print(p)
    for param_dict in p:
        print(param_dict)
        if param_dict is not None:
            for key, param in param_dict.items():
                print("Parameter containing:")
                if key == 'weight': print(param.data.t())
                if key == 'bias': print(param.data)

Parameters of module  fc1
p =  OrderedDict([('weight', <__main__.Parameter object at 0x7fd30b96bf28>), ('bias', <__main__.Parameter object at 0x7fd30b906da0>)])
Parameter containing:
tensor([[-0.0075,  0.5364, -0.8230]])
Parameter containing:
tensor([-0.7359])


Adding module = fc1 to children
adding child =  <__main__.Linear object at 0x7fd30b96b7f0>
Adding module = fc2 to children
adding child =  <__main__.Linear object at 0x7fd30b906f28>
Adding module = relu1 to children
adding child =  <__main__.ReLU object at 0x7fd30b906f60>


<generator object Module.param at 0x7fd30b962930>
Parameters of module  fc1
OrderedDict([('weight', <__main__.Parameter object at 0x7fd30b906cf8>), ('bias', <__main__.Parameter object at 0x7fd30b906e80>)])
Parameter containing:
tensor([[-0.3852, -0.0887, -0.9553],
        [ 0.2682,  0.2646, -0.6623],
        [-0.0198, -0.3022, -0.4122],
        [ 0.7929, -0.1966,  0.0370]])
Parameter containing:
tensor([ 0.3953,  0.6000, -0.6779, -0.4355])
<generator object 

In [13]:
model = Linear('fc1', 3, 1)
init_weight = torch.zeros([1, 3])
init_bias = torch.zeros([])
for p in model.param():
    print(p)
    for key, param in p.items():
        print("Parameter containing:")
        if key == 'weight': 
            print(param.data.t())
            init_weight = param.data.t().clone()
        if key == 'bias': 
            print(param.data)
            init_bias = param.data.clone()

input = torch.Tensor([[2., 4., 6.], [1., 4., 6.]])
output = model(input)
target = torch.Tensor([[2], [1]])

criterion = MSELoss()
optimizer = SGD(model)

torch.set_grad_enabled(False)
nb_epochs = 5
for e in range(nb_epochs):
    print()
    optimizer.zero_grad()
    output = model(input)
    loss = criterion(output, target)
    print("e = {}, loss = {}".format(e, loss))
    grad_loss_wrt_output = criterion.backward()
    print("grad_loss_wrt_output = ", grad_loss_wrt_output.t())
    model.backward(grad_loss_wrt_output)
    optimizer.step(criterion(output, target))

print()
print()
print()
from torch import nn
from torch import optim
torch.set_grad_enabled(True)
input = torch.Tensor([[2., 4., 6.], [1., 4., 6.]])
target = torch.Tensor([[2], [1]])

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(3, 1)
        self.fc.weight.data = init_weight
        self.fc.bias.data = init_bias
    def forward(self, x):
        x = self.fc(x)
        return x
    
# I want to print the gradient of the loss wrt to the model output. For this I register a backward hook.
def hook(module, gradInput, gradOutput):
    for grad in gradOutput:
        print("grad_loss_wrt_output = ", grad.t())

model_torch = Net()
for p in model_torch.parameters():
    print(p)
model_torch.register_backward_hook(hook)     
criterion_torch = nn.MSELoss()
optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.01)
for e in range(nb_epochs):
    print()
    optimizer_torch.zero_grad()
    output = model_torch(input).requires_grad_()
    loss_torch = criterion_torch(output, target)
    print("e = {}, loss = {}".format(e, loss_torch))
    loss_torch.backward()
    for p in model_torch.parameters():
        print("weight.grad = ", p.grad)
        break
    optimizer_torch.step()
    

OrderedDict([('weight', <__main__.Parameter object at 0x7fd30b96c1d0>), ('bias', <__main__.Parameter object at 0x7fd30b8f8860>)])
Parameter containing:
tensor([[ 0.1058,  0.9055, -0.9277]])
Parameter containing:
tensor([-0.6295])

e = 0, loss = 15.526599884033203
grad_loss_wrt_output =  tensor([[-4.3620, -3.4678]])
In Linear backward, grad_ouput =  tensor([[-4.3620],
        [-3.4678]])
self.weight.data.t() =  tensor([[ 0.1058,  0.9055, -0.9277]])
torch.Size([2, 1])

e = 1, loss = 0.3236342966556549
grad_loss_wrt_output =  tensor([[0.0316, 0.8039]])
In Linear backward, grad_ouput =  tensor([[0.0316],
        [0.8039]])
self.weight.data.t() =  tensor([[ 0.2277,  1.2187, -0.4579]])
torch.Size([2, 1])

e = 2, loss = 0.1539161503314972
grad_loss_wrt_output =  tensor([[-0.4285,  0.3524]])
In Linear backward, grad_ouput =  tensor([[-0.4285],
        [ 0.3524]])
self.weight.data.t() =  tensor([[ 0.2191,  1.1852, -0.5080]])
torch.Size([2, 1])

e = 3, loss = 0.15059930086135864
grad_loss_wrt_ou

In [14]:
from torch import nn
from torch import optim
torch.manual_seed(0)
torch.set_grad_enabled(False)
del nn
del optim
model_seq = Sequential(
    Linear('fc1', 3, 4),
    Linear('fc2', 4, 1),
    ReLU('relu1')
    )

input = torch.Tensor([[2., 4., 6.], [1., 4., 6.]])
target = torch.Tensor([[2], [1]])
model_seq(input)

def get_initial_parameters(model):
    i = 0
    for p in model_seq.param(verbose=True): # loop over generator object of each module
        for param_dict in p: # get parameters OrderedDict()
            if param_dict is not None:
                for key, param in param_dict.items(): # loop over the parameters OrderedDict()
                    if key == 'weight':
                        if i == 0: init_weight1 = param.data.t().clone()
                        if i == 1: init_weight2 = param.data.t().clone()    
                    if key == 'bias':
                        if i == 0: 
                            init_bias1 = param.data.clone()
                            i += 1
                        if i == 1: init_bias2 = param.data.clone()
    return init_weight1, init_bias1, init_weight2, init_bias2
init_weight1, init_bias1, init_weight2, init_bias2 = get_initial_parameters(model_seq)
print(init_weight2)                       

                    
criterion = MSELoss()
optimizer = SGD(model_seq)

torch.set_grad_enabled(False)
nb_epochs = 2
for e in range(nb_epochs):
    print()
    optimizer.zero_grad()
    output = model_seq(input)
    print("e = {}, output = {}".format(e, output))
    loss = criterion(output, target)
    print("e = {}, loss = {}".format(e, loss))
    
    grad_loss_wrt_output = criterion.backward()
    print("grad_loss_wrt_output = ", grad_loss_wrt_output.t())
    
    model_seq.backward(grad_loss_wrt_output)
    optimizer.step(criterion(output, target))
    
print()
print()
print()
from torch import nn
from torch import optim
torch.set_grad_enabled(True)
input = torch.Tensor([[2., 4., 6.], [1., 4., 6.]])
target = torch.Tensor([[2], [1]])


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(3, 4)
        
        print(self.fc1.weight.data.shape)
        self.fc1.weight.data = init_weight1
        
        self.fc2 = nn.Linear(4, 1)
        self.relu = nn.ReLU()
        
        self.fc1.bias.data = init_bias1
        self.fc2.weight.data = init_weight2
        self.fc2.bias.data = init_bias2
    def forward(self, x):
        #print("x0 = ", x)
        x = self.fc1(x)
        #print("x1 = ", x)
        x = self.fc2(x)
        #print("x2 = ", x)
        x = self.relu(x)
        #print("x3 = ", x)
        return x
    
# I want to print the gradient of the loss wrt to the model output. For this I register a backward hook.
def hook(module, gradInput, gradOutput):
    for grad in gradOutput:
        print("grad_loss_wrt_output = ", grad.t())
        
model_torch = Net()
for p in model_torch.parameters():
    print(p)
model_torch.register_backward_hook(hook)     
criterion_torch = nn.MSELoss()
optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.01)
for e in range(nb_epochs):
    print()
    optimizer_torch.zero_grad()
    output = model_torch(input).requires_grad_()
    print("e = {}, output = {}".format(e, output))
    loss_torch = criterion_torch(output, target)
    print("e = {}, loss = {}".format(e, loss_torch))
    loss_torch.backward()
    #for p in model_torch.parameters():
    #    print(p)
    #    print("weight.grad = ", p.grad)
    #    break
    optimizer_torch.step()



Adding module = fc1 to children
adding child =  <__main__.Linear object at 0x7fd30b8f8160>
Adding module = fc2 to children
adding child =  <__main__.Linear object at 0x7fd30b9208d0>
Adding module = relu1 to children
adding child =  <__main__.ReLU object at 0x7fd30b920978>
Parameters of module  fc1
Parameters of module  fc2
Parameters of module  relu1
tensor([[ 0.3424,  0.5196, -0.5871, -0.3771]])

IN ZERO_GRAD, param_dict =  <generator object Module.param at 0x7fd30b962e58>
OrderedDict([('weight', <__main__.Parameter object at 0x7fd30b920898>), ('bias', <__main__.Parameter object at 0x7fd30b9207f0>)])
IN ZERO_GRAD, param_dict =  <generator object Module.param at 0x7fd30b962e58>
OrderedDict([('weight', <__main__.Parameter object at 0x7fd30b920908>), ('bias', <__main__.Parameter object at 0x7fd30b920940>)])
IN ZERO_GRAD, param_dict =  <generator object Module.param at 0x7fd30b962e58>
OrderedDict()
e = 0, output = tensor([[2.9778],
        [1.9408]])
e = 0, loss = 0.920551061630249
grad_l

RuntimeError: size mismatch, [4 x 2], [1] at /opt/conda/conda-bld/pytorch-cpu_1549632688322/work/aten/src/TH/generic/THTensorMath.cpp:821

In [None]:
from torch import nn
from torch import optim
torch.set_grad_enabled(True)
model_seq_torch = nn.Sequential(nn.Linear(3, 4), 
                            nn.Linear(4, 1), 
                            nn.ReLU())
model_seq_torch(input)
for p in model_seq_torch.parameters():
    print(p)

In [None]:
print()
print("MODULE.PARAM")
for key, module in model_seq._children.items():
    print("module.param() = ", module.param())

For the second project, do we first accumulate the gradient then afterwards calculate the derivate of the loss wrt 
to the input.  Or do it the other way around.
They are usually unrelated computations. Think about the following scenario. You have a batch of inputs x_0 to x_9. 
And a single parameter a. Thus the forward pass for this module is s_i = a*x_i. For the backward pass we get as 
input dl/ds_i for all i and we need to compute dl/da and dl/dx_i . It is quite obvious that 
dl/da = sum x_i * dl/ds_i for all i. And dl/dx_i = dl/ds_i * a. The order in which one computes the two is irrelevant.

In [None]:
from torch import nn
from torch import optim
import torch.autograd as autograd

x = torch.tensor([[1, 2], [2, 1], [3, 4]]).type(torch.FloatTensor).requires_grad_()
y = torch.tensor([1, 0.4, 3])
#x = torch.tensor([[1., 2.]]).requires_grad_()
#y = torch.tensor([1.])

model = nn.Sequential(nn.Linear(2, 10), nn.ReLU())

print("PRINTING PARAMETERS")
for p in model.parameters():
    print("p = ", p)
y_pred = model(x)

print("PRINTING PREDICTION")
print("y_pred = ", y_pred)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = criterion(y_pred, y)
optimizer.zero_grad()

print("PRINTING GRADIENT")
#print("loss.grad = ", autograd.grad(loss, x))
loss.backward()
for p in model.parameters():
    print("p.grad = ", p.grad)