In [1]:
import math
import numpy as np
from collections import OrderedDict
#import torch
#from torch import Tensor
#torch.set_grad_enabled(False)

In [2]:
class Module(object):
    """ Base class """
    def __init__(self, name):
        self.name = name
        self._parameters = OrderedDict()
        self._children = OrderedDict()
        self.training = True
        
    def __call__(self, *input, **kwargs):
        return self.forward(*input, **kwargs)
        
    def forward(self, *input):
        raise NotImplementedError
        
    def backward(self, *grad_output):
        """ backward receives as input a pointer to a tensor or a tuple of tensors containing
        the gradient of the loss (or the function of interest) wrt the module's output, accumulates
        the gradient wrt the parameters, and returns a tensor or a tuple of tensors containing the 
        gradient of the loss wrt the module's input (Application of the chain rule)"""
        raise NotImplementedError
        
    def add_children(self, module):
        print("adding child = ", module)
        assert isinstance(module, Module) and module is not None, "Not a Module."
        assert module.name not in self._children, "Module {} already exists".format(module.name)
        self._children[module.name] = module
        
    def add_parameter(self, name, param):
        assert isinstance(param, Parameter), "Not a Parameter."
        assert name not in self._parameters, "Parameter {} already exists".format(name)
        self._parameters[name] = param
        
    def param(self, recurse=True):
        """ param returns a dict of Parameters, each composed of a parameter tensor, 
        and a gradient tensor of same size. This list is empty for parameterless modules. """
        if recurse == False or self._children is not None:
            print("Arrived in leaf module")
            return self.param_per_module()
        #else:
        #    for key_mod, module in self._children.items():
        #        print("Looping over children, module = ", module)
        #        for key_param, parameter in module._parameters:
        #            return param_per_module()
                    
    
    def param_per_module(self):
        if self._parameters:
            yield self._parameters
        else:
            yield None
        

In [3]:
class Sequential(Module):
    def __init__(self, *args):
        super(Sequential, self).__init__('NN')
        for index, module in enumerate(args):
            self.add_children(module)
            
    def forward(self, input):
        print("In Sequential.forward")
        self.save_for_backward = input
        for key, module in self._children.items():
            input = module(input)
        return input
    
    def backward(self, *grad_ouput):
        for key, module in self._children.items():
            grad_output = module.backward(grad_output)
        return grad_ouput
        

In [4]:
# Each Module may have tensor parameters, for each of which it should also have a 
# similar sized tensor gradient to accumulate the gradient during the backward pass
class Parameter(object):
    def __init__(self, tensor=None, grad=None, requires_grad=True):
        #assert tensor is None or isinstance(tensor, torch.Tensor), "Not a tensor"
        assert tensor is None or isinstance(tensor, np.ndarray), "Not a tensor"
        self.data = tensor
        #self.grad = torch.empty(tensor.size())
        self.grad = np.empty(tensor.size)
        self.requires_grad = requires_grad
    
    def set_data(self, tensor):
        assert tensor is None or isinstance(tensor, np.ndarray), "Not a tensor"
        self.data = tensor  
    
    def set_grad_zero(self):
        print("setting grad of {} to zero".format(self))
        #self.grad = torch.zeros(self.grad.size())
        print("self.grad.size = ", self.grad.size)
        self.grad = np.zeros(self.grad.size)
    

In [5]:
class Linear(Module):
    """ Implements a R^C -> R^D fully-connected layer:
        Input: (N x C) tensor
        Ouput: (N x D) tensor """
    def __init__(self, name, in_features, out_features, bias=True):
        super(Linear, self).__init__(name)
        self.in_features = in_features
        self.out_features = out_features
        #self.weight = Parameter(torch.Tensor(out_features, in_features))
        self.weight = Parameter(np.empty((out_features, in_features)))
        if bias:
        #    self.bias = Parameter(torch.Tensor(out_features))
            self.bias = Parameter(np.empty(out_features))
        self.reset_parameters()
        self.add_parameter('weight', self.weight)
        self.add_parameter('bias', self.bias)
              
    def forward(self, input):
        #print("Applying module {} with input = {}".format(self.name, input))
        self.save_for_backward = input
        #output = torch.matmul(input, self.weight.data.t())
        output = np.dot(input, self.weight.data.transpose())
        if self.bias: 
            output += self.bias.data
        return output
              
    def backward(self, grad_output):
        input = self.save_for_backward
        #grad_input = torch.matmul(grad_output, self.weight.data)
        #grad_weight = torch.matmul(grad_output.t(), input)
        grad_input = np.outer(grad_output.transpose(), self.weight.data)
        grad_weight = np.dot(input.transpose(), grad_output.transpose())
        self.weight.grad += grad_weight
        if self.bias: 
            grad_bias = grad_output.sum(0).squeeze(0)
            self.bias.grad += grad_bias
        return grad_input 
    
    def reset_parameters(self):
        gain = calculate_gain('linear')
        stdv = gain / math.sqrt(self.in_features)
        bound = math.sqrt(3.0) * stdv
        #self.weight.data.uniform_(-bound, bound)
        #print("self.weight = ", self.weight)
        self.weight.data = np.random.uniform(-bound, bound, self.weight.data.size)
        if self.bias is not None:
            #self.bias.data.uniform_(-bound, bound)
            self.bias.data = np.random.uniform(-bound, bound, self.bias.data.size)

In [6]:
class ReLU(Module):
    def __init__(self, name):
        super(ReLU, self).__init__(name)
    
    def forward(self, input):
        print("Applying module {} with input = ".format(self.name, input))
        self.save_for_backward = input
        return input.clamp(min=0)
    
    def backward(self, grad_output):
        input = self.save_for_backward
        grad_input = grad_output.copy()
        grad_input[input < 0] = 0
        return grad_input

In [7]:
class MSELoss(Module):
    def __init__(self, name=None):
        if name is None: name = 'mse'
        super(MSELoss, self).__init__(name)
    
    def forward(self, input, target):
        assert(input.size == target.size), "Input size different to target size."
        self.save_for_backward_input = input
        self.save_for_backward_target = target
        se = (input - target)**2
        #return torch.mean(se)
        return np.mean(se)
    
    def backward(self, grad_ouput=None):
        input = self.save_for_backward_input
        target = self.save_for_backward_target
        grad_se = 2*(input - target) / len(input)
        #return torch.mean(grad_se)
        return grad_se
        
        

In [8]:
def calculate_gain(nonlinearity='relu'):
    linear_fns = ['linear', 'conv1d']
    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    else:
        raise ValueEroor("Specified non-linearity is not implemented")

In [9]:
class Optimizer(object):
    def __init__(self, params, defaults):
        self.defaults = defaults
        #assert isinstance(params, Parameter) or isinstance(params, dict), ("params argument should be a dict of Parameter, but is = {}").format(params.type)
        self.params = params
    
    def zero_grad(self):
        print("in zero_grad, self.params = ", self.params)
        for p_group in self.params:
            print("p_group = ", p_group)
            for key, p in p_group.items():
                print("p = ", p)
                print("p.grad = ", p.grad)
                print("p.grad.size = ", p.grad.size)
                if p.grad is not None:
                    p.set_grad_zero()
                
    def step(self, closure):
        raise NotImplementedError

In [10]:
class SGD(Optimizer):
    def __init__(self, params, lr=0.001):
        defaults = dict(lr=lr)
        super(SGD, self).__init__(params, defaults)
        
    def step(self, closure):
        loss= None
        if closure is not None:
            loss = closure
    
        for p in self.params:
            print("p = ", p)
            if p.grad is None:
                continue
            d_p = p.grad
            print("d_p = ", d_p)
            p.data -= lr*d_p
        
        return loss       

In [11]:
model = Linear('fc1', 3, 1)
for p in model.param():
    print("parameter p = ", p)
    for key, param in p.items():
        print("data = ", param.data)

Arrived in leaf module
parameter p =  OrderedDict([('weight', <__main__.Parameter object at 0x7f03f0311208>), ('bias', <__main__.Parameter object at 0x7f03f0311400>)])
data =  [ 0.02483379 -0.9824095   0.90153609]
data =  [0.50886631]


In [12]:
model = Linear('fc1', 3, 1)
for p in model.param():
    print("parameter p = ", p)
    for key, param in p.items():
        print("data = ", param.data)

print()
print("MODULE.PARAM")
print(model.param())

print()
print("MODULE.__call__")
#input = torch.Tensor([[2, 4, 6], [1, 4, 6]])
input = np.array([[2, 4, 6], [1, 4, 6]])
output = model(input)
print("output = ", output)
#target = torch.Tensor([[2], [1]])
target = np.array([2, 1])

criterion = MSELoss()
optimizer = SGD(model.param())

#torch.set_grad_enabled(False)
print()
nb_epochs = 10
for e in range(nb_epochs):
    print()
    print("new epoch, e = ", e)
    for p in model.param():
        print("parameter p = ", p)
        for key, param in p.items():
            print("data = ", param.data)
            
    print()
    print("calling zero_grad")
    optimizer.zero_grad()
    print()
    
    output = model(input)
    loss = criterion(output, target)
    print("loss = ", loss)
    grad_output = criterion.backward()
    model.backward(grad_output)
    optimizer.step(criterion(output, target))

print()
print()
grad_output = criterion.backward()
print("grad_output = ", grad_output)
model.backward(grad_output)





Arrived in leaf module
parameter p =  OrderedDict([('weight', <__main__.Parameter object at 0x7f03f02bc748>), ('bias', <__main__.Parameter object at 0x7f03f02bc7b8>)])
data =  [-0.17404008  0.88734597 -0.42960771]
data =  [0.060524]

MODULE.PARAM
Arrived in leaf module
<generator object Module.param_per_module at 0x7f03f031d390>

MODULE.__call__
output =  [0.68418149 0.85822157]
Arrived in leaf module


new epoch, e =  0
Arrived in leaf module
parameter p =  OrderedDict([('weight', <__main__.Parameter object at 0x7f03f02bc748>), ('bias', <__main__.Parameter object at 0x7f03f02bc7b8>)])
data =  [-0.17404008  0.88734597 -0.42960771]
data =  [0.060524]

calling zero_grad
in zero_grad, self.params =  <generator object Module.param_per_module at 0x7f03f031d390>
p_group =  OrderedDict([('weight', <__main__.Parameter object at 0x7f03f02bc748>), ('bias', <__main__.Parameter object at 0x7f03f02bc7b8>)])
p =  <__main__.Parameter object at 0x7f03f02bc748>
p.grad =  [ 0.02483379 -0.9824095   0.901

array([[ 0.22900516, -1.16758626,  0.56528577],
       [ 0.02467513, -0.12580652,  0.06090911]])

In [13]:
model = Sequential(
    Linear('fc1', 2, 4),
    Linear('fc2', 4, 1),
    ReLU('relu1')
    )

print("MODULE.__call__")
model(x)

print()
print("MODULE.PARAM")
for key, module in model._children.items():
    print("module.param() = ", module.param())

adding child =  <__main__.Linear object at 0x7f03f02ae240>
adding child =  <__main__.Linear object at 0x7f03f02ae320>
adding child =  <__main__.ReLU object at 0x7f03f02ae2b0>
MODULE.__call__


NameError: name 'x' is not defined

For the second project, do we first accumulate the gradient then afterwards calculate the derivate of the loss wrt 
to the input.  Or do it the other way around.
They are usually unrelated computations. Think about the following scenario. You have a batch of inputs x_0 to x_9. 
And a single parameter a. Thus the forward pass for this module is s_i = a*x_i. For the backward pass we get as 
input dl/ds_i for all i and we need to compute dl/da and dl/dx_i . It is quite obvious that 
dl/da = sum x_i * dl/ds_i for all i. And dl/dx_i = dl/ds_i * a. The order in which one computes the two is irrelevant.

In [None]:
from torch import nn
from torch import optim
import torch.autograd as autograd

x = torch.tensor([[1, 2], [2, 1], [3, 4]]).type(torch.FloatTensor).requires_grad_()
y = torch.tensor([1, 0.4, 3])
#x = torch.tensor([[1., 2.]]).requires_grad_()
#y = torch.tensor([1.])

model = nn.Sequential(nn.Linear(2, 10), nn.ReLU())

print("PRINTING PARAMETERS")
for p in model.parameters():
    print("p = ", p)
y_pred = model(x)

print("PRINTING PREDICTION")
print("y_pred = ", y_pred)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = criterion(y_pred, y)
optimizer.zero_grad()

print("PRINTING GRADIENT")
#print("loss.grad = ", autograd.grad(loss, x))
loss.backward()
for p in model.parameters():
    print("p.grad = ", p.grad)