In [1]:
import torch
from torch import Tensor
import numpy as np

# Suggestion on Naming

    layer_par    :  Layer    (it's a class)
    forward_par  :  Forward  (class)
    backward_par :  Backward (class)
    
    
    make_arch    :  makeArhc (it's a method in Net class)
    forward_list :  fw_list  (attribute list)
    backward_list:  bw_list  (attribute list)
    

# Auxiliary Classes

In [3]:
class layer_par:   # Layer
    """
    This class contains parameters of each layer. We initialize them in constructor.
    
    inputs:
        dim_in      :  the input_ dimension of fully connected layer
        dim_out     :  the output_ dimension of fully connected layer
        
    returns:
        Nothing
    """
    
    def __init__(self, dim_in, dim_out):
        self.dim_in  = dim_in
        self.dim_out = dim_out
        self.b = Tensor(dim_out,1).fill_(0)         # bias of each layer
        self.w = Tensor(dim_out,dim_in).normal_()   # w of each layer
    # TODO: I suggest that we add activation to this as well
    
    
class forward_par:    # change the name to: "ForwardAns
    """
    This class keeps track of all the variables produced in forward pass of some layer. i.e, x and s.
    
    inputs:
        N           :  number of data
        dim_in      :  the input_ dimension of fully connected layer
        dim_out     :  the output_ dimension of fully connected layer
        
    returns:
        Nothing
    """
    def __init__(self, dim_out, N):
        self.s = Tensor(N,dim_out).fill_(0)           # s after each layer
        self.x = Tensor(N,dim_out).fill_(0)           # x after each layer:   x = Activation (s)
        self.N = N
        
class backward_par:   # or backward par     #BackwardAns
    """
    This class keeps track of all the variables we need to evaluate the damn gradients for each layer...
    
    inputs:
        N           :  number of data
        dim_in      :  the input_ dimension of fully connected layer
        dim_out     :  the output_ dimension of fully connected layer

    """
    #TODO: check the size of db
    def __init__(self, dim_in, dim_out, N):
        self.db = Tensor(N, dim_out,1).fill_(0)         # dL/db
        self.dw = Tensor(N, dim_out,dim_in).fill_(0)    # dL/dw      # WARNING: out*in or in*out
        self.ds = Tensor(N, dim_out).fill_(0)           # dL/ds
        self.dx = Tensor(N, dim_out).fill_(0)           # dL/dx
        
        
class Linear:
    """
    An class that contains objects which only store layar's in/out connections dimension
    
    input_s:
        dim_in      :  the input_ dimension of fully connected layer
        dim_out     :  the output_ dimension of fully connected layer
        
    returns:
        Nothing 
    """
    
    def __init__(self,dim_in, dim_out):
        self.input_ = dim_in
        self.output_ = dim_out          
    # TODO: The linear is really wierd thing... all we get here is already in the upper class. we may omit this somehow

        
def Activation(code,input_):
    """
    A class that specify the needed activation with respect to the following code conversion
        0: Relu(x)
        1: Tanh(x)
        2: Sigmoid(x)
    
    This class works as functional package of pytorch
    
    input_s:
        code        :  the code for each activation (0,1,2)
        input__tensor:  the input_ tensor
        
    returns:
        result      :  the output_ of requested activation function with the same shape as input_ tensor
    """
    result = Tensor(input_.shape)
    # Relu
    if code ==0:
        result = input_ - (input_<0).float()*input_
    # Tanh
    elif code ==1:
        result = torch.tanh(input_)
    # Sig
    elif code ==2:
        result = 1.0/(1 + torch.exp(-input_))
    # linear
    elif code ==3:
        result = input_
    # error
    else: raise ValueError('Unknown Code For Activation')
        
    return result 


def dActivation(code,input_):
    """
    A class that specify the needed derivative of activation with the same encoding convenstion
        0: dRelu(x)
        1: dTanh(x)
        2: dSigmoid(x)
    
    This class works as functional package of pytorch
    
    input_s:
        code        :  the code for each activation (0,1,2)
        input__tensor:  the input_ tensor
        
    returns:
        result      :  the output_ of requested activation function with the same shape as input_ tensor
    """
    result = Tensor.new(input_)
    # dRelu
    if code ==0:
        result = Tensor(input_.shape).fill_(1.0) - (input_<=0).float()*Tensor(input_.shape).fill_(1.0)
    # dTanh
    elif code ==1:
        result = 1-(torch.tanh(input_))**2
    # dSig
    elif code ==2:
        result = Activation(code,input_)*(1-Activation(code,input_))
    # linear
    elif code ==3:
        result = Tensor(input_.shape).fill_(1.0)
    else: raise ValueError('Unknown Code For derivative of Activation')
    
    return result 



# Loss and SGD

In [4]:
def loss(v, t):
    l_ = torch.sum(torch.pow(v-t,2))/(len(v))
    return l_

def dloss(v,t):
    return 2.*(v-t)


# Network Class

In [5]:
class Net:
    """
    The network class. It has the following methods:
        param      :  returns the parameter which is asked for. Not the data! The object... 
                        Data is accessible through object.data method)
        make_arch  :  makes the architecture of the network by taking a sequential list of [fc1,act1,fc2,act2,...]
    """
    
    #TODO:
    
    
    def __init__(self, seq,X,Y=None):
        # the lists below keep the following objects: forward_par, backward_par for each single layer.
        # Hence these list are as long as the nuber of layers +1 (for input).
        self.forward_list  = [] # Stores forward pass parameters of each layer (s,x).   
        self.backward_list = [] # Stores backward pass parameters of each layer (grads:dL/ds, dL/dx, dL/dW, dL/db)
        
        
        # the lists below, are as long as number of layers. They store the following objects:layer_par, type of 
        # activation requested by user.  
        self.act_list   = []    # stores the requested activation functions in codes. Elements are "0","1" or "2"
        self.param_list = []    # Stores parameters of each layer (W,b).                       
    
        self.n_layer = 0        # keeps the number of layers of network
        self.N = len(X)         # nb of batch
        
        self.grad_list     = [] # Stores dL/dW and dL/db for all batch data at each layer
        
        # this method, builds the network and it is called automatically by constructing Net model.
        self.make_arch(seq)     # makes the architecture based on the the list "seq"
  
    
    def make_arch(self,seq):    # makeArch
        """
        This function fills param_list and act_list and also, evaluates number of layers.
        
        input:
            seq : a list that contains both activation and layers in a sequential manner
                  Example: [Linear(3,5), 'relu', Linear(5,64), 'Sig', Linear(64,1), 'Tanh']
        """
        
        seq_len = len(seq)                  # number of layer *2 (because of the activations...)
        self.n_layer = seq_len//2           # I just want to have it :)
        
        for layer in list(range(0,seq_len,2)):
            
            # seq[layer] is an instance of object "Linear". Here we get the in/out dim of the layer
            dim_in, dim_out = seq[layer].input_ , seq[layer].output_ 
            
            # making a new laye_par instance and adding it to the param_list
            self.param_list.append (layer_par(dim_in, dim_out) ) 
            
            # activation recognition : encode activations in "act_list"
            if seq[layer+1]=='relu':
                self.act_list.append(0)
            elif seq[layer+1]=='tanh':
                self.act_list.append(1)
            elif seq[layer+1]=='sig':
                self.act_list.append(2)
            elif seq[layer+1]=='lin':
                self.act_list.append(3)
            else: raise ValueError('Unknown Activation')
                    
    
    def forward(self,X): 
        """
        This method evaluates the forward pass and returns the values. This function is written such that it take a
        batch input and returns the forward pass of the batch.
        
        input: 
            X    :   a tensor of size(B, d_in) 
            
        returns:
            s    :   a tensor of size(B, d_out) 
            x    :   a tensor of size(B, d_out) 
        """
        
        x=X
        for layer, prm in enumerate(self.param_list):   # layer = [0,1,2,...,nb_layer -1]  ;  prm = param_list[layer]
            s = (x.mm(prm.w.t()) + prm.b.t())           # written consistant for batch :  s = (Wx+b).t()  (N,d_out) 
            x = Activation(self.act_list[layer], s)     # size = (N,d_out)       
        return x,s
    
        """
        Hint:
            prm.w.shape = (d_out, d_in)
            prm.b.shpae = (d_out, 1)
            X.shape     = (N,d_in)
            x.shape     = (N,d_out)
            s.shape     = (N.d_out)
        """
    
    
    
    def backward (self,X,Y):
        """
        This method fill both forward_list and backward_list by constructing instances of forward_par and backward_par.
        The object forward_par, contains (s,x), and the object backward_par contains the gradients with respect to para-
        meteres of each layer.
        
        These object contain the gradients or parameters of to all batch data. The sum of all parameters determines the
        total gradient of the batch. This summation is stored in the grad_list.
        
        input:
            X        :   training set of size (B,d)
            Y        :   target set of size (B,d)
            
        returns:
            grad_list:   a list of objects wich contain total gradient of the loss function wrt W and b for batch data
        """
        
        # a list of length = (number of layers + 1)
        self.backward_list = list(range(self.n_layer+1))     # will be filled with backward_par objects later
        
        
        # add X0 to the forward list
        self.forward_list.append( forward_par(self.param_list[0].dim_in, self.N) ) # Note that dim = dim_in for inputs
        self.forward_list[0].x =X     # x0
        #self.forward_list[0].s =X    # s0 is set to be x0
        
        
        # this computes forward pass and saves s and x
        x=X
        for layer, prm in enumerate(self.param_list): # layer = [0,1,2,...,nb_layer -1]  ; prm = param_list[layer]  
            self.forward_list.append( forward_par(prm.dim_out, self.N) ) # making a constructor for forward parameters

            s = (x.mm(prm.w.t()) + prm.b.t())         # written consistant for batch: s.shape = (N,d_out)
            self.forward_list[layer+1].s = s             
            x = Activation(self.act_list[layer], s)    
            self.forward_list[layer+1].x = x
            
            
        # this computes backward
        for layer in list(range (self.n_layer,0,-1)): # layer=[nb_layer, nb_layer-1 , ..., 1]
            
            # this guy makes a backward_par (dim_in,dim_out, N) at component "layer" of backward_list
            self.backward_list[layer] = backward_par( self.param_list[layer-1].dim_in , self.param_list[layer-1].dim_out ,self.N)
            """
            hint: each object in the list above has the following attributes:
                db = Tensor(N, dim_out,1)         # dL/db
                dw = Tensor(N, dim_out,dim_in)    # dL/dw
                ds = Tensor(N, dim_out)           # dL/ds
                dx = Tensor(N, dim_out)           # dL/dx
            """
            
            # dl/dx : size = N,d_out
            if layer == self.n_layer:
                self.backward_list[layer].dx = dloss(x,Y)   
            else:
                self.backward_list[layer].dx = self.backward_list[layer+1].ds.mm(self.param_list[layer].w)
            """
            hint:
                dloss(x,Y)                 = (N, d_out_{last_layer})
                backward_list[layer+1].ds  = (N, d_out_{layer+1})
                param_list[layer].w        = (d_out_{layer+1}, d_in_{layer+1}) = (d_out_{layer+1}, d_out_{layer}) 
            """
                
            #dl/ds : size = N,d_out
            self.backward_list[layer].ds = self.backward_list[layer].dx * dActivation(self.act_list[layer-1], self.forward_list[layer].s ) 
            """
            hint:
                backward_list[layer].dx  = (N, d_out)
                self.act_list[layer-1]   : activation type of layer = layer (0,1,2 or 3)
                forward_list[layer].s    = (N, d_out)
                dActivation (code, s )   = (N, d_out)
            """
            
            #dl/dw : size = (N, d_out, d_in)
            ds_unsq = self.backward_list[layer].ds.unsqueeze(1)    
            x_unsq  = self.forward_list[layer-1].x.unsqueeze(2)
            self.backward_list[layer].dw = ds_unsq * x_unsq        # enjoying broadcasting magic
            """
            hint:
                backward_list[layer].ds        = (N, d_out)
                self.forward_list[layer-1].x   = (N, d_in)
                ds_unsq                        = (N, 1, d_out)
                dx_unsq                        = (N, d_in, 1)
                backward_list[layer].dw        = (N, d_out, d_in)
            """
            
            #dl/db : size = (N, d_out)
            self.backward_list[layer].db = self.backward_list[layer].ds
            """
            hint:
                backward_list[layer].ds        = (N, d_out)
                backward_list[layer].db        = (N, d_out)
            """
           
        # summing all batch grads
        for layer in list (range(0,self.n_layer)):
            # the list below contains the same object param_list, but instead of s and w, we have stored the 
            # gradients wrt them. We did this trick to avoid defining a new class.
            self.grad_list.append(layer_par(self.param_list[layer].dim_in,self.param_list[layer].dim_out) )
            
            self.grad_list[layer].b = self.backward_list[layer+1].db.sum(0)/self.N   # watchout! these are grad!
            self.grad_list[layer].w = self.backward_list[layer+1].dw.sum(0)/self.N   # watchout! these are grad!
        
        return self.grad_list


# Draft, tests, and other stuff

In [6]:
x = Tensor(7).normal_()
X = torch.cat((x.view(1,-1),x.view(1,-1)*3,x.view(1,-1)*x.view(1,-1)),0)
Y= Tensor(3,10).normal_();

In [7]:
seq = [Linear(7,2),'lin',Linear(2,5),'relu',Linear(5,10),'lin']    
model = Net(seq,X,Y)

In [8]:
win = model.backward(X,Y)

In [9]:
win[2].w



Columns 0 to 7 
 -6.6973  -6.3069   4.0240   2.1851   5.2115  -0.1535  -8.4028   1.3085
-24.3491 -22.9300  14.6298   7.9443  18.9472  -0.5580 -30.5498   4.7571
  0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000
-10.2483 -31.1057   4.7990  17.5017   0.0273  27.3381 -17.7607  20.3176
 -2.3130  -2.1782   1.3897   0.7546   1.7998  -0.0530  -2.9020   0.4519

Columns 8 to 9 
  8.0538  -5.7240
 29.2809 -20.8107
  0.0000   0.0000
 22.3448  26.3268
  2.7815  -1.9769
[torch.FloatTensor of size 5x10]

In [10]:
model.forward(X)

(
 -0.2821 -2.2834 -0.1608  1.1885 -0.1352  2.1024 -1.3489  0.9070  1.5557  1.9681
 -0.8463 -6.8502 -0.4825  3.5655 -0.4056  6.3072 -4.0466  2.7209  4.6672  5.9043
 -5.8535 -6.5176  3.9885  2.7681  5.6773  1.2012 -7.6615  0.9167  9.4364 -5.8639
 [torch.FloatTensor of size 3x10], 
 -0.2821 -2.2834 -0.1608  1.1885 -0.1352  2.1024 -1.3489  0.9070  1.5557  1.9681
 -0.8463 -6.8502 -0.4825  3.5655 -0.4056  6.3072 -4.0466  2.7209  4.6672  5.9043
 -5.8535 -6.5176  3.9885  2.7681  5.6773  1.2012 -7.6615  0.9167  9.4364 -5.8639
 [torch.FloatTensor of size 3x10])

In [11]:
b= Tensor(2,4).normal_()

In [13]:
range(1,5)

range(1, 5)

In [14]:
a= Tensor (3,1,5).normal_()
b= Tensor (3,2,1).normal_()

In [15]:
a*b


(0 ,.,.) = 
 -0.0916 -0.2660  0.4003 -0.2421 -0.7329
  0.0676  0.1963 -0.2955  0.1787  0.5411

(1 ,.,.) = 
  2.7904  4.0602 -4.9692 -2.6654 -4.8327
  0.4325  0.6293 -0.7702 -0.4131 -0.7490

(2 ,.,.) = 
  0.9442  0.1140 -0.9457  0.0882  0.9926
  0.7680  0.0927 -0.7692  0.0717  0.8073
[torch.FloatTensor of size 3x2x5]