In [1]:
import torch
from torch import Tensor
import numpy as np

# Auxiliary Classes

In [20]:
class layer_par:   # Layer
    """
    Container for the trainable parameters of one fully connected layer.

    inputs:
        dim_in      :  the input dimension of the fully connected layer
        dim_out     :  the output dimension of the fully connected layer

    attributes:
        dim_in, dim_out :  the sizes passed to the constructor
        b               :  bias, shape (dim_out, 1), filled with zeros
        w               :  weights, shape (dim_out, dim_in), filled by normal_()

    returns:
        Nothing
    """

    def __init__(self, dim_in, dim_out):
        self.dim_in, self.dim_out = dim_in, dim_out
        # bias starts at zero; kept as a column so that b.t() broadcasts over a batch
        self.b = torch.zeros(dim_out, 1)
        # weights are stored as (out, in) and drawn in place with normal_()
        self.w = torch.empty(dim_out, dim_in).normal_()
    # TODO: I suggest that we add activation to this as well
    
    
class forward_par:    # change the name to: "ForwardAns
    """
    Holds the two quantities the forward pass produces for one layer: s and x = Activation(s).

    inputs:
        dim_out     :  the output dimension of the layer (second dimension of s and x)
        N           :  number of data points in the batch (first dimension of s and x)

    attributes:
        s  :  tensor of shape (N, dim_out), initialised to zeros
        x  :  tensor of shape (N, dim_out), initialised to zeros
        N  :  the batch size passed to the constructor

    returns:
        Nothing
    """
    def __init__(self, dim_out, N):
        self.N = N
        # both buffers start as zeros; callers overwrite them with the real values
        self.s = torch.zeros(N, dim_out)           # s after each layer
        self.x = torch.zeros(N, dim_out)           # x after each layer:   x = Activation (s)
        
class backward_par:   # or backward par     #BackwardAns
    """
    Holds the per-sample gradients of the loss for one layer.

    inputs:
        dim_in      :  the input dimension of the fully connected layer
        dim_out     :  the output dimension of the fully connected layer
        N           :  number of data points in the batch

    attributes (all initialised to zeros):
        db  :  dL/db, shape (N, dim_out, 1)
        dw  :  dL/dw, shape (N, dim_out, dim_in)
        ds  :  dL/ds, shape (N, dim_out)
        dx  :  dL/dx, shape (N, dim_out)
    """
    #TODO: check the size of db
    def __init__(self, dim_in, dim_out, N):
        # gradients w.r.t. the layer's pre-activation and output
        self.ds = torch.zeros(N, dim_out)              # dL/ds
        self.dx = torch.zeros(N, dim_out)              # dL/dx
        # gradients w.r.t. the layer's parameters, one slice per sample
        self.db = torch.zeros(N, dim_out, 1)           # dL/db
        self.dw = torch.zeros(N, dim_out, dim_in)      # dL/dw      # WARNING: out*in or in*out
        
        
class Linear:
    """
    A lightweight descriptor that only stores a layer's in/out connection dimensions.

    inputs:
        dim_in      :  the input dimension of the fully connected layer
        dim_out     :  the output dimension of the fully connected layer

    attributes:
        input_      :  dim_in
        output_     :  dim_out

    returns:
        Nothing
    """

    def __init__(self,dim_in, dim_out):
        self.input_, self.output_ = dim_in, dim_out
    # TODO: The linear is a really weird thing... all we get here is already in the upper class. we may omit this somehow

        
def Activation(code,input_):
    """
    Apply the activation function selected by `code` element-wise to `input_`.

    Code convention:
        0: Relu(x)
        1: Tanh(x)
        2: Sigmoid(x)
        3: Linear (identity)

    This function works like the functional package of pytorch.

    inputs:
        code        :  the code of the requested activation (0,1,2,3)
        input_      :  the input tensor

    returns:
        result      :  the output of the requested activation, same shape as input_
                       (for code 3 the input tensor itself is returned, not a copy)

    raises:
        ValueError  :  if code is not one of 0,1,2,3
    """
    # Relu: clamp instead of "x - (x<0)*x", which turns -inf into NaN (-inf + inf)
    if code == 0:
        return input_.clamp(min=0)
    # Tanh
    if code == 1:
        return torch.tanh(input_)
    # Sig
    if code == 2:
        return torch.sigmoid(input_)
    # linear
    if code == 3:
        return input_
    # error
    raise ValueError('Unknown Code For Activation')


def dActivation(code,input_):
    """
    Evaluate the derivative of the activation selected by `code`, element-wise at `input_`.

    Code convention (same as for the activations themselves):
        0: dRelu(x)     -> 0 where x <= 0, 1 elsewhere
        1: dTanh(x)     -> 1 - tanh(x)^2
        2: dSigmoid(x)  -> sig(x) * (1 - sig(x))
        3: dLinear(x)   -> 1

    This function works like the functional package of pytorch.

    inputs:
        code        :  the code of the activation (0,1,2,3)
        input_      :  the input tensor (the pre-activation s)

    returns:
        result      :  the derivative, same shape as input_; built from input_ so that it
                       keeps input_'s tensor type instead of always being a default FloatTensor

    raises:
        ValueError  :  if code is not one of 0,1,2,3
    """
    # dRelu: start from 1 and knock out the entries where x <= 0
    if code == 0:
        return 1 - (input_ <= 0).type_as(input_)
    # dTanh
    if code == 1:
        return 1 - torch.tanh(input_) ** 2
    # dSig: evaluate the sigmoid once and reuse it
    if code == 2:
        sig = torch.sigmoid(input_)
        return sig * (1 - sig)
    # linear
    if code == 3:
        return torch.ones_like(input_)
    raise ValueError('Unknown Code For derivative of Activation')



# Loss and SGD

In [21]:
def loss(v, t):
    """
    Sum of squared differences between v and t, divided by len(v).

    inputs:
        v   :  predictions
        t   :  targets, broadcastable against v

    returns:
        the summed squared error over all elements divided by the size of v's first dimension
    """
    residual = v - t
    return residual.pow(2).sum() / len(v)

def dloss(v,t):
    """
    Element-wise derivative of the squared error (v-t)^2 with respect to v.

    Note: the division by len(v) that `loss` applies is NOT included here.

    inputs:
        v   :  predictions
        t   :  targets, broadcastable against v

    returns:
        2*(v-t), same shape as v-t
    """
    residual = v - t
    return residual * 2.


# Network Class

In [22]:
class Net:
    """
    A small fully connected network with hand-written forward and backward passes.

    The architecture is described by a flat list that alternates `Linear` descriptors and
    activation names, e.g. [Linear(3,5), 'relu', Linear(5,64), 'sig', Linear(64,1), 'tanh'].

    inputs:
        seq  :  the architecture list described above
        X    :  a batch of inputs; only len(X) is used by the constructor (stored as N)
        Y    :  accepted for symmetry with X but not used by the constructor

    attributes:
        param_list    :  one layer_par (w, b) per layer
        act_list      :  one activation code per layer (0: relu, 1: tanh, 2: sig, 3: lin)
        n_layer       :  number of layers
        N             :  len(X) of the X given to the constructor
        forward_list  :  filled by backward(): forward_par objects for indices 0..n_layer,
                         index 0 being the network input
        backward_list :  filled by backward(): backward_par objects for indices 1..n_layer,
                         index 0 is an unused placeholder (None)
        grad_list     :  filled by backward(): batch-averaged dL/dW and dL/db, one object per layer

    methods:
        make_arch                :  builds param_list / act_list from `seq`
        forward                  :  forward pass of a batch
        backward                 :  forward + backward pass, returns grad_list
        get_param_of_layer       :  param_list[layer]
        get_forward_par_of_layer :  forward_list[layer]
        get_grad_of_layer        :  backward_list[layer]
    """

    # activation names accepted in `seq` and the codes understood by Activation / dActivation
    _ACT_CODES = {'relu': 0, 'tanh': 1, 'sig': 2, 'lin': 3}

    def __init__(self, seq,X,Y=None):
        # the lists below keep the following objects: forward_par, backward_par for each single layer.
        # Hence these lists are as long as the number of layers +1 (for input).
        self.forward_list  = [] # Stores forward pass parameters of each layer (s,x).
        self.backward_list = [] # Stores backward pass parameters of each layer (grads:dL/ds, dL/dx, dL/dW, dL/db)

        # the lists below are as long as the number of layers. They store the following objects:
        # layer_par, and the code of the activation requested by the user.
        self.act_list   = []    # stores the requested activation functions as codes (0,1,2,3)
        self.param_list = []    # Stores parameters of each layer (W,b).

        self.n_layer = 0        # keeps the number of layers of network
        self.N = len(X)         # nb of batch

        self.grad_list     = [] # Stores dL/dW and dL/db averaged over the batch, one entry per layer

        # builds the network; called automatically when a Net is constructed.
        self.make_arch(seq)     # makes the architecture based on the list "seq"

    def get_param_of_layer(self,layer):
        """Return the layer_par (w, b) stored at index `layer` of param_list."""
        return self.param_list[layer]

    def get_forward_par_of_layer(self,layer):
        """Return the forward_par (s, x) stored at index `layer` of forward_list (0 is the input)."""
        return self.forward_list[layer]

    def get_grad_of_layer(self,layer):
        """Return the backward_par stored at index `layer` of backward_list (index 0 is None)."""
        return self.backward_list[layer]

    def make_arch(self,seq):    # makeArch
        """
        This function (re)builds param_list and act_list and also evaluates the number of layers.

        input:
            seq : a list that contains both activations and layers in a sequential manner.
                  Activation names are matched case-insensitively against
                  'relu', 'tanh', 'sig' and 'lin'.
                  Example: [Linear(3,5), 'relu', Linear(5,64), 'sig', Linear(64,1), 'tanh']

        raises:
            ValueError : if seq has an odd number of entries or an activation name is unknown
        """

        seq_len = len(seq)                  # number of layers *2 (because of the activations...)
        if seq_len % 2 != 0:
            raise ValueError('seq must alternate Linear layers and activation names')

        params, codes = [], []
        for layer in range(0, seq_len, 2):

            # seq[layer] is an instance of "Linear". Here we get the in/out dim of the layer
            dim_in, dim_out = seq[layer].input_ , seq[layer].output_

            # making a new layer_par instance for this layer
            params.append(layer_par(dim_in, dim_out))

            # activation recognition : encode the activation name as its integer code
            name = seq[layer+1]
            code = self._ACT_CODES.get(name.lower()) if isinstance(name, str) else None
            if code is None:
                raise ValueError('Unknown Activation')
            codes.append(code)

        # only publish the new architecture once the whole list has been validated
        self.param_list = params
        self.act_list = codes
        self.n_layer = seq_len//2

    def _layer_outputs(self, X):
        """
        Yield (s, x) for every layer in order, starting from the batch X.

        ------------ batch data ------------
        s = x W_{transpose} + b               x: a matrix, each row is a single data point
        x = Activation (s)
        """
        x = X
        for prm, code in zip(self.param_list, self.act_list):
            s = x.mm(prm.w.t()) + prm.b.t()     # (N,d_in).(d_in,d_out) + (1,d_out) -> (N,d_out)
            x = Activation(code, s)             # size = (N,d_out)
            yield s, x

    def forward(self,X):
        """
        This method evaluates the forward pass of a batch and returns the values of the last layer.

        input:
            X    :   a tensor of size(B, d_in)

        returns:
            x    :   a tensor of size(B, d_out), the output of the last layer
            s    :   a tensor of size(B, d_out), the pre-activation of the last layer
                     (for a network without layers, X and None are returned)

        ------------ single data ------------
        s = Wx + b                            x: column vector
        x = Activation(s)

        ------------ batch data ------------
        s = x W_{transpose} + b               x: a matrix, each row is a single data point
        x = Activation (s)
        """
        x, s = X, None
        for s, x in self._layer_outputs(X):
            pass                                # only the values of the last layer are returned
        return x,s

    def backward (self,X,Y):
        """
        Runs a forward pass followed by a backward pass and fills forward_list, backward_list and
        grad_list. All three are rebuilt from scratch on every call.

        forward_par objects hold (s,x) of each layer; backward_par objects hold the per-sample
        gradients of each layer. The per-sample parameter gradients are summed over the batch and
        divided by the batch size len(X); that result is stored in grad_list.

        input:
            X        :   training set of size (B,d)
            Y        :   target set, same size as the network output

        returns:
            grad_list:   a list with one object per layer whose attributes .w and .b hold the
                         batch-averaged gradient of the loss wrt W and b

        NOTE(review): .w comes out with shape (d_in, d_out), i.e. transposed relative to
        layer_par.w (d_out, d_in), and .b with shape (d_out,) instead of (d_out, 1). The shapes
        are kept as they were so existing callers keep working; an update step has to
        transpose / reshape accordingly.
        """

        #Hint:
        # for backward_list and forward_list, the index starts from 0 to L (inclusive). The index 0 indicates the input.

        batch = len(X)                          # size of the batch actually being processed

        # start from a clean slate, otherwise repeated calls keep appending stale entries
        self.forward_list = []
        self.backward_list = [None] * (self.n_layer + 1)
        self.grad_list = []

        # add X0 to the forward list
        entry = forward_par(self.param_list[0].dim_in, batch)     # Note that dim = dim_in for inputs
        entry.x = X                                               # x0
        self.forward_list.append(entry)

        # this computes forward and records (s,x) of every layer
        for prm, (s, x) in zip(self.param_list, self._layer_outputs(X)):
            entry = forward_par(prm.dim_out, batch)
            entry.s, entry.x = s, x
            self.forward_list.append(entry)

        # this computes backward, from the last layer down to the first
        for layer in range(self.n_layer, 0, -1):
            prm = self.param_list[layer-1]
            grads = backward_par(prm.dim_in, prm.dim_out, batch)

            #dl/dx
            if layer == self.n_layer:
                # last layer: derivative of the loss at the network output
                grads.dx = dloss(self.forward_list[layer].x, Y)
            else:
                # hidden layer: push dL/ds of the next layer back through its weights
                grads.dx = self.backward_list[layer+1].ds.mm(self.param_list[layer].w)

            #dl/ds
            grads.ds = grads.dx * dActivation(self.act_list[layer-1], self.forward_list[layer].s)

            #dl/dw
            # x:  X of layer (l-1) -> x.size = N*d_in   -> unsqueeze(2) -> (N,d_in,1)
            # ds: dL/ds of layer (l) -> ds.size = N*d_out -> unsqueeze(1) -> (N,1,d_out)
            # the broadcast product is the per-sample outer product, shape (N,d_in,d_out)
            grads.dw = grads.ds.unsqueeze(1) * self.forward_list[layer-1].x.unsqueeze(2)

            #dl/db
            grads.db = grads.ds

            self.backward_list[layer] = grads

        # summing all batch grads
        for layer in range(1, self.n_layer+1):
            prm = self.param_list[layer-1]
            # layer_par is only used as a holder here; its w and b are overwritten right away
            total = layer_par(prm.dim_in, prm.dim_out)
            total.b = self.backward_list[layer].db.sum(0)/batch   # watchout! these are grad!
            total.w = self.backward_list[layer].dw.sum(0)/batch   # watchout! these are grad!
            self.grad_list.append(total)

        return self.grad_list


# Draft, tests, and other stuff

In [23]:
# toy batch: three samples with 7 features each, all derived from one random vector x
x = Tensor(7).normal_()
X = torch.stack((x, x*3, x*x), 0)     # rows: x, 3x, x squared  -> shape (3,7)
# random targets for the three samples, 10 outputs each
Y = Tensor(3,10).normal_()

In [24]:
# architecture: 7 -> 2 (linear) -> 5 (relu) -> 10 (linear)
seq = [
    Linear(7,2), 'lin',
    Linear(2,5), 'relu',
    Linear(5,10), 'lin',
]
model = Net(seq, X, Y)

In [25]:
# run one forward + backward pass over the toy batch; backward returns the model's grad_list
win = model.backward(X,Y)

In [26]:
win[2].w



Columns 0 to 7 
   0.0000    0.0000    0.0000    0.0000    0.0000    0.0000    0.0000    0.0000
  -3.0228   -4.7768    3.9350    1.7497    1.4186   -9.8006    7.0396    4.2791
  29.4164 -276.8149  134.8746  133.1670 -105.5648   76.2135   22.0834   46.5936
   7.3171  -68.8558   33.5491   33.1244  -26.2585   18.9576    5.4931   11.5898
   0.0000    0.0000    0.0000    0.0000    0.0000    0.0000    0.0000    0.0000

Columns 8 to 9 
   0.0000    0.0000
  -0.3615    3.8822
 -56.9019  -34.9590
 -14.1540   -8.6958
   0.0000    0.0000
[torch.FloatTensor of size 5x10]

In [27]:
model.forward(X)

(
 
 Columns 0 to 7 
   1.2617  -7.9088   3.5400   3.7850  -3.1567   2.2944   0.9672   1.4872
   3.7850 -23.7265  10.6200  11.3549  -9.4701   6.8831   2.9017   4.4615
  -2.0595  -1.7307   3.2273   1.7855   0.6365  -4.1770   3.9232   2.7162
 
 Columns 8 to 9 
  -0.8759  -0.7689
  -2.6278  -2.3066
   0.6274   3.0571
 [torch.FloatTensor of size 3x10], 
 
 Columns 0 to 7 
   1.2617  -7.9088   3.5400   3.7850  -3.1567   2.2944   0.9672   1.4872
   3.7850 -23.7265  10.6200  11.3549  -9.4701   6.8831   2.9017   4.4615
  -2.0595  -1.7307   3.2273   1.7855   0.6365  -4.1770   3.9232   2.7162
 
 Columns 8 to 9 
  -0.8759  -0.7689
  -2.6278  -2.3066
   0.6274   3.0571
 [torch.FloatTensor of size 3x10])

In [28]:
b= Tensor(2,4).normal_()

In [29]:
a*b

NameError: name 'a' is not defined

In [30]:
range(1,5)

range(1, 5)

In [31]:
a= Tensor (3,1,5).normal_()
b= Tensor (3,2,1).normal_()

In [32]:
a*b


(0 ,.,.) = 
  0.1278  0.0670  0.7209 -0.0858  0.6084
  0.2392  0.1253  1.3485 -0.1605  1.1381

(1 ,.,.) = 
 -0.8837 -1.4711  1.2978  0.2068  0.1009
  0.4539  0.7555 -0.6665 -0.1062 -0.0518

(2 ,.,.) = 
 -0.0895  0.0962  0.0461  0.1517 -0.3767
 -0.1185  0.1274  0.0610  0.2010 -0.4990
[torch.FloatTensor of size 3x2x5]

In [33]:
a


(0 ,.,.) = 
 -0.4102 -0.2148 -2.3130  0.2753 -1.9521

(1 ,.,.) = 
  1.0080  1.6779 -1.4803 -0.2359 -0.1151

(2 ,.,.) = 
 -0.3803  0.4088  0.1957  0.6448 -1.6010
[torch.FloatTensor of size 3x1x5]

In [34]:
b


(0 ,.,.) = 
 -0.3117
 -0.5830

(1 ,.,.) = 
 -0.8767
  0.4503

(2 ,.,.) = 
  0.2353
  0.3117
[torch.FloatTensor of size 3x2x1]

In [35]:
[range(10)]

[range(0, 10)]

In [36]:
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [37]:
[1:10]

SyntaxError: invalid syntax (<ipython-input-37-9778031892f4>, line 1)