In [1]:
import numpy as np

In [58]:
class DeepLearning():
    def __init__(self,X,label,learning_rate,lbd=0,batch_size = None):
        np.random.seed(2)
        np.seterr(all='raise')
        self.m = X.shape[1]
        self.n = {}
        self.prev_layer = X.shape[0]
        self.weights = {}
        self.bias = {}
        self.z = {}
        self.a = {}
        self.num_layers=0
        self.fn = {}
        self.da = {}
        self.dw = {}
        self.db = {}
        self.vdw = {}
        self.vdb = {}
        self.sdw = {}
        self.sdb = {}
        #self.vdw[1] = 0
        #self.vdb[1] = 0
        #self.a[0] = X
        self.alpha = learning_rate
        self.y = label
        self.lbd = lbd
        self.keep_prob = {}
        self.drop_out = {}
        self.X = X
        self.batch_size = batch_size
        self.mbeta = 1
        self.rmsbeta = 1
        
    def add_layer(self,neurons,acti_fn='sigmoid',c=1.0,keep_prob=1.0):
        if acti_fn=='relu':
            c=2.0
        self.weights[self.num_layers+1] = np.random.randn(neurons,self.prev_layer)*np.sqrt(c/self.prev_layer)
        #self.weights[self.num_layers+1] = 2*(np.random.rand(neurons,self.prev_layer))-1
        self.bias[self.num_layers+1] = np.random.random((neurons,1))
        
        self.vdw[self.num_layers+1] = np.zeros((neurons,self.prev_layer))
        self.vdb[self.num_layers+1] = np.zeros((neurons,1))
        self.sdw[self.num_layers+1] = np.zeros((neurons,self.prev_layer))
        self.sdb[self.num_layers+1] = np.zeros((neurons,1))
        
        self.num_layers+=1
        self.fn[self.num_layers] = acti_fn
        self.keep_prob[self.num_layers] = keep_prob
        self.prev_layer=neurons
        
        
    def fit(self,test):
        a_tmp = test
        for layer in range(1,self.num_layers+1):
            z_tmp = np.dot(self.weights[layer],a_tmp) + self.bias[layer]
            a_tmp = self.activation_function(z_tmp,self.fn[layer])
        return a_tmp
    
    def feed_forward(self,X):
        self.a[0] = self.X
        for layer in range(1,self.num_layers+1):
            self.z[layer] = np.dot(self.weights[layer],self.a[layer-1]) + self.bias[layer]
            self.a[layer] = self.activation_function(self.z[layer],self.fn[layer])
            if self.keep_prob[layer]!=1:
                self.drop_out[layer] = np.random.rand(self.a[layer].shape[0],self.a[layer].shape[1])<self.keep_prob[layer]
                self.a[layer] = np.multiply(self.drop_out[layer],self.a[layer])
                self.a[layer] /=self.keep_prob[layer]
            else:
                self.drop_out[layer]=1
    
    def activation_function(self,x,acti_fn):
        if acti_fn == 'tanh':
            return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
        if acti_fn == 'relu':
            return np.maximum(0.01*x,x)
        if acti_fn == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        
    def back_pass(self):
        #self.da[self.num_layers] = -1*(self.y/self.a[self.num_layers])+(1-self.y)/(1-self.a[self.num_layers])
        try:
            self.da[self.num_layers] = (self.y-self.a[self.num_layers])/(self.a[self.num_layers]**2-1) 
        except:
            self.da[self.num_layers] = (self.y-self.a[self.num_layers])/(self.a[self.num_layers]**2-1+np.power(0.10,-8)) 
        for layer in reversed(range(1,self.num_layers+1)):
            tmp = self.da[layer]*self.derivative_fn(self.a[layer],self.fn[layer])
            self.dw[layer] = (np.dot(tmp,(self.a[layer-1]).T))/self.m + (self.lbd*self.weights[layer])/(self.m+0.0)
            self.db[layer] = (np.sum(tmp,axis=1,keepdims=True))/self.m
            self.da[layer-1] = np.dot((self.weights[layer]).T,tmp)
            if layer>1:
                self.da[layer-1] = self.drop_out[layer-1]*self.da[layer-1]
                self.da[layer-1]/=self.keep_prob[layer-1]
    
    def derivative_fn(self,x,acti_fn):
        #x = self.activation_function(x,acti_fn)
        if acti_fn == 'tanh':
            return 1-x**2
        if acti_fn == 'relu':
            return np.where(x<=0,0.01,1)
        if acti_fn == 'sigmoid':
            return x * (1 - x)
    
    def gradient_descent(self):
        for layer in range(1,self.num_layers+1):
            self.weights[layer] = self.weights[layer] -self.alpha*self.dw[layer]
            self.bias[layer] = self.bias[layer] - self.alpha*self.db[layer]
    
    def momentum(self):
        for layer in range(1,self.num_layers+1):
            
            self.vdw[layer] = self.mbeta * self.vdw[layer] + (1-self.mbeta)*self.dw[layer]
            self.vdb[layer] = self.mbeta * self.vdb[layer] + (1-self.mbeta)*self.db[layer]
            
            self.weights[layer] = self.weights[layer] -self.alpha*self.vdw[layer]
            self.bias[layer] = self.bias[layer] - self.alpha*self.vdb[layer]
            
    def rms_prop(self,rms_epsilon):
        print t
        for layer in range(1,self.num_layers+1):
            
            self.sdw[layer] = self.rmsbeta * self.sdw[layer] + (1-self.rmsbeta)*(self.dw[layer]**2)
            self.sdb[layer] = self.rmsbeta * self.sdb[layer] + (1-self.rmsbeta)*(self.db[layer]**2)
            
            self.weights[layer] = self.weights[layer] - self.alpha*(self.dw[layer]/np.sqrt(self.sdw[layer] + rms_epsilon))
            self.bias[layer] = self.bias[layer] - self.alpha*(self.db[layer]/np.sqrt(self.sdb[layer] + rms_epsilon))
         
    def adam(self,rms_epsilon):
        for layer in range(1,self.num_layers+1):
            self.vdw[layer] = self.mbeta * self.vdw[layer] + (1-self.mbeta)*self.dw[layer]
            self.vdb[layer] = self.mbeta * self.vdb[layer] + (1-self.mbeta)*self.db[layer]
            
            self.sdw[layer] = self.rmsbeta * self.sdw[layer] + (1-self.rmsbeta)*(self.dw[layer]**2)
            self.sdb[layer] = self.rmsbeta * self.sdb[layer] + (1-self.rmsbeta)*(self.db[layer]**2)
           
            self.weights[layer] = self.weights[layer] - self.alpha*(self.vdw[layer]/np.sqrt(self.sdw[layer] + rms_epsilon))
            self.bias[layer] = self.bias[layer] - self.alpha*(self.vdb[layer]/np.sqrt(self.sdb[layer] + rms_epsilon))
            
    def cost_fn(self):
        activation=self.a[self.num_layers]
        reg_error = 0
        for layer in range(1,self.num_layers+1):
            reg_error += (np.linalg.norm(self.weights[layer]))**2
        reg_error = (reg_error*self.lbd)/(self.m+0.0)
        try:
            result = (-1*np.average(np.log(activation)*self.y + np.log(1-activation)*(1-self.y)))+reg_error,np.sum((activation>0.5).astype(int)==self.y)/(self.m+0.0)
        except:
            activation = np.where(activation==0,np.power(0.10,8),activation)
            activation = np.where(activation==1,1-np.power(0.10,8),activation)
            result = (-1*np.average(np.log(activation)*self.y + np.log(1-activation)*(1-self.y)))+reg_error,np.sum((activation>0.5).astype(int)==self.y)/(self.m+0.0)
        return result
    
    def train(self):
        for i in range(0, 30000):
            self.feed_forward(self.X)
            self.back_pass()
            self.gradient_descent()
            print self.cost_fn()
            if self.cost_fn()[0]<0.05:
                print i,self.cost_fn()
                break
                
    def batch_train(self,batch_size=None,optimization_algo="gradient_descent",momentum_beta=1,rmsbeta=1,rms_epsilon=np.power(0.10,-8)):
        self.mbeta = momentum_beta
        self.rmsbeta = rmsbeta
        if batch_size is None:
                batch_size = self.m
        nb = int(np.ceil((self.m+0.0)/batch_size))
        for i in range(0, 30000):
            for t in range(0,nb):
                X_b = X[:,batch_size*t:batch_size*(t+1)]
                self.feed_forward(X_b)
                self.back_pass()
                self.gradient_descent()
                if optimization_algo=='momentum':
                    self.momentum()
                print i,self.cost_fn()
                
            if self.cost_fn()[0]<0.1:
                print i,self.cost_fn()
                break

In [59]:
def mf(a):
    return (a[0]+a[1])
X = 100*(np.random.random((2,10000)))+1

In [60]:
y = ((np.apply_along_axis(mf,0,X))>110).astype(int).reshape(1,10000)

In [64]:
nn = DeepLearning(X,y,learning_rate=0.5)
nn.add_layer(3,acti_fn='relu')
nn.add_layer(7,acti_fn='relu')
nn.add_layer(5,acti_fn='relu')
nn.add_layer(9,acti_fn='relu')
nn.add_layer(6,acti_fn='relu')
nn.add_layer(9,acti_fn='relu')
nn.add_layer(5,acti_fn='relu')
nn.add_layer(7,acti_fn='relu')
nn.add_layer(3,acti_fn='relu')
nn.add_layer(1)

nn.batch_train(optimization_algo='momentum',momentum_beta=0.9,batch_size=128)


0 (0.84594273312056989, 0.57699999999999996)
0 (2.1693694354001689, 0.42299999999999999)
0 (0.77873567198999083, 0.57699999999999996)
0 (0.70869780557336548, 0.46360000000000001)
0 (0.68894490641027661, 0.56310000000000004)
0 (0.6786543134583739, 0.57699999999999996)
0 (0.67503510843121473, 0.57699999999999996)
0 (0.66966069702725206, 0.57699999999999996)
0 (0.66061357028554191, 0.57699999999999996)
0 (0.6441496199892397, 0.57699999999999996)
0 (0.64091028864055921, 0.57699999999999996)
0 (0.66860790326467145, 0.61439999999999995)
0 (0.75193498053163699, 0.57699999999999996)
0 (0.65802879119502966, 0.65359999999999996)
0 (0.632045825930584, 0.71760000000000002)
0 (0.63765781092321006, 0.71779999999999999)
0 (0.64219743641715066, 0.61990000000000001)
0 (0.87931061737504224, 0.57699999999999996)
0 (0.67111600965102569, 0.54559999999999997)
0 (0.65976305251289036, 0.73899999999999999)
0 (0.6026923634391721, 0.755)
0 (0.57493843771735287, 0.75439999999999996)
0 (0.55885395078556788, 0.7641

2 (0.68622806116560664, 0.72989999999999999)
2 (0.16982204810279855, 0.97130000000000005)
2 (0.14275268694915638, 0.97709999999999997)
2 (0.20198857296847858, 0.93600000000000005)
2 (0.62643450625437958, 0.73540000000000005)
2 (0.16183107660630822, 0.97340000000000004)
2 (0.13681580558132667, 0.97909999999999997)
2 (0.19648138155673173, 0.93759999999999999)
2 (0.590981835089471, 0.73950000000000005)
2 (0.15635618393033732, 0.97660000000000002)
2 (0.15939164482808382, 0.96130000000000004)
2 (0.32181882846383669, 0.82210000000000005)
2 (0.20287443871431685, 0.94499999999999995)
2 (0.43361374866546537, 0.76270000000000004)
2 (0.18504017414125934, 0.97629999999999995)
2 (0.1711964896542299, 0.96399999999999997)
2 (0.26128001274490309, 0.85019999999999996)
2 (0.18305714646716978, 0.95209999999999995)
2 (0.41359721851942538, 0.76929999999999998)
2 (0.18505295350443543, 0.97270000000000001)
2 (0.20136737136541416, 0.90039999999999998)
2 (0.17208350767869898, 0.96099999999999997)
2 (0.35414645

4 (0.12465203609076723, 0.95689999999999997)
4 (0.3091522564271485, 0.83069999999999999)
4 (0.10665296429772873, 0.97909999999999997)
4 (0.11123458886093324, 0.96350000000000002)
4 (0.17288618695712374, 0.88290000000000002)
4 (0.1050792033039307, 0.96540000000000004)
4 (0.11782589931301753, 0.91300000000000003)
4 (0.11577337930908929, 0.95809999999999995)
4 (0.28093241201034996, 0.83850000000000002)
4 (0.10222889757965499, 0.97560000000000002)
4 (0.10108357177146726, 0.96689999999999998)
4 (0.14529285240727155, 0.89439999999999997)
4 (0.10010025437575384, 0.96440000000000003)
4 (0.1364371567230675, 0.89700000000000002)
4 (0.1090383013909554, 0.95689999999999997)
4 (0.200442195142919, 0.8659)
5 (0.10611475942936277, 0.95720000000000005)
5 (0.10112600881601069, 0.92800000000000005)
5 (0.088234207272271756, 0.96819999999999995)
5 (0.082962065941545982, 0.94589999999999996)
5 (0.10009636496216896, 0.96389999999999998)
5 (0.37673409370441513, 0.82310000000000005)
5 (0.081734876617256105, 0.