In [1]:
import numpy as np

In [67]:
class DeepLearning():
    def __init__(self,X,label,learning_rate,lbd=0,batch_size = None):
        np.random.seed(2)
        np.seterr(all='raise')
        self.m = X.shape[1]
        self.n = {}
        self.prev_layer = X.shape[0]
        self.weights = {}
        self.bias = {}
        self.z = {}
        self.a = {}
        self.num_layers=0
        self.fn = {}
        self.da = {}
        self.dw = {}
        self.db = {}
        self.vdw = {}
        self.vdb = {}
        self.sdw = {}
        self.sdb = {}
        #self.vdw[1] = 0
        #self.vdb[1] = 0
        #self.a[0] = X
        self.alpha0 = learning_rate
        self.alpha = learning_rate
        self.y = label
        self.lbd = lbd
        self.keep_prob = {}
        self.drop_out = {}
        self.X = X
        self.batch_size = batch_size
        self.mbeta = 1
        self.rmsbeta = 1
        
    def add_layer(self,neurons,acti_fn='sigmoid',c=1.0,keep_prob=1.0):
        if acti_fn=='relu':
            c=2.0
        self.weights[self.num_layers+1] = np.random.randn(neurons,self.prev_layer)*np.sqrt(c/self.prev_layer)
        #self.weights[self.num_layers+1] = 2*(np.random.rand(neurons,self.prev_layer))-1
        self.bias[self.num_layers+1] = np.random.random((neurons,1))
        
        self.vdw[self.num_layers+1] = np.zeros((neurons,self.prev_layer))
        self.vdb[self.num_layers+1] = np.zeros((neurons,1))
        self.sdw[self.num_layers+1] = np.zeros((neurons,self.prev_layer))
        self.sdb[self.num_layers+1] = np.zeros((neurons,1))
        
        self.num_layers+=1
        self.fn[self.num_layers] = acti_fn
        self.keep_prob[self.num_layers] = keep_prob
        self.prev_layer=neurons
        
        
    def fit(self,test):
        a_tmp = test
        for layer in range(1,self.num_layers+1):
            z_tmp = np.dot(self.weights[layer],a_tmp) + self.bias[layer]
            a_tmp = self.activation_function(z_tmp,self.fn[layer])
        return a_tmp
    
    def feed_forward(self,X):
        self.a[0] = self.X
        for layer in range(1,self.num_layers+1):
            self.z[layer] = np.dot(self.weights[layer],self.a[layer-1]) + self.bias[layer]
            self.a[layer] = self.activation_function(self.z[layer],self.fn[layer])
            if self.keep_prob[layer]!=1:
                self.drop_out[layer] = np.random.rand(self.a[layer].shape[0],self.a[layer].shape[1])<self.keep_prob[layer]
                self.a[layer] = np.multiply(self.drop_out[layer],self.a[layer])
                self.a[layer] /=self.keep_prob[layer]
            else:
                self.drop_out[layer]=1
    
    def activation_function(self,x,acti_fn):
        if acti_fn == 'tanh':
            return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
        if acti_fn == 'relu':
            return np.maximum(0.01*x,x)
        if acti_fn == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        
    def back_pass(self):
        #self.da[self.num_layers] = -1*(self.y/self.a[self.num_layers])+(1-self.y)/(1-self.a[self.num_layers])
        try:
            self.da[self.num_layers] = (self.y-self.a[self.num_layers])/(self.a[self.num_layers]**2-1) 
        except:
            self.da[self.num_layers] = (self.y-self.a[self.num_layers])/(self.a[self.num_layers]**2-1+np.power(0.10,-8)) 
        for layer in reversed(range(1,self.num_layers+1)):
            tmp = self.da[layer]*self.derivative_fn(self.a[layer],self.fn[layer])
            self.dw[layer] = (np.dot(tmp,(self.a[layer-1]).T))/self.m + (self.lbd*self.weights[layer])/(self.m+0.0)
            self.db[layer] = (np.sum(tmp,axis=1,keepdims=True))/self.m
            self.da[layer-1] = np.dot((self.weights[layer]).T,tmp)
            if layer>1:
                self.da[layer-1] = self.drop_out[layer-1]*self.da[layer-1]
                self.da[layer-1]/=self.keep_prob[layer-1]
    
    def derivative_fn(self,x,acti_fn):
        #x = self.activation_function(x,acti_fn)
        if acti_fn == 'tanh':
            return 1-x**2
        if acti_fn == 'relu':
            return np.where(x<=0,0.01,1)
        if acti_fn == 'sigmoid':
            return x * (1 - x)
    
    def gradient_descent(self):
        for layer in range(1,self.num_layers+1):
            self.weights[layer] = self.weights[layer] -self.alpha*self.dw[layer]
            self.bias[layer] = self.bias[layer] - self.alpha*self.db[layer]
    
    def momentum(self):
        for layer in range(1,self.num_layers+1):
            
            self.vdw[layer] = self.mbeta * self.vdw[layer] + (1-self.mbeta)*self.dw[layer]
            self.vdb[layer] = self.mbeta * self.vdb[layer] + (1-self.mbeta)*self.db[layer]
            
            self.weights[layer] = self.weights[layer] -self.alpha*self.vdw[layer]
            self.bias[layer] = self.bias[layer] - self.alpha*self.vdb[layer]
            
    def rms_prop(self,rms_epsilon):
        print t
        for layer in range(1,self.num_layers+1):
            
            self.sdw[layer] = self.rmsbeta * self.sdw[layer] + (1-self.rmsbeta)*(self.dw[layer]**2)
            self.sdb[layer] = self.rmsbeta * self.sdb[layer] + (1-self.rmsbeta)*(self.db[layer]**2)
            
            self.weights[layer] = self.weights[layer] - self.alpha*(self.dw[layer]/np.sqrt(self.sdw[layer] + rms_epsilon))
            self.bias[layer] = self.bias[layer] - self.alpha*(self.db[layer]/np.sqrt(self.sdb[layer] + rms_epsilon))
         
    def adam(self,rms_epsilon):
        for layer in range(1,self.num_layers+1):
            self.vdw[layer] = self.mbeta * self.vdw[layer] + (1-self.mbeta)*self.dw[layer]
            self.vdb[layer] = self.mbeta * self.vdb[layer] + (1-self.mbeta)*self.db[layer]
            
            self.sdw[layer] = self.rmsbeta * self.sdw[layer] + (1-self.rmsbeta)*(self.dw[layer]**2)
            self.sdb[layer] = self.rmsbeta * self.sdb[layer] + (1-self.rmsbeta)*(self.db[layer]**2)
           
            self.weights[layer] = self.weights[layer] - self.alpha*(self.vdw[layer]/np.sqrt(self.sdw[layer] + rms_epsilon))
            self.bias[layer] = self.bias[layer] - self.alpha*(self.vdb[layer]/np.sqrt(self.sdb[layer] + rms_epsilon))
            
    def cost_fn(self):
        activation=self.a[self.num_layers]
        reg_error = 0
        for layer in range(1,self.num_layers+1):
            reg_error += (np.linalg.norm(self.weights[layer]))**2
        reg_error = (reg_error*self.lbd)/(self.m+0.0)
        try:
            result = (-1*np.average(np.log(activation)*self.y + np.log(1-activation)*(1-self.y)))+reg_error,np.sum((activation>0.5).astype(int)==self.y)/(self.m+0.0)
        except:
            activation = np.where(activation==0,np.power(0.10,8),activation)
            activation = np.where(activation==1,1-np.power(0.10,8),activation)
            result = (-1*np.average(np.log(activation)*self.y + np.log(1-activation)*(1-self.y)))+reg_error,np.sum((activation>0.5).astype(int)==self.y)/(self.m+0.0)
        return result
    
    def train(self):
        for i in range(0, 30000):
            self.feed_forward(self.X)
            self.back_pass()
            self.gradient_descent()
            print self.cost_fn()
            if self.cost_fn()[0]<0.05:
                print i,self.cost_fn()
                break
                
    def batch_train(self,batch_size=None,optimization_algo="gradient_descent",momentum_beta=1,rmsbeta=1,rms_epsilon=np.power(0.10,-8),decay=False):
        self.mbeta = momentum_beta
        self.rmsbeta = rmsbeta
        if batch_size is None:
                batch_size = self.m
        nb = int(np.ceil((self.m+0.0)/batch_size))
        for i in range(0, 30000):
            if decay == True:
                self.alpha = np.power(0.95,i)*self.alpha0 + 0.0001
            for t in range(0,nb):
                X_b = X[:,batch_size*t:batch_size*(t+1)]
                self.feed_forward(X_b)
                self.back_pass()
                self.gradient_descent()
                if optimization_algo=='momentum':
                    self.momentum()
                print i,self.cost_fn()
                
            if self.cost_fn()[0]<0.1:
                print i,self.cost_fn()
                break

In [68]:
def mf(a):
    return (a[0]+a[1])
X = 100*(np.random.random((2,10000)))+1

In [69]:
y = ((np.apply_along_axis(mf,0,X))>110).astype(int).reshape(1,10000)

In [71]:
nn = DeepLearning(X,y,learning_rate=0.5)
nn.add_layer(3,acti_fn='relu')
nn.add_layer(7,acti_fn='relu')
nn.add_layer(5,acti_fn='relu')
nn.add_layer(9,acti_fn='relu')
nn.add_layer(6,acti_fn='relu')
nn.add_layer(9,acti_fn='relu')
nn.add_layer(5,acti_fn='relu')
nn.add_layer(7,acti_fn='relu')
nn.add_layer(3,acti_fn='relu')
nn.add_layer(1)

nn.batch_train(optimization_algo='momentum',momentum_beta=0.9,batch_size=128,decay=True)


0 (0.84594273312056989, 0.57699999999999996)
0 (2.1695507714942792, 0.42299999999999999)
0 (0.77886633966704844, 0.57699999999999996)
0 (0.70875447011278669, 0.46289999999999998)
0 (0.68911238860002166, 0.56069999999999998)
0 (0.67873211530089816, 0.57699999999999996)
0 (0.67517356452243638, 0.57699999999999996)
0 (0.66985378994076128, 0.57699999999999996)
0 (0.66098928651342381, 0.57699999999999996)
0 (0.64490254211882969, 0.57699999999999996)
0 (0.64181911597813812, 0.57699999999999996)
0 (0.66563522715214718, 0.62970000000000004)
0 (0.77223346894155287, 0.57699999999999996)
0 (0.65877721542520318, 0.63600000000000001)
0 (0.62953983195053553, 0.72099999999999997)
0 (0.64571781916853743, 0.71099999999999997)
0 (0.64731580496621943, 0.62170000000000003)
0 (0.80645181623258433, 0.65100000000000002)
0 (0.66816724468068478, 0.54110000000000003)
0 (0.63095467098527525, 0.67269999999999996)
0 (0.63432270708155081, 0.72750000000000004)
0 (0.62119255459087064, 0.70279999999999998)
0 (0.678633

2 (0.17953560025215379, 0.95569999999999999)
2 (0.26675874316220222, 0.82940000000000003)
2 (0.19099862587404304, 0.95369999999999999)
2 (0.26869174284335634, 0.81969999999999998)
2 (0.17568917132242784, 0.96809999999999996)
2 (0.19516008936673607, 0.89449999999999996)
2 (0.14725491807292207, 0.97319999999999995)
2 (0.1924936948338371, 0.87609999999999999)
2 (0.21369256145893059, 0.93369999999999997)
2 (0.69273820762646665, 0.71919999999999995)
2 (0.17084215159514671, 0.97629999999999995)
2 (0.2710143026960472, 0.91490000000000005)
2 (0.38081558869183435, 0.76839999999999997)
2 (0.18331191960906695, 0.97760000000000002)
2 (0.1579073604871076, 0.97860000000000003)
2 (0.14130721802195145, 0.97860000000000003)
2 (0.31532647194751429, 0.88919999999999999)
2 (0.39103356664180966, 0.78100000000000003)
2 (0.15084314836626997, 0.97660000000000002)
2 (0.16241993239384914, 0.94440000000000002)
2 (0.23406342950303438, 0.92000000000000004)
2 (0.47444217225841812, 0.75700000000000001)
2 (0.15311415

4 (0.13950017430121053, 0.96109999999999995)
4 (0.22603507924362967, 0.85329999999999995)
4 (0.094569484363566964, 0.98429999999999995)
4 (0.14066532514981148, 0.96140000000000003)
4 (0.20942723166175034, 0.85729999999999995)
4 (0.1007326829126847, 0.97470000000000001)
4 (0.14155788701708852, 0.96220000000000006)
4 (0.19242304863062829, 0.86360000000000003)
4 (0.10087558919554904, 0.97509999999999997)
4 (0.1375939515329134, 0.96430000000000005)
4 (0.1851402402794412, 0.86629999999999996)
4 (0.098059580540327859, 0.97899999999999998)
4 (0.1402968950407418, 0.96240000000000003)
4 (0.19608748772208923, 0.86150000000000004)
4 (0.097883512685565111, 0.97989999999999999)
4 (0.13478355970940276, 0.96530000000000005)
4 (0.17313512486659161, 0.87170000000000003)
4 (0.10033505993984437, 0.96719999999999995)
5 (0.13791837836939563, 0.96240000000000003)
5 (0.174543265810265, 0.87050000000000005)
5 (0.096714997419278137, 0.97250000000000003)
5 (0.11576742497188409, 0.97099999999999997)
5 (0.1296966