**Libraries**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import mnist
import warnings
warnings.filterwarnings('ignore')

**Loading the data**

In [None]:

# Download (or read from the local cache) the MNIST train and test splits.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert every array to float so later arithmetic is done in floating point.
x_train, y_train, x_test, y_test = (
    np.array(raw, dtype=float) for raw in (x_train, y_train, x_test, y_test)
)

# Flatten each sample into a single feature row: (n_samples, n_features).
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)


**Select classes 0 & 1**

In [None]:
# Keep only the samples labelled 0 or 1 so the task becomes binary.
# Each mask is built once from the (still unfiltered) labels and then applied
# to both the features and the labels of that split.
train_keep = (y_train == 1) | (y_train == 0)
x_train = x_train[train_keep]
y_train = y_train[train_keep]

test_keep = (y_test == 1) | (y_test == 0)
x_test = x_test[test_keep]
y_test = y_test[test_keep]

**Standardization**

In [None]:
# Standardize with statistics computed on the training set only, and reuse
# them for the test set. Both splits then go through the same transformation
# and no test-set information is used during preprocessing.
# The mean/std are global scalars (over all pixels of all training samples).
train_mean = np.mean(x_train)
train_std = np.std(x_train)

x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std


**Array split implementation**

In [None]:
def split(x , k):
    """Split `x` along its first axis into `k` contiguous folds.

    Every sample ends up in exactly one fold. When ``x.shape[0]`` is not a
    multiple of `k`, the first ``x.shape[0] % k`` folds receive one extra
    sample (previously those trailing samples were silently dropped).

    Parameters
    ----------
    x : array with a ``shape`` attribute that supports slicing on axis 0.
    k : int, number of folds.

    Returns
    -------
    list of `k` slices of `x`, in the original order.
    """
    base_size, remainder = divmod(x.shape[0], k)
    folds = []
    first = 0
    for i in range(k):
        # The first `remainder` folds absorb the samples left over by the
        # integer division, so sizes differ by at most one.
        end = first + base_size + (1 if i < remainder else 0)
        folds.append(x[first:end])
        first = end
    return folds

**Activation function**

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated element-wise.

    Uses exp(-|z|), which always lies in (0, 1], so the exponential can never
    overflow for large-magnitude inputs:
      z >= 0 : 1 / (1 + exp(-z))
      z <  0 : exp(z) / (1 + exp(z))
    Both branches are algebraically the same function.

    Parameters
    ----------
    z : scalar or array-like of numbers.

    Returns
    -------
    numpy.ndarray of floats with the same shape as `z`.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

**Accuracy calculator**

In [None]:
def accuracy(ypred,y_test):
    """Percentage of predictions that equal the true labels.

    Both inputs are flattened before the comparison, so any mix of ``(n,)``
    and ``(n, 1)`` shapes is compared element by element. Previously a 1-D
    `ypred` was broadcast against the ``(n, 1)`` labels into an ``(n, n)``
    matrix, which produced a wrong score.

    Parameters
    ----------
    ypred : array-like of predicted labels.
    y_test : array-like of true labels, same number of elements as `ypred`.

    Returns
    -------
    float in [0, 100].

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    predicted = np.asarray(ypred).ravel()
    actual = np.asarray(y_test).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "ypred and y_test must contain the same number of elements, got "
            + str(predicted.size) + " and " + str(actual.size)
        )
    return np.mean(predicted == actual) * 100

**Predict function**

In [None]:
def predict(x_test,w,b):
    """Predict hard 0/1 labels with a trained logistic model.

    Parameters
    ----------
    x_test : array of shape (n_samples, n_features).
    w : weight column of shape (n_features, 1).
    b : bias, broadcast onto every sample.

    Returns
    -------
    Integer array of shape (n_samples, 1): 1 where the predicted probability
    is strictly greater than 0.5, otherwise 0.
    """
    # (1, n_features) . (n_features, n_samples) -> (1, n_samples)
    z = np.dot(w.T, x_test.T) + b
    # Transpose so that each row corresponds to one sample.
    probabilities = sigmoid(z).T
    return np.where(probabilities > 0.5, 1, 0)

**Logistic regression**

In [None]:
def Logistic_regression(x_train , y_train , eta , iterations=1000):
    """Fit binary logistic regression with full-batch gradient descent.

    The loss and the gradient are computed from the sigmoid probabilities.
    (They were previously computed from probabilities already thresholded to
    0/1, which makes the cross-entropy meaningless and turns the update into
    a perceptron step.)

    Parameters
    ----------
    x_train : array of shape (n_samples, n_features).
    y_train : array of n_samples labels in {0, 1}.
    eta : learning rate.
    iterations : maximum number of gradient steps.

    Returns
    -------
    (w, b) : weights of shape (n_features, 1) and bias of shape (1,).

    Notes
    -----
    Reseeds NumPy's global RNG with 35, so every call starts from the same
    initial weights.
    """
    np.random.seed(35)
    w = np.random.rand(x_train.shape[1],1)
    b = np.random.rand(1)
    n_samples = len(x_train)
    error = []
    tol = 0.0000001  # guards log(0) and doubles as the early-stop threshold

    y_train = y_train.reshape(y_train.shape[0],1)

    for i in range(iterations):
        z = np.dot(w.T,x_train.T)+b
        # Keep the probabilities: do NOT threshold during training.
        phiz = sigmoid(z).T

        # Mean binary cross-entropy of the current model.
        error.append(np.mean(-y_train*np.log(phiz+tol) - (1-y_train)*np.log((1-phiz)+tol)))

        residual = phiz - y_train
        w = w - (eta * np.dot(residual.T , x_train) / n_samples).T
        b = b - eta * np.mean(residual)

        # Stop early once the loss is numerically zero.
        if error[i] <= tol:
            break
    return w,b
# adding 10 Fold CV
def kfold_logistic(iterations):
    """Choose the learning rate for `Logistic_regression` by 10-fold CV.

    Reads the notebook-level `x_train` / `y_train` arrays. For every candidate
    learning rate the model is trained 10 times, each time holding one fold
    out for validation, and the mean validation accuracy is recorded.

    Parameters
    ----------
    iterations : maximum number of gradient steps per fit.

    Returns
    -------
    (wf, bf, e) : weights and bias of the model fitted on the LAST fold split
    with the best learning rate, and that learning rate. The model is not
    refitted on the full training set.
    """
    eta=[0.0001 , 0.001 , 0.01 , 0.1,0.5]
    k=10
    # The fold boundaries do not depend on the learning rate: split once.
    x_folds = split(x_train,k)
    y_folds = split(y_train,k)

    # Start below any reachable accuracy so the first candidate is always
    # accepted and `e` can never be returned unbound.
    sb = -np.inf
    wf, bf, e = 0, 0, None
    for rate in eta:
        scores = []
        for i in range(k):
            X_train = np.concatenate([x_folds[j] for j in range(k) if j != i])
            Y_train = np.concatenate([y_folds[j] for j in range(k) if j != i])

            w,b = Logistic_regression(X_train,Y_train,rate,iterations)

            ypred = predict(x_folds[i],w,b)
            scores.append(round(accuracy(ypred,y_folds[i]),2))

        mean_score = np.mean(scores)
        if mean_score > sb:
            sb = mean_score
            wf, bf, e = w, b, rate
    return wf,bf,e

**Logistic regression with L1 regularization**

In [None]:
def L1_Logistic_regression(x_train , y_train , eta ,Lambda_, iterations=1000):
    """Fit L1-regularized binary logistic regression with full-batch gradient descent.

    Minimizes ``mean cross-entropy + Lambda_ * sum(|w|)``.

    Two fixes compared with the earlier version:
    * the loss and gradient use the sigmoid probabilities instead of
      probabilities thresholded to 0/1;
    * the L1 sub-gradient ``Lambda_ * sign(w)`` is scaled by `eta` like the
      rest of the gradient (it used to be applied unscaled, so every weight
      moved by `Lambda_` per step regardless of the learning rate).

    Parameters
    ----------
    x_train : array of shape (n_samples, n_features).
    y_train : array of n_samples labels in {0, 1}.
    eta : learning rate.
    Lambda_ : L1 regularization strength.
    iterations : maximum number of gradient steps.

    Returns
    -------
    (w, b) : weights of shape (n_features, 1) and bias of shape (1,).

    Notes
    -----
    Reseeds NumPy's global RNG with 35, so every call starts from the same
    initial weights. The bias is not regularized.
    """
    np.random.seed(35)
    w = np.random.rand(x_train.shape[1],1)
    b = np.random.rand(1)
    n_samples = len(x_train)
    error = []
    tol = 0.0000001  # guards log(0) and doubles as the early-stop threshold

    y_train = y_train.reshape(y_train.shape[0],1)

    for i in range(iterations):
        z = np.dot(w.T,x_train.T)+b
        # Keep the probabilities: do NOT threshold during training.
        phiz = sigmoid(z).T

        cross_entropy = -y_train*np.log(phiz+tol) - (1-y_train)*np.log((1-phiz)+tol)
        error.append(np.mean(cross_entropy) + Lambda_ * np.sum(np.abs(w)))

        residual = phiz - y_train
        data_grad = (np.dot(residual.T , x_train) / n_samples).T
        # Sub-gradient of the full objective; the whole step is scaled by eta.
        w = w - eta * (data_grad + Lambda_ * np.sign(w))
        b = b - eta * np.mean(residual)

        if error[i] <= tol:
            break
    return w,b
# adding 10 Fold CV
def L1_kfold_logistic(Lambda_reg, iterations):
    """Choose the learning rate for `L1_Logistic_regression` by 10-fold CV.

    Reads the notebook-level `x_train` / `y_train` arrays. For every candidate
    learning rate the model is trained 10 times, each time holding one fold
    out for validation, and the mean validation accuracy is recorded.

    Parameters
    ----------
    Lambda_reg : L1 regularization strength passed to every fit.
    iterations : maximum number of gradient steps per fit.

    Returns
    -------
    (wf, bf, e) : weights and bias of the model fitted on the LAST fold split
    with the best learning rate, and that learning rate. The model is not
    refitted on the full training set.
    """
    eta=[0.0001 , 0.001 , 0.01 , 0.1,0.5]
    k=10
    # The fold boundaries do not depend on the learning rate: split once.
    x_folds = split(x_train,k)
    y_folds = split(y_train,k)

    # Start below any reachable accuracy so the first candidate is always
    # accepted and `e` can never be returned unbound.
    sb = -np.inf
    wf, bf, e = 0, 0, None
    for rate in eta:
        scores = []
        for i in range(k):
            X_train = np.concatenate([x_folds[j] for j in range(k) if j != i])
            Y_train = np.concatenate([y_folds[j] for j in range(k) if j != i])

            w,b = L1_Logistic_regression(X_train,Y_train,rate,Lambda_reg,iterations)

            ypred = predict(x_folds[i],w,b)
            scores.append(round(accuracy(ypred,y_folds[i]),2))

        mean_score = np.mean(scores)
        if mean_score > sb:
            sb = mean_score
            wf, bf, e = w, b, rate
    return wf,bf,e

**Mini-Batch gradient descent**


In [None]:
def Mini_batch_logistic(x_train , y_train , eta , epochs=100,batch_size=100):
    """Fit binary logistic regression with mini-batch gradient descent.

    Each of the `epochs` steps draws ONE random mini-batch (without
    replacement) and performs one gradient update on it; a step is not a full
    pass over the data.

    Two fixes compared with the earlier version:
    * the loss and gradient use the sigmoid probabilities instead of
      probabilities thresholded to 0/1;
    * `batch_size` is capped at the number of samples, because sampling
      without replacement raises ValueError for a larger request.

    Parameters
    ----------
    x_train : array of shape (n_samples, n_features).
    y_train : array of n_samples labels in {0, 1}.
    eta : learning rate.
    epochs : number of mini-batch updates.
    batch_size : samples per mini-batch.

    Returns
    -------
    (w, b) : weights of shape (n_features, 1) and bias of shape (1,).

    Notes
    -----
    Reseeds NumPy's global RNG with 35, so the initial weights and the
    sequence of mini-batches are the same on every call.
    """
    np.random.seed(35)
    w = np.random.rand(x_train.shape[1],1)
    b = np.random.rand(1)
    n_samples = len(x_train)
    error = []
    tol = 0.0000001  # guards log(0) and doubles as the early-stop threshold

    y_train = y_train.reshape(y_train.shape[0],1)
    draw = min(batch_size, n_samples)

    for i in range(epochs):
        picked = np.random.choice(n_samples, draw, replace=False)
        x_mini = x_train[picked]
        y_mini = y_train[picked]

        z = np.dot(w.T,x_mini.T)+b
        # Keep the probabilities: do NOT threshold during training.
        phiz = sigmoid(z).T

        error.append(np.mean(-y_mini*np.log(phiz+tol) - (1-y_mini)*np.log((1-phiz)+tol)))

        residual = phiz - y_mini
        w = w - (eta * np.dot(residual.T , x_mini) / len(x_mini)).T
        b = b - eta * np.mean(residual)

        if error[i] <= tol:
            break
    return w,b
# adding 10 Fold CV
def Mini_Batch_kfold_logistic(epochs,batch_size):
    """Choose the learning rate for `Mini_batch_logistic` by 10-fold CV.

    Reads the notebook-level `x_train` / `y_train` arrays. For every candidate
    learning rate the model is trained 10 times, each time holding one fold
    out for validation, and the mean validation accuracy is recorded.

    Parameters
    ----------
    epochs : number of mini-batch updates per fit.
    batch_size : samples per mini-batch.

    Returns
    -------
    (wf, bf, e) : weights and bias of the model fitted on the LAST fold split
    with the best learning rate, and that learning rate. The model is not
    refitted on the full training set.
    """
    eta=[0.0001 , 0.001 , 0.01 , 0.1,0.5]
    k=10
    # The fold boundaries do not depend on the learning rate: split once.
    x_folds = split(x_train,k)
    y_folds = split(y_train,k)

    # Start below any reachable accuracy so the first candidate is always
    # accepted and `e` can never be returned unbound.
    sb = -np.inf
    wf, bf, e = 0, 0, None
    for rate in eta:
        scores = []
        for i in range(k):
            X_train = np.concatenate([x_folds[j] for j in range(k) if j != i])
            Y_train = np.concatenate([y_folds[j] for j in range(k) if j != i])

            w,b = Mini_batch_logistic(X_train,Y_train,rate,epochs,batch_size)

            ypred = predict(x_folds[i],w,b)
            scores.append(round(accuracy(ypred,y_folds[i]),2))

        mean_score = np.mean(scores)
        if mean_score > sb:
            sb = mean_score
            wf, bf, e = w, b, rate
    return wf,bf,e

**RMS Prop optimizer**


In [None]:
def RMS_Prop_Logistic_regression(x_train , y_train , eta , iterations=1000):
    """Fit binary logistic regression with full-batch RMSProp updates.

    Each parameter's step is the gradient divided by the square root of an
    exponential moving average (decay 0.999) of its squared gradients.

    The loss and the gradient are computed from the sigmoid probabilities.
    (They were previously computed from probabilities already thresholded to
    0/1.)

    Parameters
    ----------
    x_train : array of shape (n_samples, n_features).
    y_train : array of n_samples labels in {0, 1}.
    eta : learning rate.
    iterations : maximum number of update steps.

    Returns
    -------
    (w, b) : weights of shape (n_features, 1) and bias of shape (1,).

    Notes
    -----
    Reseeds NumPy's global RNG with 35, so every call starts from the same
    initial weights.
    """
    np.random.seed(35)
    w = np.random.rand(x_train.shape[1],1)
    b = np.random.rand(1)
    n_samples = len(x_train)
    error = []
    tol = 0.0000001  # guards log(0) and doubles as the early-stop threshold

    B2 = 0.999       # decay of the squared-gradient average
    eps = 0.0000001  # avoids division by zero in the update

    v_w = np.zeros_like(w)
    v_b = np.zeros_like(b)

    y_train = y_train.reshape(y_train.shape[0],1)

    for i in range(iterations):
        z = np.dot(w.T,x_train.T)+b
        # Keep the probabilities: do NOT threshold during training.
        phiz = sigmoid(z).T

        error.append(np.mean(-y_train*np.log(phiz+tol) - (1-y_train)*np.log((1-phiz)+tol)))

        residual = phiz - y_train
        dw = (np.dot(residual.T, x_train) / n_samples).T
        db = np.mean(residual)

        v_w = B2 * v_w + (1 - B2) * (dw ** 2)
        v_b = B2 * v_b + (1 - B2) * (db ** 2)

        w = w - (eta * dw) / (np.sqrt(v_w) + eps)
        b = b - (eta * db) / (np.sqrt(v_b) + eps)

        if error[i] <= tol:
            break
    return w,b
# adding 10 Fold CV
def RMS_Prop_kfold_logistic(iterations):
    """Choose the learning rate for `RMS_Prop_Logistic_regression` by 10-fold CV.

    Reads the notebook-level `x_train` / `y_train` arrays. For every candidate
    learning rate the model is trained 10 times, each time holding one fold
    out for validation, and the mean validation accuracy is recorded.

    Parameters
    ----------
    iterations : maximum number of update steps per fit.

    Returns
    -------
    (wf, bf, e) : weights and bias of the model fitted on the LAST fold split
    with the best learning rate, and that learning rate. The model is not
    refitted on the full training set.
    """
    eta=[0.0001 , 0.001 , 0.01 , 0.1,0.5]
    k=10
    # The fold boundaries do not depend on the learning rate: split once.
    x_folds = split(x_train,k)
    y_folds = split(y_train,k)

    # Start below any reachable accuracy so the first candidate is always
    # accepted and `e` can never be returned unbound.
    sb = -np.inf
    wf, bf, e = 0, 0, None
    for rate in eta:
        scores = []
        for i in range(k):
            X_train = np.concatenate([x_folds[j] for j in range(k) if j != i])
            Y_train = np.concatenate([y_folds[j] for j in range(k) if j != i])

            w,b = RMS_Prop_Logistic_regression(X_train,Y_train,rate,iterations)

            ypred = predict(x_folds[i],w,b)
            scores.append(round(accuracy(ypred,y_folds[i]),2))

        mean_score = np.mean(scores)
        if mean_score > sb:
            sb = mean_score
            wf, bf, e = w, b, rate
    return wf,bf,e
# RMS with Mini-Batch    
def RMS_Prop_Mini_batch_logistic(x_train , y_train , eta , epochs=100,batch_size=500):
    """Fit binary logistic regression with RMSProp on random mini-batches.

    Each of the `epochs` steps draws ONE random mini-batch (without
    replacement) and performs one RMSProp update on it; a step is not a full
    pass over the data.

    Two fixes compared with the earlier version:
    * the loss and gradient use the sigmoid probabilities instead of
      probabilities thresholded to 0/1;
    * `batch_size` is capped at the number of samples, because sampling
      without replacement raises ValueError for a larger request.

    Parameters
    ----------
    x_train : array of shape (n_samples, n_features).
    y_train : array of n_samples labels in {0, 1}.
    eta : learning rate.
    epochs : number of mini-batch updates.
    batch_size : samples per mini-batch.

    Returns
    -------
    (w, b) : weights of shape (n_features, 1) and bias of shape (1,).

    Notes
    -----
    Reseeds NumPy's global RNG with 35, so the initial weights and the
    sequence of mini-batches are the same on every call.
    """
    np.random.seed(35)
    w = np.random.rand(x_train.shape[1],1)
    b = np.random.rand(1)
    n_samples = len(x_train)
    error = []
    tol = 0.0000001  # guards log(0) and doubles as the early-stop threshold

    B2 = 0.999       # decay of the squared-gradient average
    eps = 0.0000001  # avoids division by zero in the update

    v_w = np.zeros_like(w)
    v_b = np.zeros_like(b)

    y_train = y_train.reshape(y_train.shape[0],1)
    draw = min(batch_size, n_samples)

    for i in range(epochs):
        picked = np.random.choice(n_samples, draw, replace=False)
        x_mini = x_train[picked]
        y_mini = y_train[picked]

        z = np.dot(w.T,x_mini.T)+b
        # Keep the probabilities: do NOT threshold during training.
        phiz = sigmoid(z).T

        error.append(np.mean(-y_mini*np.log(phiz+tol) - (1-y_mini)*np.log((1-phiz)+tol)))

        residual = phiz - y_mini
        dw = (np.dot(residual.T, x_mini) / len(x_mini)).T
        db = np.mean(residual)

        v_w = B2 * v_w + (1 - B2) * (dw ** 2)
        v_b = B2 * v_b + (1 - B2) * (db ** 2)

        w = w - (eta * dw) / (np.sqrt(v_w) + eps)
        b = b - (eta * db) / (np.sqrt(v_b) + eps)

        if error[i] <= tol:
            break
    return w,b
# RMS Prop, Mini-Batch with k-fold
def RMS_Prop_Mini_Batch_kfold_logistic(epochs,batch_size):
    """Choose the learning rate for `RMS_Prop_Mini_batch_logistic` by 10-fold CV.

    Reads the notebook-level `x_train` / `y_train` arrays. For every candidate
    learning rate the model is trained 10 times, each time holding one fold
    out for validation, and the mean validation accuracy is recorded.

    Parameters
    ----------
    epochs : number of mini-batch updates per fit.
    batch_size : samples per mini-batch.

    Returns
    -------
    (wf, bf, e) : weights and bias of the model fitted on the LAST fold split
    with the best learning rate, and that learning rate. The model is not
    refitted on the full training set.
    """
    eta=[0.0001 , 0.001 , 0.01 , 0.1,0.5]
    k=10
    # The fold boundaries do not depend on the learning rate: split once.
    x_folds = split(x_train,k)
    y_folds = split(y_train,k)

    # Start below any reachable accuracy so the first candidate is always
    # accepted and `e` can never be returned unbound.
    sb = -np.inf
    wf, bf, e = 0, 0, None
    for rate in eta:
        scores = []
        for i in range(k):
            X_train = np.concatenate([x_folds[j] for j in range(k) if j != i])
            Y_train = np.concatenate([y_folds[j] for j in range(k) if j != i])

            w,b = RMS_Prop_Mini_batch_logistic(X_train,Y_train,rate,epochs,batch_size)

            ypred = predict(x_folds[i],w,b)
            scores.append(round(accuracy(ypred,y_folds[i]),2))

        mean_score = np.mean(scores)
        if mean_score > sb:
            sb = mean_score
            wf, bf, e = w, b, rate
    return wf,bf,e

**ADAM optimizer**


In [None]:
def ADAM_Logistic_regression(x_train , y_train , eta , iterations=1000):
    """Fit binary logistic regression with full-batch Adam updates.

    Adam combines a moving average of the gradients (momentum, decay B1) with
    a moving average of the squared gradients (RMSProp, decay B2).

    Three fixes compared with the earlier version:
    * the loss and gradient use the sigmoid probabilities instead of
      probabilities thresholded to 0/1;
    * the squared-gradient average of the weights decays with B2 (it used B1,
      inconsistently with the bias);
    * both averages are bias-corrected, since they start at zero and would
      otherwise be underestimated during the first steps.

    Parameters
    ----------
    x_train : array of shape (n_samples, n_features).
    y_train : array of n_samples labels in {0, 1}.
    eta : learning rate.
    iterations : maximum number of update steps.

    Returns
    -------
    (w, b) : weights of shape (n_features, 1) and bias of shape (1,).

    Notes
    -----
    Reseeds NumPy's global RNG with 35, so every call starts from the same
    initial weights.
    """
    np.random.seed(35)
    w = np.random.rand(x_train.shape[1],1)
    b = np.random.rand(1)
    n_samples = len(x_train)
    error = []
    tol = 0.0000001  # guards log(0) and doubles as the early-stop threshold

    B1 = 0.9         # decay of the gradient average (momentum)
    B2 = 0.999       # decay of the squared-gradient average (RMS Prop)
    eps = 0.0000001  # avoids division by zero in the update

    # Momentum
    M_v_w = np.zeros_like(w)
    M_v_b = np.zeros_like(b)

    # RMS Prop
    R_v_w = np.zeros_like(w)
    R_v_b = np.zeros_like(b)

    y_train = y_train.reshape(y_train.shape[0],1)

    for i in range(iterations):
        z = np.dot(w.T,x_train.T)+b
        # Keep the probabilities: do NOT threshold during training.
        phiz = sigmoid(z).T

        error.append(np.mean(-y_train*np.log(phiz+tol) - (1-y_train)*np.log((1-phiz)+tol)))

        residual = phiz - y_train
        dw = (np.dot(residual.T, x_train) / n_samples).T
        db = np.mean(residual)

        # Momentum
        M_v_w = B1 * M_v_w + (1 - B1) * dw
        M_v_b = B1 * M_v_b + (1 - B1) * db

        # RMS Prop
        R_v_w = B2 * R_v_w + (1 - B2) * (dw ** 2)
        R_v_b = B2 * R_v_b + (1 - B2) * (db ** 2)

        # Bias correction for the zero-initialized averages (step count is 1-based).
        step = i + 1
        m_scale = 1 - B1 ** step
        v_scale = 1 - B2 ** step

        # ADAM mixes Momentum with RMS Prop in the weights update equation
        w = w - eta * ((M_v_w / m_scale) / np.sqrt(R_v_w / v_scale + eps))
        b = b - eta * ((M_v_b / m_scale) / np.sqrt(R_v_b / v_scale + eps))

        if error[i] <= tol:
            break
    return w,b

# adding 10 Fold CV
def ADAM_kfold_logistic(iterations):
    """Choose the learning rate for `ADAM_Logistic_regression` by 10-fold CV.

    Reads the notebook-level `x_train` / `y_train` arrays. For every candidate
    learning rate the model is trained 10 times, each time holding one fold
    out for validation, and the mean validation accuracy is recorded.

    Parameters
    ----------
    iterations : maximum number of update steps per fit.

    Returns
    -------
    (wf, bf, e) : weights and bias of the model fitted on the LAST fold split
    with the best learning rate, and that learning rate. The model is not
    refitted on the full training set.
    """
    eta=[0.0001 , 0.001 , 0.01 , 0.1,0.5]
    k=10
    # The fold boundaries do not depend on the learning rate: split once.
    x_folds = split(x_train,k)
    y_folds = split(y_train,k)

    # Start below any reachable accuracy so the first candidate is always
    # accepted and `e` can never be returned unbound.
    sb = -np.inf
    wf, bf, e = 0, 0, None
    for rate in eta:
        scores = []
        for i in range(k):
            X_train = np.concatenate([x_folds[j] for j in range(k) if j != i])
            Y_train = np.concatenate([y_folds[j] for j in range(k) if j != i])

            w,b = ADAM_Logistic_regression(X_train,Y_train,rate,iterations)

            ypred = predict(x_folds[i],w,b)
            scores.append(round(accuracy(ypred,y_folds[i]),2))

        mean_score = np.mean(scores)
        if mean_score > sb:
            sb = mean_score
            wf, bf, e = w, b, rate
    return wf,bf,e
            

**Running L1 regularisation with 2 lambdas**





In [None]:
# Two L1 strengths to compare: a weak penalty and a strong one.
lambda_1, lambda_2 = 0.001, 1

# Cross-validated fit for each strength (100 iterations per fit).
w1, b1, e1 = L1_kfold_logistic(Lambda_reg=lambda_1, iterations=100)
w2, b2, e2 = L1_kfold_logistic(Lambda_reg=lambda_2, iterations=100)

# Report the test-set accuracy of each selected model.
for label, weights, bias in (
    ("Accuracy with lambda_1 :", w1, b1),
    ("Accuracy with lambda_2 :", w2, b2),
):
    ypred = predict(x_test, weights, bias)
    print(label, str(np.round(accuracy(ypred, y_test), 2)) + "%")

Accuracy with lambda_1 : 99.86%
Accuracy with lambda_2 : 53.66%


**Running Mini-Batch logistic**

In [None]:
# Mini-batch sizes to compare.
size1, size2, size3, size4, size5 = 100, 500, 1000, 2000, 4000

# Cross-validated fit for each batch size (100 updates per fit).
# The trailing comments and the "in time ..." strings below are hard-coded
# annotations, not timings measured by this cell.
w1, b1, e1 = Mini_Batch_kfold_logistic(epochs=100, batch_size=size1)  # 5 sec
w2, b2, e2 = Mini_Batch_kfold_logistic(epochs=100, batch_size=size2)  # 10 sec
w3, b3, e3 = Mini_Batch_kfold_logistic(epochs=100, batch_size=size3)  # 15 sec
w4, b4, e4 = Mini_Batch_kfold_logistic(epochs=100, batch_size=size4)  # 25 sec
w5, b5, e5 = Mini_Batch_kfold_logistic(epochs=100, batch_size=size5)  # 50 sec

# Report the test-set accuracy of each selected model.
for label, weights, bias, timing_note in (
    ("Accuracy with size=100 :", w1, b1, "in time 5 sec"),
    ("Accuracy with size=500 :", w2, b2, "in time 10 sec"),
    ("Accuracy with size=1000 :", w3, b3, "in time 15 sec"),
    ("Accuracy with size=2000 :", w4, b4, "in time 25 sec"),
    ("Accuracy with size=4000 :", w5, b5, "in time 50 sec"),
):
    ypred = predict(x_test, weights, bias)
    print(label, str(np.round(accuracy(ypred, y_test), 2)) + "%", timing_note)

Accuracy with size=100 : 98.77% in time 5 sec
Accuracy with size=500 : 99.24% in time 10 sec
Accuracy with size=1000 : 99.81% in time 15 sec
Accuracy with size=2000 : 99.81% in time 25 sec
Accuracy with size=4000 : 99.76% in time 50 sec


**Running RMS Prop**

In [None]:
# Baseline: full-batch gradient descent without an optimizer.
w1, b1, e1 = kfold_logistic(iterations=100)

# Full-batch gradient descent with RMS Prop.
w2, b2, e2 = RMS_Prop_kfold_logistic(iterations=100)

# Mini-batch gradient descent with RMS Prop.
w3, b3, e3 = RMS_Prop_Mini_Batch_kfold_logistic(epochs=100, batch_size=1000)

# Report the test-set accuracy of each model; `extra` carries any additional
# text printed after the percentage.
for label, weights, bias, extra in (
    ("Accuracy WITHOUT RMS Prop :", w1, b1, ()),
    ("Accuracy whole batch WITH RMS Prop :", w2, b2, ()),
    ("Accuracy Mini-Batch WITH RMS Prop :", w3, b3, ("HIGHER ACCURACY IN LESS TIME",)),
):
    ypred = predict(x_test, weights, bias)
    print(label, str(np.round(accuracy(ypred, y_test), 2)) + "%", *extra)

Accuracy WITHOUT RMS Prop : 99.81%
Accuracy whole batch WITH RMS Prop : 99.95%
Accuracy Mini-Batch WITH RMS Prop : 99.95%


**Running ADAM**

In [None]:
# Baseline: full-batch gradient descent without an optimizer.
w1, b1, e1 = kfold_logistic(iterations=100)

# Full-batch gradient descent with ADAM.
w2, b2, e2 = ADAM_kfold_logistic(iterations=100)

# Report the test-set accuracy of each model.
for label, weights, bias in (
    ("Accuracy WITHOUT ADAM :", w1, b1),
    ("Accuracy WITH ADAM :", w2, b2),
):
    ypred = predict(x_test, weights, bias)
    print(label, str(np.round(accuracy(ypred, y_test), 2)) + "%")


Accuracy WITHOUT ADAM : 99.81%
Accuracy WITH ADAM : 99.95%


**Final Report:**
  **L1:**
     L1 regularization penalizes the absolute size of the weights in order to reduce overfitting; it does not aim to lower the accuracy as such.
     The first lambda (0.001) was too small to constrain the weights noticeably, so the model kept relying on the features and the accuracy stayed high (99.86%).
     Meanwhile the second lambda (1) did the L1 job and decreased the dependency on the features, in this case so strongly that the accuracy decreased (53.66%).

  **Mini-Batch:**
     We can see the difference in execution time between the different batch sizes and the difference in the final accuracy we get.
     In conclusion: a larger batch size generally gives a higher accuracy (here up to about 1000 samples) but needs more execution time, so the batch size is a trade-off between the two.


  **RMS:**
     We notice an improvement in accuracy with RMS Prop (99.81% -> 99.95%), especially with Mini-Batch gradient descent.

  **ADAM:**
     As expected, ADAM reaches an accuracy at least as high as all the previous methods (99.95%, equal to RMS Prop).
     
