In [None]:
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
from keras.datasets import mnist

# Load the MNIST train/test splits (inputs and labels).
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Collapse every sample into a single feature row; one row per sample is kept.
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)

# Select classes 0 and 1

In [None]:
# Keep only the samples whose label is 0 or 1, turning the task into binary classification.
xx = x_train[(y_train == 0) | (y_train == 1)]
yy = y_train[(y_train == 0) | (y_train == 1)]
xx_test = x_test[(y_test == 0) | (y_test == 1)]
yy_test = y_test[(y_test == 0) | (y_test == 1)]

# Rebind the working names to independent copies of the filtered arrays.
x_train, y_train = np.array(xx), np.array(yy)
x_test, y_test = np.array(xx_test), np.array(yy_test)
print(x_test.shape, y_test.shape, x_train.shape, y_train.shape)

(2115, 784) (2115,) (12665, 784) (12665,)


# Standardization

In [None]:
# Split the held-out set in half: one half stays as the test set, the other becomes the
# validation set. NOTE: x_test/y_test are both input and output here, so re-running this
# cell without re-running the cells above halves the test set again.
x_test, x_val, y_test, y_val = train_test_split(
    x_test,
    y_test,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Show the shape of each split, one per line (test inputs, validation inputs, then labels).
print(*(split.shape for split in (x_test, x_val, y_test, y_val)), sep="\n")

(1057, 784)
(1058, 784)
(1057,)
(1058,)


In [None]:
eps = 1e-8  # keeps the division finite for features whose standard deviation is 0

# Compute the per-feature statistics from the *raw* training data exactly once, before
# x_train is overwritten. Recomputing them from the already standardized x_train would
# give mean ~0 and std ~1 (or 0 for constant features), so the test and validation
# splits would not be scaled the same way as the training split.
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0)

x_train = (x_train - train_mean) / (train_std + eps)
x_test = (x_test - train_mean) / (train_std + eps)
x_val = (x_val - train_mean) / (train_std + eps)

# Normalization

In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of ``z``.

    The input is first limited to the range [-500, 500] so that ``np.exp``
    does not overflow for inputs of very large magnitude.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

In [None]:
def calculate_cost(y, y_pred):
    """Return the mean binary cross-entropy between labels ``y`` and predictions ``y_pred``.

    Predictions are limited to [1e-8, 1 - 1e-8] first so that neither logarithm
    is evaluated at zero.
    """
    p = np.clip(y_pred, 1e-8, 1 - 1e-8)
    positive_term = -y * np.log(p)
    negative_term = (1 - y) * np.log(1 - p)
    return np.mean(positive_term - negative_term)

In [None]:
def calculate_cost_L1(y, y_pred, w, lambd):
    """Return the mean binary cross-entropy plus an L1 penalty on the weights.

    The penalty is ``lambd * sum(|w|)``. Predictions are limited to
    [1e-8, 1 - 1e-8] first so that neither logarithm is evaluated at zero.
    """
    p = np.clip(y_pred, 1e-8, 1 - 1e-8)
    data_term = np.mean((-y * np.log(p)) - ((1 - y) * np.log(1 - p)))
    l1_penalty = lambd * np.sum(np.abs(w))
    return data_term + l1_penalty

In [None]:
def Logistic_Regression_L1(w, b, x, y, l, iters, lambd):
    """Fit logistic regression with an L1 penalty by full-batch gradient descent.

    Parameters
    ----------
    w, b : initial weights and bias. They are not modified; updated values are returned.
    x, y : training inputs and labels (``x.shape[0]`` samples).
    l : learning rate.
    iters : number of gradient-descent steps over the whole data set.
    lambd : L1 strength; ``lambd * sign(w)`` is added to the weight gradient.
        The bias is not regularized.

    Returns
    -------
    (w, b) after ``iters`` updates.
    """
    n = x.shape[0]
    for _ in range(iters):
        phiz = sigmoid(np.dot(x, w) + b)
        error = phiz - y
        # Cross-entropy gradient plus the sub-gradient of lambd * sum(|w|).
        # The cost value itself is not needed for the update, so it is no longer
        # recomputed (and discarded) on every iteration.
        dw = (np.dot(x.T, error) / n) + (lambd * np.sign(w))
        db = np.mean(error)
        w = w - l * dw
        b = b - l * db
    return w, b

In [None]:
def cross_validation(w, b, x, y, k, l, iters, lambd=0.0):
    """Run k-fold cross-validation of L1 logistic regression for several learning rates.

    Parameters
    ----------
    w, b : initial weights and bias used at the start of every fold.
    x, y : inputs and labels to split into folds.
    k : number of folds. Folds are contiguous blocks of ``x.shape[0] // k`` samples;
        any remainder samples are never held out and stay in every training set.
    l : iterable of learning rates to evaluate.
    iters : number of gradient-descent iterations per fit.
    lambd : L1 strength forwarded to ``Logistic_Regression_L1`` (default 0.0, i.e.
        no penalty). Previously the argument was not forwarded at all, so the call
        failed with a missing-argument error.

    Returns
    -------
    numpy array of fold accuracies, ordered by learning rate and then by fold.
    """
    n = x.shape[0]
    fold_size = n // k
    acc = []
    for i in l:
        for f in range(k):
            # Fold f covers [f * fold_size, (f + 1) * fold_size). The previous
            # `f * k` start made the held-out blocks overlap near the beginning.
            st = f * fold_size
            en = st + fold_size
            held_out = np.zeros(n, dtype=bool)
            held_out[st:en] = True
            cur_x_train = x[~held_out]
            cur_y_train = y[~held_out]
            cur_x_test = x[held_out]
            cur_y_test = y[held_out]
            nw_w, nw_b = Logistic_Regression_L1(w, b, cur_x_train, cur_y_train, i, iters, lambd)
            cur_acc = get_accuracy(nw_w, nw_b, cur_x_test, cur_y_test)
            acc.append(cur_acc)
            print(f"Fold {f + 1}, eta = {i : .3f} , validation accuracy = {cur_acc * 100: .3f} %")
        print('-------------------------------------------------------------')
    return np.array(acc)

In [None]:
def get_accuracy(w, b, x, y):
    """Return the fraction of samples in ``x`` whose rounded prediction equals ``y``.

    Predictions are ``sigmoid(x . w + b)`` rounded with ``np.round`` and compared
    element-wise against ``y``.
    """
    probabilities = sigmoid(np.dot(x, w.T) + b)
    matches = np.round(probabilities) == y
    return np.sum(matches) / int(y.size)

# Testing L1 regularization when λ = [0.01 , 0.9]

In [None]:
# Both runs start from all-zero weights and a zero bias.
init_w = np.zeros(x_train.shape[1])
init_b = 0

# First run: lambd = 0.01
final_w1, final_b1 = Logistic_Regression_L1(
    init_w, init_b, x_train, y_train, l=0.1, iters=1000, lambd=0.01
)
final_accuracy1 = get_accuracy(final_w1, final_b1, x_val, y_val)

# Second run: lambd = 0.1 (the value actually passed below)
final_w2, final_b2 = Logistic_Regression_L1(
    init_w, init_b, x_train, y_train, l=0.1, iters=1000, lambd=0.1
)
final_accuracy2 = get_accuracy(final_w2, final_b2, x_val, y_val)

# Validation accuracy of each run, in percent.
print('final_accuracy =', 100 * final_accuracy1, '%')
print('final_accuracy =', 100 * final_accuracy2, '%')

final_accuracy = 99.7164461247637 %
final_accuracy = 99.52741020793951 %


In [None]:
def Logistic_Regression_mini_batch(w, b, x, y, l, iters, batch_size, beta=0.1):
    """Fit logistic regression with mini-batch gradient descent and momentum.

    Parameters
    ----------
    w, b : initial weights and bias. ``w`` is copied, so the caller's array is
        left untouched and repeated calls with the same array are independent.
    x, y : training inputs and labels (``x.shape[0]`` samples).
    l : learning rate.
    iters : number of passes (epochs) over the shuffled data.
    batch_size : samples per mini-batch. Only ``n // batch_size`` full batches are
        used per epoch; leftover samples of that epoch's shuffle are skipped.
    beta : weight of the previous velocity in the moving average
        ``v = (1 - beta) * grad + beta * v``; must satisfy ``0 <= beta < 1``.
        The default 0.1 reproduces the original 0.9 / 0.1 weighting.

    Returns
    -------
    (w, b) after training.
    """
    n = x.shape[0]
    num_batches = n // batch_size
    # Private float copy: the in-place updates below must not leak into the caller.
    w = np.array(w, dtype=float)
    v_dw = 0
    v_db = 0
    step = 0  # number of updates so far, used for the bias correction
    for _ in range(iters):
        shuffled_indices = np.random.permutation(n)
        x_shuffled = x[shuffled_indices]
        y_shuffled = y[shuffled_indices]
        for batch in range(num_batches):
            batch_start = batch * batch_size
            batch_end = batch_start + batch_size
            x_batch = x_shuffled[batch_start:batch_end]
            y_batch = y_shuffled[batch_start:batch_end]
            phiz = sigmoid(np.dot(x_batch, w.T) + b)
            error = phiz - y_batch
            dw = np.dot(error.T, x_batch) / batch_size
            db = np.sum(error) / batch_size

            step += 1
            v_dw = (1 - beta) * dw + beta * v_dw
            v_db = (1 - beta) * db + beta * v_db
            # Bias correction for the zero-initialised average is 1 - beta**t.
            # The previous code divided by 0.1 ** (epoch + 1), which multiplied the
            # step by up to 1e10 in later epochs instead of converging to 1.
            correction = 1 - beta ** step
            w -= l * v_dw / correction
            b = b - l * v_db / correction
    return w, b

# Use the mini batch sizes ( 8 , 500 )

In [None]:
init_w = np.random.randn(x_train.shape[1])
init_b = 0

# Hand each run its own copy of the initial weights so the two runs cannot share or
# overwrite the same array; otherwise the second run would start from the first run's
# result and final_w1 / final_w2 could end up being the very same object.

# Train the model with batch size = 8
final_w1, final_b1 = Logistic_Regression_mini_batch(init_w.copy(), init_b, x_train, y_train, 0.1, 10, 8)
# Train the model with batch size = 6
final_w2, final_b2 = Logistic_Regression_mini_batch(init_w.copy(), init_b, x_train, y_train, 0.03, 10, 6)

final_accuracy1 = get_accuracy(final_w1, final_b1, x_val, y_val)
final_accuracy2 = get_accuracy(final_w2, final_b2, x_val, y_val)

print('final_accuracy =', 100 * final_accuracy1, '%')
print('final_accuracy =', 100 * final_accuracy2, '%')

final_accuracy = 99.8109640831758 %
final_accuracy = 99.8109640831758 %


# RMSProp optimizer



In [None]:
def RMS(x, y, w, b, l, iters, batch_size):
    """Fit logistic regression with mini-batch RMSProp-style updates.

    Each gradient is divided by the square root of a running average of its own
    square (decay 0.9) plus a small epsilon before the learning rate is applied.

    Parameters
    ----------
    x, y : training inputs and labels (``x.shape[0]`` samples).
    w, b : initial weights and bias. ``w`` is copied, so the caller's array is
        not modified.
    l : learning rate.
    iters : number of passes (epochs) over the shuffled data.
    batch_size : samples per mini-batch. Only ``m_samples // batch_size`` full
        batches are used per epoch; leftover samples are skipped.

    Returns
    -------
    (w, b) after training.
    """
    m_samples = x.shape[0]
    num_batches = m_samples // batch_size
    # Private float copy: the in-place updates below must not leak into the caller.
    w = np.array(w, dtype=float)
    v_dw = 0
    v_db = 0
    EPS = 1e-8  # avoids division by zero while the running averages are still ~0
    for _ in range(iters):
        shuffled_indices = np.random.permutation(m_samples)
        x_shuffled = x[shuffled_indices]
        y_shuffled = y[shuffled_indices]

        for batch in range(num_batches):
            start_batch = batch * batch_size
            end_batch = start_batch + batch_size
            x_batch = x_shuffled[start_batch:end_batch]
            y_batch = y_shuffled[start_batch:end_batch]
            phiz = sigmoid(np.dot(x_batch, w.T) + b)
            error = phiz - y_batch

            dw = np.dot(error.T, x_batch) / batch_size
            db = np.sum(error) / batch_size

            # Running averages of the squared gradients.
            v_dw = 0.9 * v_dw + 0.1 * dw ** 2
            v_db = 0.9 * v_db + 0.1 * db ** 2

            w -= l * dw / (np.sqrt(v_dw) + EPS)
            b = b - l * db / (np.sqrt(v_db) + EPS)
    return w, b

In [None]:
# Random initial weights (one per input feature) and a zero bias.
init_w = np.random.randn(x_train.shape[1])
init_b = 0

# Train with learning rate 0.1 for 10 epochs using batches of 500 samples.
rmsW, rmsB = RMS(
    x_train, y_train, init_w, init_b, l=0.1, iters=10, batch_size=500
)

# Report the validation accuracy in percent.
final_accuracy_rms1 = get_accuracy(rmsW, rmsB, x_val, y_val)
print('final_accuracy =', 100 * final_accuracy_rms1, '%')

final_accuracy = 99.9054820415879 %


# Adam optimizer

In [None]:
def adam(x, y, w, b, l, iters, batch_size):
    """Fit logistic regression with mini-batch Adam-style updates.

    Two running averages (both with decay 0.9) are kept per parameter: ``v_*`` of
    the gradients and ``s_*`` of the squared gradients. Each step moves the
    parameter by ``l * v / (sqrt(s) + EPS)``. No bias correction is applied to
    either average.

    Parameters
    ----------
    x, y : training inputs and labels (``x.shape[0]`` samples).
    w, b : initial weights and bias. ``w`` is copied, so the caller's array is
        not modified.
    l : learning rate.
    iters : number of passes (epochs) over the shuffled data.
    batch_size : samples per mini-batch. Only ``n // batch_size`` full batches
        are used per epoch; leftover samples are skipped.

    Returns
    -------
    (w, b) after training.
    """
    n = x.shape[0]
    num_batches = n // batch_size
    EPS = 1e-8  # avoids division by zero while the running averages are still ~0
    # Private float copy: the in-place updates below must not leak into the caller.
    w = np.array(w, dtype=float)
    s_dw = 0
    s_db = 0
    v_dw = 0
    v_db = 0
    for _ in range(iters):
        shuffled_indices = np.random.permutation(n)
        x_shuffled = x[shuffled_indices]
        y_shuffled = y[shuffled_indices]

        for batch in range(num_batches):
            start_batch = batch * batch_size
            end_batch = start_batch + batch_size
            x_batch = x_shuffled[start_batch:end_batch]
            y_batch = y_shuffled[start_batch:end_batch]
            phiz = sigmoid(np.dot(x_batch, w.T) + b)
            error = phiz - y_batch
            # The per-batch cost was previously appended to a list that was never
            # returned or read, so it is no longer computed.

            w_dw = np.dot(error.T, x_batch) / batch_size
            b_db = np.sum(error) / batch_size

            # Running averages of the squared gradients (second moment).
            s_dw = 0.9 * s_dw + 0.1 * w_dw ** 2
            s_db = 0.9 * s_db + 0.1 * b_db ** 2

            # Running averages of the gradients (first moment).
            v_dw = 0.9 * v_dw + 0.1 * w_dw
            v_db = 0.9 * v_db + 0.1 * b_db

            w -= l * v_dw / (np.sqrt(s_dw) + EPS)
            b = b - l * v_db / (np.sqrt(s_db) + EPS)
    return w, b

In [None]:
# Random initial weights (one per input feature) and a zero bias.
init_w = np.random.randn(x_train.shape[1])
init_b = 0

# Train with learning rate 0.1 for 10 epochs using batches of 500 samples.
adamW, adamB = adam(
    x_train, y_train, init_w, init_b, l=0.1, iters=10, batch_size=500
)
# Dump the learned parameters (this prints the whole weight vector).
print(adamW , adamB)

# Report the validation accuracy in percent.
final_accuracy_adam1 = get_accuracy(adamW, adamB, x_val, y_val)
print('final_accuracy =', 100 * final_accuracy_adam1, '%')

[-2.46014041e-01  2.60001434e-01 -1.10143125e-01 -4.25101172e-01
 -3.37481118e-01  1.81399693e-01  6.63146616e-01  9.27740944e-02
 -1.28246208e+00 -6.88131138e-01  6.11224962e-01 -5.34808728e-01
  9.41440011e-01 -9.04019025e-01 -1.38925870e+00 -2.14955384e-02
 -6.60785401e-01 -5.67534260e-01  1.14483837e+00  6.82184577e-01
 -9.45086295e-01 -6.42109327e-01 -1.29517684e-01 -4.12709718e-01
  1.81407216e+00 -3.38653577e-01  7.70852003e-01  1.32327904e-01
 -2.06543517e-01  1.09230450e+00  6.88631747e-01 -3.73859126e-01
 -5.91675405e-01  7.96784771e-01  1.27565364e+00  8.09750988e-01
  1.38946536e+00 -1.10782922e-01 -1.50860563e-01  1.24321744e+00
  7.94444464e-01 -1.20673679e+00  6.96103613e+00  2.38584094e+00
  3.00849390e-01  2.50151199e-01 -1.61390787e+00  5.72936764e-01
  8.84733037e-02  6.26255622e-03 -5.49076503e-01 -4.38624892e-01
 -2.98285584e-02 -5.78128189e-01 -1.65214968e+00 -1.05126339e-01
  2.38616902e-01 -6.15553553e-01  6.76778013e-01  1.99960307e-01
  3.28300689e+00 -5.68650

Conclusion


1. L1 regularization (lasso) helps prevent overfitting and improves the generalization performance of a model. It keeps the weights of the model small by adding a λ-scaled penalty on the absolute values of the weights to the cost, which can increase the accuracy (performance) on unseen data.


2. Mini-batch gradient descent (with momentum): this technique improves the generalization performance of a model by reducing overfitting, decreases the number of iterations needed, and helps the algorithm escape from local minima. Smaller batch sizes can help to reduce overfitting.


3. RMSProp makes the gradient updates smoother across iterations. It reduces the sensitivity to hyperparameter settings (which makes the iterations more efficient). Each update depends on the previous gradients, not only on the current one.

4. Adam uses a mix of the RMSProp and momentum optimizers: it combines both optimizers and makes training more efficient.

After trying different hyperparameters, we see that the Adam optimizer is more efficient and faster.