In [1]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import math
import pickle

In [2]:
def unpickle(file):
    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data


def load_cifar10_data():
    data = []
    labels = []
    for batch in range(1, 6):
        batch_data = unpickle(f'cifar-10-batches-py/data_batch_{batch}')
        data.append(batch_data[b'data'])
        labels.extend(batch_data[b'labels'])

    data = np.concatenate(data, axis=0)
    data = data.reshape(-1, 3, 32, 32)  # Reshape to (num_samples, channels, height, width)
    data = np.transpose(data, (0, 2, 3, 1))  # Transpose to (num_samples, height, width, channels)

    test_batch = unpickle('cifar-10-batches-py/test_batch')
    test_data = test_batch[b'data']
    test_labels = test_batch[b'labels']

    test_data = test_data.reshape(-1, 3, 32, 32)
    test_data = np.transpose(test_data, (0, 2, 3, 1))

    # Perform one-hot encoding for labels
    num_classes = 10
    labels = np.eye(num_classes)[labels]
    test_labels = np.eye(num_classes)[test_labels]

    return data, labels, test_data, test_labels

# Load CIFAR-10 data
x_train, y_train, x_test, y_test = load_cifar10_data()


In [3]:
print("Input data shape = {}".format(x_train.shape))

Input data shape = (50000, 32, 32, 3)


In [4]:
# Assuming x_train and y_train are your CIFAR-10 training data
x = x_train.reshape(50000, -1).T
y = y_train.reshape(50000, -1).T

In [5]:
print("Input data shape = {}".format(x.shape))

Input data shape = (3072, 50000)


In [15]:
# Assuming x_train and y_train are your CIFAR-10 training data
x = x_train.reshape(x_train.shape[0], -1).T
y = y_train.reshape(y_train.shape[0], -1).T

In [18]:

# Helper Functions...

def f(x, w, b):
    '''Sigmoid Function'''
    f = 1 / (1 + np.exp(-(np.dot(w.T, x) + b)))
    return f

def mse(x, y, w, b):
    '''Mean Squared Loss Function'''
    L = 0.5 * np.mean((y - f(x, w, b)) ** 2)
    return L

def cross_entropy(x, y, w, b):
    '''Cross Entropy Loss Function'''
    L = -np.mean(y * np.log(f(x, w, b)) + (1 - y) * np.log(1 - f(x, w, b)))
    return L

def grad_w_mse(x, y, w, b):
    fx = f(x, w, b)
    dw = np.dot(x, (fx - y).T) / x.shape[1]
    return dw

def grad_b_mse(x, y, w, b):
    fx = f(x, w, b)
    db = np.mean(fx - y)
    return db

def grad_w_cross(x, y, w, b):
    fx = f(x, w, b)
    dw = np.dot(x, (fx - y).T) / x.shape[1]
    return dw

def grad_b_cross(x, y, w, b):
    fx = f(x, w, b)
    db = np.mean(fx - y)
    return db

In [24]:

# Mini-Batch Adam...

def MiniBatchAdam(x, y, epochs, batch_size, loss, lr):
    w = np.random.randn(x.shape[0], 1)
    b = np.zeros((1, 1))
    epsilon = 1e-8
    beta1 = 0.9
    beta2 = 0.999
    momentum_w, momentum_b = 0, 0
    update_w, update_b = 0, 0
    l_list = []
    w_list = []
    b_list = []
    ep = [i for i in range(epochs + 1)]

    
    num_samples = x.shape[1]
    num_batches = x.shape[1] // batch_size
    if x.shape[1] % batch_size != 0:
        num_batches += 1

    # Shuffle the data
    idx = np.random.permutation(num_samples)
    x = x[:, idx]
    y = y[:, idx]

    selected_samples = int(num_samples * 0.15)
    x = x[:, :selected_samples]
    y = y[:, :selected_samples]

    for i in range(epochs + 1):
        for j in range(num_batches):
            start_idx = j * batch_size
            end_idx = (j + 1) * batch_size

            x_batch = x[:, start_idx:end_idx]
            y_batch = y[:, start_idx:end_idx]

            dw, db = 0, 0
            if loss == 'mse':
                dw += grad_w_mse(x_batch, y_batch, w, b)
                db += grad_b_mse(x_batch, y_batch, w, b)
            elif loss == 'cross_entropy':
                dw += grad_w_cross(x_batch, y_batch, w, b)
                db += grad_b_cross(x_batch, y_batch, w, b)

            # Momentum
            momentum_w = beta1 * momentum_w + (1 - beta1) * dw
            momentum_b = beta1 * momentum_b + (1 - beta1) * db
            # Update History
            update_w = beta2 * update_w + (1 - beta2) * dw ** 2
            update_b = beta2 * update_b + (1 - beta2) * db ** 2
            # Bias Correction
            momentum_w = momentum_w / (1 - beta1 ** (i + 1))
            momentum_b = momentum_b / (1 - beta1 ** (i + 1))
            update_w = update_w / (1 - beta2 ** (i + 1))
            update_b = update_b / (1 - beta2 ** (i + 1))
            # Update of Parameters
            w = w - (lr / (np.sqrt(update_w) + epsilon)) * momentum_w
            b = b - (lr / (np.sqrt(update_b) + epsilon)) * momentum_b

        if loss == 'mse':
            l = mse(x, y, w, b)
            print(f'Loss after {i}th epoch = {l}')
            l_list.append(l)
        elif loss == 'cross_entropy':
            l = cross_entropy(x, y, w, b)
            print(f'Loss after {i}th epoch = {l}')
            l_list.append(l)
        w_list.append(w)
        b_list.append(b)

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss vs Epoch Curve\nAlgorithm: Mini-Batch Adam\nBatch Size = {}\nLearning Rate = {}\nLoss Function = {}'.format(batch_size, lr, loss))
    plt.plot(ep, l_list)
    plt.show()

    return w_list, b_list


In [34]:

# Mini-Batch Adam...

def MiniBatchAdam(x, y, epochs, batch_size, loss, lr, l2_reg):
    
    # Feature Scaling - Min-Max scaling
    x = (x - np.min(x)) / (np.max(x) - np.min(x))

    w = np.random.randn(x.shape[0], 1) * 0.01
    b = np.zeros((1, 1))
    epsilon = 1e-8
    beta1 = 0.9
    beta2 = 0.999
    momentum_w, momentum_b = 0, 0
    update_w, update_b = 0, 0
    l_list = []
    w_list = []
    b_list = []
    ep = [i for i in range(epochs + 1)]

    num_samples = x.shape[1]
    num_batches = num_samples // batch_size
    if num_samples % batch_size != 0:
        num_batches += 1

    # Shuffle the data
    idx = np.random.permutation(num_samples)
    x = x[:, idx]
    y = y[:, idx]

    selected_samples = int(num_samples * 0.15)
    x = x[:, :selected_samples]
    y = y[:, :selected_samples]

    for i in range(epochs + 1):
        for j in range(num_batches):
            start_idx = j * batch_size
            end_idx = (j + 1) * batch_size

            x_batch = x[:, start_idx:end_idx]
            y_batch = y[:, start_idx:end_idx]

            dw, db = 0, 0
            if loss == 'mse':
                dw += grad_w_mse(x_batch, y_batch, w, b) + l2_reg * w
                db += grad_b_mse(x_batch, y_batch, w, b)
            elif loss == 'cross_entropy':
                dw += grad_w_cross(x_batch, y_batch, w, b) + l2_reg * w
                db += grad_b_cross(x_batch, y_batch, w, b)

            # Momentum
            momentum_w = beta1 * momentum_w + (1 - beta1) * dw
            momentum_b = beta1 * momentum_b + (1 - beta1) * db
            # Update History
            update_w = beta2 * update_w + (1 - beta2) * dw ** 2
            update_b = beta2 * update_b + (1 - beta2) * db ** 2
            # Bias Correction
            momentum_w = momentum_w / (1 - beta1 ** (i + 1))
            momentum_b = momentum_b / (1 - beta1 ** (i + 1))
            update_w = update_w / (1 - beta2 ** (i + 1))
            update_b = update_b / (1 - beta2 ** (i + 1))
            # Update of Parameters
            w = w - (lr / (np.sqrt(update_w) + epsilon)) * momentum_w
            b = b - (lr / (np.sqrt(update_b) + epsilon)) * momentum_b

        if loss == 'mse':
            l = mse(x, y, w, b)
            print(f'Loss after {i}th epoch = {l}')
            l_list.append(l)
        elif loss == 'cross_entropy':
            l = cross_entropy(x, y, w, b)
            print(f'Loss after {i}th epoch = {l}')
            l_list.append(l)
        w_list.append(w)
        b_list.append(b)

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss vs Epoch Curve\nAlgorithm: Mini-Batch Adam\nBatch Size = {}\nLearning Rate = {}\nLoss Function = {}'.format(batch_size, lr, loss))
    plt.plot(ep, l_list)
    plt.show()

    return w_list, b_list

In [35]:

# Normalize the input data
mean = np.mean(x)
std = np.std(x)
x = (x - mean) / std

In [38]:

# Additional modifications:
# 2. Apply L2 regularization (adjust the regularization strength as desired)
l2_reg = 0.0001

# 5. Reduce the batch size (adjust the batch size as desired)
batch_size = 32

In [39]:

W, B = MiniBatchAdam(x, y, 500, batch_size, 'mse', 0.01, l2_reg)



Loss after 0th epoch = nan


KeyboardInterrupt: 

In [28]:
W, B = MiniBatchAdam(x, y, 500, 10, 'mse', 0.001)



Loss after 0th epoch = nan
Loss after 1th epoch = nan
Loss after 2th epoch = nan
Loss after 3th epoch = nan
Loss after 4th epoch = nan
Loss after 5th epoch = nan
Loss after 6th epoch = nan


KeyboardInterrupt: 

In [13]:
import numpy as np

def adam_optimizer(params, grads, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Adam optimization algorithm.

    Args:
    params: Dictionary containing the model's parameters.
    grads: Dictionary containing the gradients of the parameters.
    learning_rate: Learning rate for the optimization (default: 0.001).
    beta1: Exponential decay rate for the first moment estimates (default: 0.9).
    beta2: Exponential decay rate for the second moment estimates (default: 0.999).
    epsilon: Small value to avoid division by zero (default: 1e-8).

    Returns:
    Updated parameters.
    """

    # Initialize moments
    v = {}
    s = {}
    for param_name, grad in grads.items():
        v[param_name] = np.zeros_like(grad)
        s[param_name] = np.zeros_like(grad)

    # Update parameters
    for param_name, param in params.items():
        grad = grads[param_name]

        # Update biased first moment estimate
        v[param_name] = beta1 * v[param_name] + (1 - beta1) * grad

        # Update biased second raw moment estimate
        s[param_name] = beta2 * s[param_name] + (1 - beta2) * np.square(grad)

        # Compute bias-corrected first and second moment estimates
        v_corrected = v[param_name] / (1 - beta1)
        s_corrected = s[param_name] / (1 - beta2)

        # Update parameters
        params[param_name] -= learning_rate * v_corrected / (np.sqrt(s_corrected) + epsilon)

    return params

# Example usage
# Define your model parameters here
params = {
    'w1': np.random.randn(32, 32, 3, 64),
    'b1': np.zeros((1, 64)),
    'w2': np.random.randn(3, 3, 64, 128),
    'b2': np.zeros((1, 128)),
    # Add more parameters as needed
}

# Define your gradients here
grads = {
    'dw1': np.random.randn(32, 32, 3, 64),
    'db1': np.zeros((1, 64)),
    'dw2': np.random.randn(3, 3, 64, 128),
    'db2': np.zeros((1, 128)),
    # Add more gradients as needed
}

# Apply Adam optimizer
params = adam_optimizer(params, grads, learning_rate=0.001)


KeyError: 'w1'

In [15]:
x = x_train
y = y_train


#Helper Functions
def f(x,w,b):
    '''Sigmoid Function'''
    f = 1/(1+np.exp(-(w*x+b)))
    return f
def mse(x,y,w,b):
    '''Mean Squared Loss Function'''
    L = 0.0
    for i in range(x.shape[0]):
        L += 0.5*(y[i]-f(x[i],w,b))**2
    return L
def cross_entropy(x,y,w,b):
    '''Cross Entropy Loss Function'''
    L = 0.0
    for i in range(x.shape[0]):
        L += -(y[i]*np.log(f(x[i],w,b)))
    return L
def grad_w_mse(x, y, w, b):
    fx = f(x, w, b)
    dw = np.dot((fx - y) * fx * (1 - fx), x.T)
    return dw

def grad_b_mse(x, y, w, b):
    fx = f(x, w, b)
    db = np.sum((fx - y) * fx * (1 - fx), axis=1, keepdims=True)
    return db

def grad_w_cross(x,y,w,b):
    fx = f(x,w,b) 	
    dw = (- y)*(1-fx)*x
    return dw
def grad_b_cross(x,y,w,b):
    fx = f(x,w,b) 
    db = (- y)*(1-fx)
    return db

#Gradient Discent
def Adam(x,y,epochs,batch_size,loss,lr):
    w = np.random.randn()
    b = np.random.randn()
    epsilon = 1e-8
    beta1 = 0.9
    beta2 = 0.999
    momentum_w,momentum_b = 0,0
    update_w, update_b = 0,0
    l_list = []
    w_list = []
    b_list = []
    points = 0
    ep = [i for i in range(epochs+1)]
    dw,db = np.zeros_like(w), np.zeros_like(b)

    for i in range(epochs + 1):
        dw, db = np.zeros_like(w), np.zeros_like(b)
        for j in range(0, x.shape[0], batch_size):
            x_batch = x[j : j + batch_size]
            y_batch = y[j : j + batch_size]

            if loss == "mse":
                dw += grad_w_mse(x_batch, y_batch, w, b)
                db += grad_b_mse(x_batch, y_batch, w, b)
            elif loss == "cross_entropy":
                # ...
                pass

            points += 1
            if(points % batch_size == 0):
                #Momentum
                momentum_w = beta1 * momentum_w + (1 - beta1) * dw
                momentum_b = beta1 * momentum_b + (1 - beta1) * db
                #Update History
                update_w = beta2 * update_w + (1 - beta2) * dw**2
                update_b = beta2 * update_b + (1 - beta2) * db**2 
                #Bias Correction
                momentum_w = momentum_w /(1 - math.pow(beta1,i+1))  
                momentum_b = momentum_b /(1 - math.pow(beta1,i+1))
                update_w = update_w /(1 - math.pow(beta2,i+1))  
                update_b = update_b /(1 - math.pow(beta2,i+1))
                #Update of Parameters
                w = w - (lr/np.sqrt(update_w + epsilon))*momentum_w
                b = b - (lr/np.sqrt(update_b + epsilon))*momentum_b
                dw,db = 0,0
        if (loss == 'mse'):
            print('Loss after {}th epoch = {}\n'.format(i,mse(x,y,w,b)[0]))
            l_list.append(mse(x,y,w,b)[0])
        elif (loss == 'cross_entropy'):
            print('Loss after {}th epoch = {}\n'.format(i,cross_entropy(x,y,w,b)[0]))
            l_list.append(cross_entropy(x,y,w,b)[0])
        w_list.append(w[0])
        b_list.append(b[0])

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss vs Epoch Curve\nAlgotithm :Mini Batch Adam\nBatch Size = {}\nInitial Learning Rate = {}\nLoss Function = {}'.format(batch_size,lr,loss))
    plt.plot(ep,l_list)
    plt.show()

    return w_list,b_list


x = x_train.reshape(x_train.shape[0], -1)
y = y_train

W,B = Adam(x,y,500,10,'mse',0.01)

#Error Surface MSE
w = np.linspace(-10,10,num = 1000,dtype = np.float)
b = np.linspace(-10,10,num = 1000,dtype = np.float)
w,b = np.meshgrid(w,b)

mse_list = []
for i in range(w.shape[0]):
    Loss = mse(x,y,w[i],b[i])
    mse_list.append(Loss)
fig = plt.figure()
ax = fig.gca(projection='3d')
surf = ax.plot_surface(w, b, mse_list, cmap=cm.coolwarm,linewidth=0, antialiased=False)
plt.title('MSE Error Suface')
plt.show()

#Error Surface Cross Entropy
cross_list = []
for i in range(w.shape[0]):
    Loss = cross_entropy(x,y,w[i],b[i])
    cross_list.append(Loss)

fig = plt.figure()
ax = fig.gca(projection='3d')
surf = ax.plot_surface(w, b, cross_list, cmap=cm.coolwarm,linewidth=0, antialiased=False)
plt.title('Cross Entropy Error Suface')
plt.show()


ValueError: operands could not be broadcast together with shapes (10,3072) (10,10) 