In [8]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# Load the dataset, shuffle it reproducibly and split it into train / test sets.
# Column 0 is used as the class label; every remaining column is a feature.
SEED = 42        # fixes the shuffle so the train/test split is the same on every run
N_TRAIN = 24000  # number of shuffled rows used for training; the remainder is the test set

data = (
    pd.read_csv("Classification_train.csv")
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# One-hot encode the labels once, over the whole dataset, so both splits share
# the same class-to-row mapping even if a class is absent from one of them.
labels_onehot = pd.get_dummies(data.iloc[:, 0], dtype='int').values  # (samples, classes)

# Scale features by 1/255 (presumably 8-bit pixel intensities -- TODO confirm).
features = data.iloc[:, 1:].values / 255  # (samples, features)

# The network code expects one sample per column, hence the transposes.
x_train, x_test = features[:N_TRAIN].T, features[N_TRAIN:].T
y_train, y_test = labels_onehot[:N_TRAIN].T, labels_onehot[N_TRAIN:].T
def relu(z):
    """Element-wise rectified linear unit: max(0, z)."""
    return np.maximum(0, z)


def tanh(z):
    """Element-wise hyperbolic tangent."""
    return np.tanh(z)


def softmax(z):
    """Softmax along axis 0 (each column is normalised to sum to 1).

    The maximum along axis 0 is subtracted before exponentiating. Softmax is
    invariant to such a shift, and it keeps np.exp from overflowing to inf
    (which would otherwise produce inf / inf = nan for large logits).
    """
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)


def tanh_derivative(z):
    """Derivative of tanh evaluated at pre-activation z: 1 - tanh(z)**2."""
    return 1 - np.power(tanh(z), 2)


def relu_derivative(z):
    """Derivative of ReLU evaluated at pre-activation z: 1 where z > 0, else 0."""
    return np.where(z > 0, 1, 0)


def compute_cost(AL, Y):
    """Categorical cross-entropy summed (not averaged) over all entries.

    AL holds the predicted probabilities and Y the one-hot targets, with
    matching shapes. Probabilities are floored at the smallest positive normal
    float before taking the log, so an entry that underflowed to exactly 0
    cannot turn the whole cost into nan (0 * -inf) or inf.
    """
    probs = np.asarray(AL, dtype=float)
    floor = np.finfo(float).tiny
    return -np.sum(Y * np.log(np.maximum(probs, floor)))

In [9]:
def initialize_parameters(layers_dim):
    """Create the weight and bias arrays for a fully-connected network.

    layers_dim lists the layer widths, input layer first. For each pair of
    consecutive layers (numbered from 1) the result holds:
      "w<i>": array of shape (width_i, width_{i-1}), standard-normal draws
              scaled by sqrt(2 / width_{i-1})
      "b<i>": zero column vector of shape (width_i, 1)
    """
    parameters = {}
    layer_pairs = zip(layers_dim[:-1], layers_dim[1:])
    for index, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
        scale = np.sqrt(2 / fan_in)
        parameters[f"w{index}"] = np.random.randn(fan_out, fan_in) * scale
        parameters[f"b{index}"] = np.zeros((fan_out, 1))
    return parameters

In [10]:
def forward_propagation(X, parameters, activation):
    """Run a forward pass through the network.

    Args:
        X: input array with one sample per column.
        parameters: dict holding "w<l>" / "b<l>" for layers 1..L.
        activation: hidden-layer activation, 'tanh' or 'relu'. The last layer
            always uses softmax, so the value is irrelevant for a network
            without hidden layers.

    Returns:
        (AL, caches): the softmax output of the last layer and a dict mapping
        'Z<l>' / 'A<l>' to each layer's pre-activation and activation.

    Raises:
        ValueError: if a hidden layer is reached and `activation` is not
            'tanh' or 'relu' (previously such a value silently applied
            softmax to the hidden layers).
    """
    caches = {}
    A = X
    L = len(parameters) // 2  # each layer contributes one "w" and one "b" entry
    for l in range(1, L + 1):
        Z = np.dot(parameters['w' + str(l)], A) + parameters['b' + str(l)]
        if l == L:
            A = softmax(Z)
        elif activation == 'tanh':
            A = tanh(Z)
        elif activation == 'relu':
            A = relu(Z)
        else:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected 'tanh' or 'relu'"
            )
        caches['Z' + str(l)] = Z
        caches['A' + str(l)] = A
    return A, caches

In [11]:
def backward_propagation(AL,X, Y, caches, parameters, activation, learning_rate):
    """Back-propagate the error and apply one gradient-descent step.

    Args:
        AL: output of the last layer, one sample per column.
        X: network input that produced AL.
        Y: targets with the same shape as AL.
        caches: dict of 'Z<l>' / 'A<l>' arrays for layers 1..L.
        parameters: dict of "w<l>" / "b<l>" arrays; updated IN PLACE.
        activation: hidden-layer activation, 'tanh' or 'relu'.
        learning_rate: step size of the update.

    Returns:
        The same `parameters` dict, after the update.

    Raises:
        ValueError: if the network has hidden layers and `activation` is not
            'tanh' or 'relu' (previously the stale dZ of the layer above was
            reused, giving wrong-shaped gradients).
    """
    grads = {}
    L = len(caches) // 2  # caches holds one 'Z' and one 'A' entry per layer
    m = AL.shape[1]

    def layer_input(l):
        # caches has no 'A0' entry, so X stands in as the input to layer 1.
        # This also makes a network without hidden layers (L == 1) work.
        return X if l == 1 else caches['A' + str(l - 1)]

    # Output layer: the code takes dZ = AL - Y, which is the gradient for a
    # softmax output paired with a cross-entropy loss.
    dZ = AL - Y
    grads["dw" + str(L)] = 1./m * np.dot(dZ, layer_input(L).T)
    grads["db" + str(L)] = 1./m * np.sum(dZ, axis=1, keepdims=True)

    # Hidden layers, from the last one down to layer 1. All gradients are
    # computed before any parameter is touched, so the weights read here are
    # still the ones used in the forward pass.
    for l in range(L - 1, 0, -1):
        dA = np.dot(parameters['w' + str(l + 1)].T, dZ)
        if activation == 'tanh':
            dZ = np.multiply(dA, tanh_derivative(caches['Z' + str(l)]))
        elif activation == 'relu':
            dZ = np.multiply(dA, relu_derivative(caches['Z' + str(l)]))
        else:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected 'tanh' or 'relu'"
            )
        grads["dw" + str(l)] = 1./m * np.dot(dZ, layer_input(l).T)
        grads["db" + str(l)] = 1./m * np.sum(dZ, axis=1, keepdims=True)

    for l in range(1, len(parameters) // 2 + 1):
        parameters["w" + str(l)] -= learning_rate * grads["dw" + str(l)]
        parameters["b" + str(l)] -= learning_rate * grads["db" + str(l)]
    return parameters

In [12]:
def compute_accuracy(X, Y, parameters, activation):
    """Return the percentage of columns of X whose predicted class matches Y.

    The predicted class is the arg-max (along axis 0) of the forward-pass
    output; the reference class is the arg-max (along axis 0) of Y.
    """
    probabilities = forward_propagation(X, parameters, activation)[0]
    predicted_classes = np.argmax(probabilities, axis=0)
    true_classes = np.argmax(Y, axis=0)
    is_correct = predicted_classes == true_classes
    return np.mean(is_correct) * 100

In [13]:
def train_model(X,Y,X_T,Y_T, layers_dim, activation='relu', learning_rate=0.96, num_iterations=300):
    """Train a network with full-batch gradient descent and report accuracy.

    Args:
        X, Y: training inputs and targets.
        X_T, Y_T: held-out inputs and targets, used only for the final
            accuracy figure.
        layers_dim: layer widths passed to initialize_parameters.
        activation: hidden-layer activation name forwarded to the forward and
            backward passes.
        learning_rate: step size of each update.
        num_iterations: number of full passes over (X, Y).

    Returns:
        (parameters, train_accuracy, test_accuracy), accuracies in percent.

    The cost is printed every 100th iteration (starting with iteration 0) and
    both accuracies are printed once training has finished.
    """
    parameters = initialize_parameters(layers_dim)

    for i in range(num_iterations):
        AL, caches = forward_propagation(X, parameters, activation)
        # Evaluated on the pre-update output, i.e. the cost at the start of step i.
        cost = compute_cost(AL, Y)
        parameters = backward_propagation(
            AL, X, Y, caches, parameters, activation, learning_rate
        )
        if not i % 100:
            print (f"Cost after iteration {i}: {cost}")

    accuracy = compute_accuracy(X, Y, parameters, activation)
    test_accuracy = compute_accuracy(X_T, Y_T, parameters, activation)
    print("Train Accuracy:", accuracy)
    print("Test Accuracy:", test_accuracy)
    return parameters, accuracy, test_accuracy

In [14]:
# Train a ReLU network with layer widths 784-386-64-10 on the prepared split,
# leaving the learning rate and iteration count at train_model's defaults.
param, train_accuracy, test_accuracy = train_model(
    x_train,
    y_train,
    x_test,
    y_test,
    layers_dim=[784, 386, 64, 10],
    activation='relu',
)


Cost after iteration 0: 57793.63219272198
Cost after iteration 100: 2870.2940149188735
Cost after iteration 200: 1023.5463535504265
Train Accuracy: 99.52499999999999
Test Accuracy: 98.25
