In [1]:
#HouseKeeping
import numpy as np 
import pandas as pd 

data = pd.read_csv('TMNIST_Data.csv')

In [2]:
data = data.drop("names",axis=1)

In [3]:
data = np.array(data)
np.random.shuffle(data)
results = 10 #number of nodes output

test_data = data[0:1000].T
test_y  = test_data[0]
test_x  = test_data[1:]/255

train_data = data[1000:].T
train_y = train_data[0]
train_x = train_data[1:]/255

pixels, train_trials = train_x.shape

train_y.size

28900

In [4]:
def init_parameters(H, N, pixels, results):
    params = []
    # 1st layer
    params.append([
        np.random.rand(N, pixels) - 0.5,  
        np.random.rand(N, 1) - 0.5       
    ])
    # the rest
    for _ in range(H-1):
        params.append([
            np.random.rand(N, N) - 0.5,   
            np.random.rand(N, 1) - 0.5    
        ])
    # output layer
    params.append([
        np.random.rand(results, N) - 0.5,  
        np.random.rand(results, 1) - 0.5   
    ])
    return params


def ReLu(Z):
    return np.maximum(0,Z)
#def softmax(Z): 
    #return np.exp(Z)/sum(np.exp(Z))

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    return expZ / np.sum(expZ, axis=0, keepdims=True)

def forward_propogation(IN, params):
    Zs, As = [], [IN]  # As starts with input X
    
    for W, b in params[:-1]:  # Process all but last layer
        Z = np.dot(W, As[-1]) + b
        A = ReLu(Z)  # or sigmoid for first layer
        Zs.append(Z)
        As.append(A)
    
    # Output layer (softmax)
    W_out, b_out = params[-1]
    Z_out = np.dot(W_out, As[-1]) + b_out
    OUT = softmax(Z_out)
    
    return OUT, Zs, As
        

In [5]:
def one_hot(Y):
    Y_one_hot = np.zeros((Y.size, Y.max()+1)) #LEARN NEED (())
    Y_one_hot[np.arange(Y.size),Y] = 1    #LEARN
    return Y_one_hot.T

def derv_sigmoid(Z):
    return np.exp(-Z)/(1+np.exp(-Z))**2
def derv_ReLu(Z): 
    return Z > 0

def back_propogation(OUT,train_y, Z, A, params):
    Y = one_hot(train_y)
    m = train_y.size

    gradient = []

    dz = OUT - Y
    dw = 1/m * np.dot(dz, A[-2].T)  # Changed from A[-1] to A[-2]
    db = 1/m * np.sum(dz, axis=1, keepdims=True)
    gradient.append([dw, db])
    
    for i in range(1, len(Z)):
        dz = np.dot(params[-i][0].T, dz) * derv_ReLu(Z[-i])
        # For first hidden layer, use input X instead of A[-1-i]
        if i == len(Z)-1:
            dw = 1/m * np.dot(dz, train_x.T)
        else:
            dw = 1/m * np.dot(dz, A[-1-i].T)
        db = 1/m * np.sum(dz, axis=1, keepdims=True)
        gradient.append([dw, db])

        print(f"Layer {i} gradient shapes:")
        print(f"  dW shape: {dw.shape} (should match {params[i][0].shape})")
        print(f"  db shape: {db.shape} (should match {params[i][1].shape})")
    
    return gradient

In [6]:
def get_accuracy(A,Y):
    return sum(np.argmax(A,0) == Y)/Y.size #LEARN ARGMAX

def back_propogation(OUT, train_y, Z, A, params):
    Y = one_hot(train_y)
    m = train_y.size

    gradient = []

    dz = OUT - Y
    dw = 1/m * np.dot(dz, A[-2].T)  # Changed from A[-1] to A[-2]
    db = 1/m * np.sum(dz, axis=1, keepdims=True)
    gradient.append([dw, db])
    
    for i in range(1, len(params)):  # Changed from len(Z) to len(params)
        dz = np.dot(params[-i][0].T, dz) * derv_ReLu(Z[-i])
        dw = 1/m * np.dot(dz, A[-i-1].T)  # Changed index to -i-1
        db = 1/m * np.sum(dz, axis=1, keepdims=True)
        gradient.append([dw, db])
    
    return gradient[::-1]  # Reverse to match params order

def gradient_descent(H, N, X, Y, pixels, results, iterations, alpha):
    params = init_parameters(H, N, pixels, results)
    for i in range(iterations):
        OUT, Z, A = forward_propogation(X, params)
        gradient = back_propogation(OUT, Y, Z, A, params)
        
        # Update parameters
        for j in range(len(params)):
            params[j][0] -= alpha * gradient[-j][0]
            params[j][1] -= alpha * gradient[-j][1]
        
        if i % 100 == 0:
            alpha = alpha/1
            print(f"Iteration:{i}")
            print(get_accuracy(A[-1], Y))

    return params

In [7]:
params = init_parameters(3,10,pixels,results)
OUT,Z,A = forward_propogation(train_x,params)
gradient = back_propogation(OUT,train_y,Z,A,params)

In [10]:
params = gradient_descent(2,10,train_x,train_y,pixels,results,1000,0.01)

Iteration:0
0.09297577854671281
Iteration:100
0.11107266435986159
Iteration:200
0.12653979238754326
Iteration:300
0.17653979238754325
Iteration:400
0.21332179930795847
Iteration:500
0.20878892733564014
Iteration:600
0.1993771626297578
Iteration:700
0.18321799307958478
Iteration:800
0.1652249134948097
Iteration:900
0.15916955017301038
