In [5]:
import numpy as np
import string

In [12]:
## Will use the English alphabet as my data
# Each string below is one 26-letter sequence; list(...) splits it into
# single-character entries, so both arrays have shape (5, 26).
inputs = np.array([
    list(sequence)
    for sequence in (
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        "ZYXWVUTSRQPONMLKJIHGFEDCBA",
        "BDFHJLNPRTVXZACEGIKMOQSUWY",
        "MNOPQRSTUVWXYZABCDEFGHIJKL",
        "HGFEDCBALKJIPONMUTSRQXWVZY",
    )
])

# Target sequences, row-aligned with `inputs`.
# NOTE(review): in rows 1, 3 and 4 every target is the alphabet successor of the
# matching input letter (wrapping Z -> A); rows 2 and 5 do not follow that
# pattern -- confirm whether that is intentional.
expected = np.array([
    list(sequence)
    for sequence in (
        "BCDEFGHIJKLMNOPQRSTUVWXYZA",
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        "CEGIKMOQSUWYABDFHJLNPRTVXZ",
        "NOPQRSTUVWXYZABCDEFGHIJKLM",
        "IJKLMNOPQRSTUVWXYZABCDEFGH",
    )
])

# However, the raw letters aren't usable as they are; we need to convert the data into a numeric form.

def string_to_one_hot(inputs: np.ndarray) -> np.ndarray:
    """One-hot encode every letter of every sequence in ``inputs``.

    Args:
        inputs: Iterable of sequences (rows) of single-character strings.
            Letters are matched case-insensitively against A-Z.

    Returns:
        ``np.array`` built from one list per row, each holding one
        ``(26, 1)`` column vector per recognised letter. Characters outside
        A-Z are skipped, so rows that contain them come out shorter.
    """
    alphabet = string.ascii_uppercase
    position_of = dict(zip(alphabet, range(len(alphabet))))

    def encode(letter: str) -> np.ndarray:
        # Column vector of zeros with a single 1 at the letter's alphabet index.
        column = np.zeros((len(alphabet), 1))
        column[position_of[letter]] = 1
        return column

    encoded_rows = [
        [encode(symbol.upper()) for symbol in sequence if symbol.upper() in position_of]
        for sequence in inputs
    ]
    return np.array(encoded_rows)

# Sanity check: encode the target sequences and show the resulting array shape.
one_hot_outputTEST = string_to_one_hot(expected)
print(np.shape(one_hot_outputTEST))




(5, 26, 26, 1)


This code structures my data so that the first element of each sequence (row) is processed simultaneously across all sequences. Since we have 5 sequences, the first letter from each of the 5 sequences is processed together in the first time step, followed by the second letter from each sequence in the next time step, and so on.

Since we are using one-hot encoding, each letter is converted into a binary array of size 26, where all values are 0 except for a single 1 at the index corresponding to the letter's position in the alphabet. This transformation ensures that each letter is represented uniquely in a numerical format suitable for an RNN.

***Why don't we just convert to ASCII? This excess storage seems overly complicated?***
The reason for this excess storage is to reduce bias when processed through our RNN. Each letter needs to be treated equally. One-hot encoding solves this problem because it treats every letter independently, meaning there is no "distance" between letters unless the model learns it naturally.

The storage is excessive, and there are ways to deal with that, such as word embeddings. However, let's stick to the tutorial.

## ***Input Layer***
- **inputs** = the sequential data, stored as NumPy arrays.
- **U** = the weight matrix connecting the input to the hidden layer.
- **delta_U** = the gradient of U calculated during Back Propagation Through Time (BPTT).

In [13]:
class InputLayer:
    """Input side of the RNN: holds the sequence and the input-to-hidden weights.

    Attributes:
        inputs: The sequence, indexed by time step along its first axis.
        U: Input-to-hidden weight matrix of shape ``(hidden_size, len(inputs[0]))``.
        delta_U: Gradient of ``U`` accumulated across time steps; same shape as ``U``.
    """

    inputs: np.ndarray
    U: np.ndarray = None
    delta_U: np.ndarray = None

    def __init__(self, inputs: np.ndarray, hidden_size: int) -> None:
        """Store the sequence and initialise ``U`` uniformly in [0, 1).

        Args:
            inputs: Sequence data; ``inputs[t]`` is the input at time step ``t``.
            hidden_size: Number of rows of ``U`` (size of the hidden layer).
        """
        self.inputs = inputs
        self.U = np.random.uniform(low=0, high=1, size=(hidden_size, len(inputs[0])))
        self.delta_U = np.zeros_like(self.U)

    # The methods below were previously indented inside __init__, which made
    # them unused local functions instead of methods of the class.

    def get_input(self, time_step: int) -> np.ndarray:
        """Return the input stored for ``time_step``."""
        return self.inputs[time_step]

    def weighted_sum(self, time_step: int) -> np.ndarray:
        """Return ``U @ x_t`` for the input at ``time_step``."""
        return self.U @ self.get_input(time_step)

    def calculate_deltas_per_step(self, time_step: int, delta_weighted_sum: np.ndarray) -> None:
        """Accumulate this time step's contribution to the gradient of ``U``.

        Args:
            time_step: Index of the input used for the outer product.
            delta_weighted_sum: Gradient w.r.t. the hidden pre-activation.
        """
        # (h_dimension, 1) @ (1, input_size) = (h_dimension, input_size)
        # assumes each input is an (input_size, 1) column vector -- TODO confirm
        self.delta_U += delta_weighted_sum @ self.get_input(time_step).T

    def update_weights_and_bias(self, learning_rate: float) -> None:
        """Apply one gradient-descent step to ``U`` using the accumulated gradient.

        NOTE(review): ``delta_U`` is not reset here; confirm the caller clears
        it between updates if per-update gradients are intended.
        """
        self.U -= learning_rate * self.delta_U
            

    

## ***Hidden Layer***
- **States** = Stores activation of all time steps (internal memory of network)
- **W** = Recurrent weight matrix
- **delta_W** = gradient of W during BPTT
- **bias** = b in the math formulas
- **delta_bias** = gradient of b
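- **next_delta_activation** = stores the derivative of the next time step's loss w.r.t. the current activation, computed as $W^\top \left(\delta a_{t+1} \odot (1 - a_{t+1}^2)\right)$, where $\delta a_{t+1}$ is the gradient of the loss w.r.t. the activation at step $t+1$ and $a_{t+1}$ is that step's tanh activation.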


In [None]:
class HiddenLayer:
    """Recurrent hidden layer with tanh activation and BPTT gradient bookkeeping.

    Attributes:
        states: Activations per time step, shape ``(vocab_size, size, 1)``.
        W: Recurrent weight matrix, shape ``(size, size)``.
        delta_W: Accumulated gradient of ``W``.
        bias: Bias column vector, shape ``(size, 1)``.
        delta_bias: Accumulated gradient of ``bias``.
        next_delta_activation: Gradient flowing back from the following time
            step into the current activation, shape ``(size, 1)``.
    """

    states                : np.ndarray = None
    W                     : np.ndarray = None
    delta_W               : np.ndarray = None
    bias                  : np.ndarray = None
    delta_bias            : np.ndarray = None
    next_delta_activation : np.ndarray = None

    def __init__(self, vocab_size: int, size: int) -> None:
        """Initialise weights uniformly in [0, 1) and zero all state and gradients.

        Args:
            vocab_size: Length of the first axis of ``states``; since ``states``
                is indexed by time step, this bounds the number of stored steps.
            size: Number of hidden units.
        """
        self.W                     = np.random.uniform(low=0, high=1, size=(size, size))
        self.bias                  = np.random.uniform(low=0, high=1, size=(size, 1))
        self.states                = np.zeros(shape=(vocab_size, size, 1))
        # zeros_like, not zeros: np.zeros(self.bias) would treat the float bias
        # values as a shape and raise TypeError.
        self.next_delta_activation = np.zeros_like(self.bias)
        self.delta_bias            = np.zeros_like(self.bias)
        self.delta_W               = np.zeros_like(self.W)

    def get_hidden_state(self, time_step: int) -> np.ndarray:
        """Return the activation stored for ``time_step``.

        A negative ``time_step`` (the step before the sequence starts) yields
        an all-zero state rather than wrapping around to the end of ``states``.
        """
        if time_step < 0:
            return np.zeros_like(self.states[0])
        return self.states[time_step]

    def set_hidden_state(self, time_step: int, hidden_state: np.ndarray) -> None:
        """Store ``hidden_state`` as the activation for ``time_step``."""
        self.states[time_step] = hidden_state

    def activate(self, weighted_input: np.ndarray, time_step: int) -> np.ndarray:
        """Compute, store and return the activation for ``time_step``.

        Args:
            weighted_input: Contribution of the input layer to the pre-activation.
            time_step: Index of the step being computed.

        Returns:
            ``tanh(weighted_input + W @ h_prev + bias)``, shape ``(h_dimension, 1)``.
        """
        previous_hidden_state = self.get_hidden_state(time_step - 1)
        # W @ h_prev => (h_dimension, h_dimension) @ (h_dimension, 1) = (h_dimension, 1)
        weighted_hidden_state = self.W @ previous_hidden_state
        weighted_sum = weighted_input + weighted_hidden_state + self.bias
        activation = np.tanh(weighted_sum)  # (h_dimension, 1)
        self.set_hidden_state(time_step, activation)
        return activation

    def calculate_deltas_per_step(self, time_step: int, delta_output: np.ndarray) -> np.ndarray:
        """Backpropagate through one time step and accumulate gradients.

        Args:
            time_step: Index of the step being differentiated.
            delta_output: Gradient reaching this step's activation from the output side.

        Returns:
            Gradient w.r.t. this step's pre-activation (``delta_weighted_sum``).
        """
        # Total gradient at the activation: output path plus the path from the next step.
        delta_activation = delta_output + self.next_delta_activation
        # tanh'(z) = 1 - tanh(z)^2, and the stored state is tanh(z).
        delta_weighted_sum = delta_activation * (1 - self.get_hidden_state(time_step) ** 2)
        # Saved for the following call, which handles the previous time step.
        self.next_delta_activation = self.W.T @ delta_weighted_sum
        self.delta_W += delta_weighted_sum @ self.get_hidden_state(time_step - 1).T
        self.delta_bias += delta_weighted_sum
        return delta_weighted_sum

    def update_weights_and_bias(self, learning_rate: float) -> None:
        """Apply one gradient-descent step to ``W`` and ``bias``.

        NOTE(review): the accumulated gradients are not reset here; confirm the
        caller clears them between updates if per-update gradients are intended.
        """
        self.W -= learning_rate * self.delta_W
        self.bias -= learning_rate * self.delta_bias
