In [1]:
# import necessary imports
import numpy as np
#from numpy_ml.neural_nets.optimizers import Adam
import torch
from torch import nn, optim

In [2]:
# data preprocessing 
    # load encoded train/validation/test sets
    # chunken und batchen

batch_size = 1
chunk_size = 1

def get_batch(input, batch_size):
    return np.array([[1,2,3], [3,4,7]]), np.array([[2,3,3], [4,7,7]]) # pls adapt. obv. :))))))

In [3]:
class neural_embedding:
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
        self.token_embedding_table = np.random.rand(vocab_size, vocab_size)

    def calculate_softmax(self, x):
        """Takes input array x and returns softmax."""
        soft_x = np.exp(x - np.max(x))
        softer_x = soft_x / np.sum(soft_x)
        return softer_x

    def calculate_cross_entropy(self, y_hatless, y_hat):
        """
        Takes target (y_hatless) and prediction (y_hat) and computes cross entropy loss.
        """
        # get vocab_size
        _, _, vocab_size = y_hat.shape        
        y_hat = y_hat.reshape(y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2])
        y_hatless_flat = y_hatless.reshape(-1)
        # one-hot encode targets
        y_hatless_hot = np.eye(vocab_size)[y_hatless_flat]
       
        y_hat = self.calculate_softmax(y_hat)
    
        return -np.sum(y_hatless_hot*np.log(y_hat))
    
    def forward(self, idx, targets=None):
        """
        Implements forward pass with an unnecessary logitte function 
        which i only did not delete because now I'm emotionally attached.
        Args:
            idx(np.array): (B,T) numpy array of integers
            targets(np.array): (B,T) numpy array of integers
        Returns:
            input_logits(np.array)
            sometimes also: targets(np.array)
        """
        batch_size, chunk_size = idx.shape
        logits = np.zeros((batch_size, chunk_size, (self.token_embedding_table[0].size)))

        def logitte(batch_size, chunk_size, input):
            for batch in range(batch_size):
                for chunk in range(chunk_size):
                    # (B,T,C) b=batch_size, t="time"=chunk_size, c=vocab_size
                    logits[batch][chunk] = self.token_embedding_table[input[batch][chunk]]
                    
            return logits

        input_logits = logitte(batch_size, chunk_size, idx)
        
        if targets is not None:
            loss = self.calculate_cross_entropy(targets, input_logits)

            return input_logits, loss

        return input_logits

    
    def backward(self, targets, input_logits):
        # need to do the same reshaping as we did for cross entropy, apparently
        targets_flat = targets.reshape(-1)
        one_hot_targets = np.eye(self.vocab_size)[targets_flat]

        # shape after: ((batch_size*chunk_size), vocab_size)
        input_logits_2d = input_logits.reshape(input_logits.shape[0]*input_logits.shape[1], input_logits.shape[2])

        # somehow this is supposedly the combiantion of the derivative of softmax with the derivative of the CCE
        delta = one_hot_targets - input_logits_2d

        # want shape (80,80) for matrix multiplication, but with correct indices (use one-hot targets for that)
        delta_indexed = np.dot(one_hot_targets.transpose(),delta)

        # compute gradient for weight matrix: dot product between the transpose of the to layer and delta vector computed above
        gradient = (self.token_embedding_table @ delta_indexed) 

    
        return gradient

    

In [11]:
def initialize_adam(parameters) :
    """
    Initializes v and s as two python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL" 
                - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.
    
    Arguments:
    parameters -- python dictionary containing your parameters.
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl
    
    Returns: 
    v -- python dictionary that will contain the exponentially weighted average of the gradient.
                    v["dW" + str(l)] = ...
                    v["db" + str(l)] = ...
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient.
                    s["dW" + str(l)] = ...
                    s["db" + str(l)] = ...

    """
    
    L = len(parameters) // 2 # number of layers in the neural networks
    v = {}
    s = {}
    
    # Initialization of v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros((parameters["W" + str(l+1)]).shape)
        v["db" + str(l+1)] = np.zeros((parameters["b" + str(l+1)]).shape)
        s["dW" + str(l+1)] = np.zeros((parameters["W" + str(l+1)]).shape)
        s["db" + str(l+1)] = np.zeros((parameters["b" + str(l+1)]).shape)
    
    return v, s

In [19]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,
                                beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    
    # In the adam Paper by default beta1 is taken as 0.9 and beta2 as 0.999 and the epsilon as 10^(-8)
    
    """
    Update parameters using Adam
    
    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    learning_rate -- the learning rate, scalar.
    beta1 -- Exponential decay hyperparameter for the first moment estimates 
    beta2 -- Exponential decay hyperparameter for the second moment estimates 
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters 
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    """
    
    L = len(parameters) // 2                 # number of layers in the neural networks
    v_corrected = {}                         # Initializing first moment estimate, python dictionary
    s_corrected = {}                         # Initializing second moment estimate, python dictionary
    
    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
        v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1 - beta1) * grads["dW" + str(l+1)]
        v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1 - beta1) * grads["db" + str(l+1)]

        # Compute bias-corrected first moment estimate. Inputs: "v, beta1, t". Output: "v_corrected".
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] /(1 - beta1 ** t)
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)] /(1 - beta1 ** t)

        # Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
        s["dW" + str(l+1)] = beta2 * s["dW" + str(l+1)] + (1 - beta2) * (np.square(grads["dW" + str(l+1)]) )
        s["db" + str(l+1)] = beta2 * s["db" + str(l+1)] + (1 - beta2) * (np.square(grads["db" + str(l+1)]) )

        # Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] /(1 - beta2 ** t)
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)] /(1 - beta2 ** t)

        # Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate *  v_corrected["dW" + str(l+1)] /(np.sqrt(s_corrected["dW" + str(l+1)]) + epsilon)
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v_corrected["db" + str(l+1)] / (np.sqrt(s_corrected["db" + str(l+1)]) + epsilon)

    return parameters, v, s

In [None]:
# currently do not use loss at all, so something is probably very wrong

def train(model, optimiser, token_embedding_table):
    batch_size=32
    for steps in range(10): # TODO: please increase
        # sample batch of data
        xb, yb = get_batch('train', batch_size) # TODO: pls adapt to above fct
        optimiser.zero_grad()  # reset gradients
        # get logits and loss
        logits, loss = model.forward(xb, yb)
        print(f"loss: {loss}")

        gradient = model.backward(yb, logits)
        token_embedding_table.grad = torch.tensor(gradient, dtype=torch.float32)
        optimiser.step()  # apply gradients to parameters
        diff = model.token_embedding_table - np.array(token_embedding_table)
        print(f"Difference in weights before - afte Adam: {diff}")
        model.token_embedding_table = np.array(token_embedding_table)
        

In [26]:
vocab_size = 80

my_neural_embedding = neural_embedding(vocab_size)
param_dict = {"weight": my_neural_embedding.token_embedding_table} # one entry, key is weight and value is my_neural

tensor = torch.tensor(my_neural_embedding.token_embedding_table, dtype=torch.float32)
optimiser = optim.Adam([tensor], lr=0.3)
train(my_neural_embedding, optimiser, tensor)


loss: 36.64729368813076
Difference in weights before - after Adam: [[-0.29999993 -0.29999993 -0.30000002 ... -0.29999996 -0.29999998
  -0.30000001]
 [-0.29999996 -0.30000003 -0.29999999 ... -0.29999997 -0.29999997
  -0.29999997]
 [-0.29999995 -0.3        -0.29999996 ... -0.30000005 -0.29999996
  -0.29999997]
 ...
 [-0.30000003 -0.29999998 -0.29999997 ... -0.3        -0.29999998
  -0.30000002]
 [-0.30000001 -0.29999999 -0.29999995 ... -0.29999993 -0.29999995
  -0.29999997]
 [-0.30000003 -0.29999994 -0.29999997 ... -0.30000004 -0.30000003
  -0.29999998]]
loss: 36.64729382526401
Difference in weights before - after Adam: [[-0.2822063  -0.27971232 -0.26941466 ... -0.27572685 -0.27336824
  -0.27926672]
 [-0.28566408 -0.28463483 -0.27554727 ... -0.28230542 -0.27712432
  -0.28205895]
 [-0.28234565 -0.28217876 -0.26900077 ... -0.27793837 -0.27181703
  -0.2796803 ]
 ...
 [-0.29399294 -0.2975868  -0.28512794 ... -0.29538053 -0.2817943
  -0.29099357]
 [-0.27659386 -0.27931553 -0.27403176 ... -0.2

  diff = model.token_embedding_table - np.array(token_embedding_table)
  model.token_embedding_table = np.array(token_embedding_table)
