In [2]:
# import necessary imports
import numpy as np
#from numpy_ml.neural_nets.optimizers import Adam
import torch
from torch import nn, optim
from torch.nn.functional import one_hot

In [20]:
def get_batch(input, batch_size,chunk_size):

    input_batch = []
    # print(type(input_batch))
    target_batch = []
    idx = np.random.randint(0,len(input)-(chunk_size+1),size=batch_size)
    for i in range(0,len(idx)-1):
        input_batch.append(input[idx[i]:idx[i]+chunk_size])
        target_batch.append(input[idx[i]+1:idx[i]+(chunk_size+1)])
    
    input_batch = np.array(input_batch)
    target_batch = np.array(target_batch)

    return input_batch, target_batch

In [10]:
with open (r"indices_text.txt", 'r') as f:
  indices_text = eval(f.read())
  
x,y = get_batch(indices_text,4,8)
print(x.shape)
print(x)
print(y.shape)
print(y)

<class 'list'>
(3, 8)
[[1263  791  190  360  608  127 1263 1285]
 [ 438 1170 1539 1459  462  622 1401  966]
 [1285  335 1540 1336   14 1285  485 1274]]
(3, 8)
[[ 791  190  360  608  127 1263 1285  413]
 [1170 1539 1459  462  622 1401  966  797]
 [ 335 1540 1336   14 1285  485 1274 1386]]


In [48]:
class neural_embedding:
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
        self.token_embedding_table = np.random.rand(vocab_size, vocab_size)

    def calculate_softmax(self, x):
        """Takes input array x and returns softmax."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # Numerical stability
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


    def calculate_cross_entropy(self, y_hatless, y_hat):
        """
        Takes target (y_hatless) and prediction (y_hat) and computes cross entropy loss.
        """
        # get vocab_size
        _, _, vocab_size = y_hat.shape        
        y_hat = y_hat.reshape(y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2])
        y_hatless_flat = y_hatless.reshape(-1)
        # one-hot encode targets
        y_hatless_hot = np.eye(vocab_size)[y_hatless_flat]
       
        y_hat = self.calculate_softmax(y_hat)
        
        # Clip predictions to avoid log(0)
        y_hat = np.clip(y_hat, 1e-15, 1.0) 
        return -np.sum(y_hatless_hot*np.log(y_hat))
    
    def forward(self, idx, targets=None):
        """
        Implements forward pass with an unnecessary logitte function 
        which i only did not delete because now I'm emotionally attached.
        Args:
            idx(np.array): (B,T) numpy array of integers
            targets(np.array): (B,T) numpy array of integers
        Returns:
            input_logits(np.array)
            sometimes also: targets(np.array)
        """
        batch_size, chunk_size = idx.shape
        logits = np.zeros((batch_size, chunk_size, (self.token_embedding_table[0].size)))

        def logitte(batch_size, chunk_size, input):
            for batch in range(batch_size):
                for chunk in range(chunk_size):
                    # (B,T,C) b=batch_size, t="time"=chunk_size, c=vocab_size
                    logits[batch][chunk] = self.token_embedding_table[input[batch][chunk]]
                    
            return logits

        input_logits = logitte(batch_size, chunk_size, idx)
        
        if targets is not None:
            loss = self.calculate_cross_entropy(targets, input_logits)

            return input_logits, loss

        return input_logits

    
    def backward(self, targets, input_logits):
        batch_size = 32
        chunk_size = 8
        # need to do the same reshaping as we did for cross entropy, apparently
        targets_flat = targets.reshape(-1)
        one_hot_targets = np.eye(self.vocab_size)[targets_flat]

        # shape after: ((batch_size*chunk_size), vocab_size)
        input_logits_2d = input_logits.reshape(input_logits.shape[0]*input_logits.shape[1], input_logits.shape[2])
        
        soft_input = self.calculate_softmax(input_logits_2d)
        # somehow this is supposedly the combiantion of the derivative of softmax with the derivative of the CCE
        # delta = one_hot_targets - soft_input
        delta = soft_input - one_hot_targets

        # want shape (80,80) for matrix multiplication, but with correct indices (use one-hot targets for that)
        delta_indexed = np.dot(one_hot_targets.transpose(),delta)

        # compute gradient for weight matrix: dot product between the transpose of the to layer and delta vector computed above
        gradient = (self.token_embedding_table.T @ delta_indexed) 
        
        # Calculate gradients for the embedding table
        # gradient = np.zeros_like(self.token_embedding_table)
        # for batch in range(batch_size):
        #     for chunk in range(chunk_size):
        #         gradient[idx[batch][chunk]] += delta[batch][chunk]
    
        return gradient

    

In [39]:
# currently do not use loss at all, so something is probably very wrong

def train(model, text, optimiser, param_tensor, train_step):
    batch_size=32
    chunk_size = 8
    for steps in range(train_step): 
        # sample batch of data
        xb, yb = get_batch(text, batch_size, chunk_size) 
        optimiser.zero_grad()  # reset gradients
        # get logits and loss
        logits, loss = model.forward(xb, yb)
        print(f"loss: {loss}")

        gradient = model.backward(yb, logits)
        param_tensor.grad = torch.tensor(gradient, dtype=torch.float32)
        optimiser.step()  # apply gradients to parameters
        # diff = model.token_embedding_table - np.array(token_embedding_table)
        # print(f"Difference in weights before - afte Adam: {diff}")
        #/model.token_embedding_table = np.array(token_embedding_table)
        model.token_embedding_table = param_tensor.detach().numpy()
        

In [40]:
with open (r"indices_text.txt", 'r') as f:
    indices_text = eval(f.read())
with open(r"vocab_train.txt", 'r') as f:
    vocab_train = eval(f.read())

In [49]:
vocab_size = len(vocab_train)

my_neural_embedding = neural_embedding(vocab_size)
param_dict = {"weight": my_neural_embedding.token_embedding_table} # one entry, key is weight and value is my_neural

tensor = torch.tensor(my_neural_embedding.token_embedding_table, dtype=torch.float32, requires_grad=True)
optimiser = optim.Adam([tensor], lr=0.001)
train(my_neural_embedding, indices_text, optimiser, tensor, 1000)


loss: 1830.4455288006736
loss: 1825.7223250474497
loss: 1831.6231423994984
loss: 1840.6099110637215
loss: 1828.8560233262726
loss: 1828.0295305374086
loss: 1823.9521448857372
loss: 1830.7351329123642
loss: 1833.9944518333643
loss: 1831.468934815202
loss: 1830.9594043277352
loss: 1822.335135979175
loss: 1827.8411042808416
loss: 1834.1505480005019
loss: 1830.903310449589
loss: 1822.0682625903241
loss: 1835.2025116570057
loss: 1832.2388047136005
loss: 1831.249034010582
loss: 1825.3996651233278
loss: 1824.324886115856
loss: 1833.494902428939
loss: 1834.1609395989358
loss: 1834.8425449005742
loss: 1822.9290015418048
loss: 1824.1007888497393
loss: 1829.496936261189
loss: 1826.4796549167515
loss: 1817.4240194556237
loss: 1822.5614933195473
loss: 1831.8927845834066
loss: 1827.862267149151
loss: 1826.1668415237423
loss: 1825.1322033037902
loss: 1823.9398950516452
loss: 1827.0091176687727
loss: 1824.4828369127881
loss: 1826.9676541093636
loss: 1828.3612363264785
loss: 1826.676635764904
loss: 181