In [7]:
import torch
import torch.nn.functional as F

# simple training set names based on 2 chars in dataset: a,b. This training set is missing the `ba` bigram
names = ['aa','ab','bb']
print(f'training set: {names}')



training set: ['aa', 'ab', 'bb']


In [8]:
SPECIAL_CH = '.'

chars = sorted(list(set(''.join(names)))) # unique chars in the training set
stoi = {s:i for i, s in enumerate(chars)} # {'a':1, 'b':2, ...}
# use 0 based indexing for use with F.one_hot (otherwise we get error about num_classes being too small with 2 num_classes)

stoi[SPECIAL_CH] = 0
itos = {i:s for s, i in stoi.items()}

# NUM_CHARS = len(chars + [SPECIAL_CH])
NUM_CHARS = len(chars)

print(f'{NUM_CHARS=}')
print(f'{stoi=}')
print(f'{itos=}')


NUM_CHARS=2
stoi={'a': 0, 'b': 1, '.': 0}
itos={0: '.', 1: 'b'}


In [9]:
xs, ys = [], [] # xs are first chars, ys are second chars found that follow the first char

for name in names:
    # chs = [SPECIAL_CH] + list(name) + [SPECIAL_CH] # taking each name from training set and surrounding it with special start/end char
    chs = list(name) # taking each name from training set and surrounding it with special start/end char
    for ch1, ch2 in zip(chs, chs[1:]): # loop through the pairs of chars in each name in the training set, collect pairs of chars that occur in the xs and ys
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1) # list of integer pairs found in the training set - x is first char (input), y is the target char (output,truth that follows first char)
        ys.append(ix2)

xs = torch.tensor(xs) # first chars of each pair from dataset - one dimensional array [0,5,13,13,1,...]
ys = torch.tensor(ys) # integer representation of chars following the first char from the pairs

print(f'first letter in each pair found - xs={xs.numpy()}') # convert to numpy array for easy printing. these are the int representations of first chars in a pair found in the training set
print(f'second letter in each pair found - ys={ys.numpy()}') # second char corresponding to first xs char in a pair in the trainingset

num_pairs_found = xs.nelement()
print('number of char pairs in the training set: ',num_pairs_found) # how many char pairs to sample predictions for    



first letter in each pair found - xs=[0 0 1]
second letter in each pair found - ys=[0 1 1]
number of char pairs in the training set:  3


In [10]:
# Initialize Network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((NUM_CHARS,NUM_CHARS), generator=g, requires_grad=True) # start with random weights - one column/row for each char (incl special)
# NUM_CHARS is the number of all unique characters found in the training set plus one special char used for denoting begin or end of a name
print('Weights (randomly initialized):')
print(W.detach().numpy()) # detach needed because of the require grad option on the tensor before converting to numpy


# Each row in W represents the weights (or logits) for predicting the next character given the current character. Specifically:
# W[i, :] contains the logits for predicting all possible next characters when the current character is the one at index i in your vocabulary.
# If i corresponds to 'b', then probs[i, :] shows the distribution of probabilities for 'a', 'b', 'c' (or whatever characters you have) following 'b'.

Weights (randomly initialized):
[[-0.98000735 -1.6578479 ]
 [-0.05716623 -0.3408541 ]]


In [11]:
# one hot encoded first chars of each member of the pairs that occur in the training set
# [0,1,0,0] => 'a'
print(xs.detach().numpy()) # int representation of the char
print(F.one_hot(xs, num_classes=NUM_CHARS).float()) # which "bit" of 2 possible bits (one per unique char) is turned on


[0 0 1]
tensor([[1., 0.],
        [1., 0.],
        [0., 1.]])


In [12]:


############################################ 
#               GRADIENT DESCENT           # 
# ##########################################

for k in range(2):
    ############# FORWARD PASS #############
    xenc = F.one_hot(xs, num_classes=NUM_CHARS).float() # each row represents a char (one row per char in each name in training dataset) is 0s with the integer to str mapping idx set as 1
    print(f'one hot encoded: {F.one_hot(xs, num_classes=NUM_CHARS).float()}')
    print(f'W: {W.detach().numpy()}')

    # Note on logits term: the term "logits" is sometimes associated with log-odds, but in neural networks, they are simply the raw output scores before applying an activation function
    
    # this is feeding inputs into one layer of a neuarl network see timestamp 1:18:52 in https://www.youtube.com/watch?v=PaCmpygFfXo&lc=Ugyhw4PpaUFzfqrHcy14AaABAg.AEW5tZ9Y2CWAEd2P4GJO7e
    logits = xenc @ W # logits is the appropriate row of W to find the counts/prob for that char/pair. W is the log counts. (the original bigram table with the counts would be W exponentiated - W.exp())
    # in matrix multiplication we retain the original number of rows of Matrix A (xenc), so we'll have 24 rows and 4 columns
    print(f'logits: {logits.detach().numpy()}')
    


    ############ START SOFTMAX ###################

    # softmax  - see timestamp 1:27:50
    # expontentiation turns negative to positive numbers and makes all of the results sum to 1 (normalizes for probability usage)
    # Each element in the resulting vector represents a raw score for a potential next character
    scores = logits.exp() # make all positive - e^x, vals close to zero will be close to 1
    
    # The 1 passed as the first argument to sum() specifies that we're summing along dimension 1, which is the row dimension (0 would be columns)
    # these probabalities have a row for every example that sums to 1
    # takes all of the scores and turns them into probabilities based on the entire score set for chars per row for following a particular x char
    probs = scores / scores.sum(1, keepdims=True) # probability for next char
    
    ############# END SOFTMAX ###################

      
    # gets the corresponding y char (matching the pair with preceeding x char) in order (first x char -> y pair char)
    # arange of number of xs is just a helper to get the indexes from 0 to n xs so we index into probs from the first row to the last. The first row was still derived from the first x value, not the first index of itos
    # first row is derived from first 'x' char. ys is the corresponding index y for that x char
    # torch.arange(num_xs) here is just a placeholder sequence of indexes to go from the top to the bottom of the probs matrix
    # torch.arange(xs.nelement()) simply generates a sequence from 0 to n-1 where n is the number of elements in xs. This sequence is used to index into the rows of probs. It does not match up based on the character's identity ('b', 'a', etc.) but rather on the order in which they appear in xs
    xs_rows = xs.nelement()
    regularization_strength = 0.01 # can adjust this strength. the higher it is the more smooth it makes the distribution (more uniform). If higher it dominates the loss fn below and will make the weights (W) unable to grow because too much loss will be accumulated. everything will become uniform distribution equal predictions (?)
    loss = -probs[torch.arange(xs_rows), ys].log().mean() + regularization_strength*(W**2).mean() # regularizatization wants to push towards 0
    
    print(f'LOSS: {loss.item()}') # we should see the loss decreasing
    
    ######### BACKWARD PASS ###############
    W.grad = None # Zero the gradient
    loss.backward()

    ######### UPDATE THE WEIGHTS #############
    learning_rate = 50 # if slow loss reduction, increase the learning rate to bring it down faster
    W.data += -learning_rate * W.grad # go in reverse direction of gradient with the goal of reducing loss

one hot encoded: tensor([[1., 0.],
        [1., 0.],
        [0., 1.]])
W: [[-0.98000735 -1.6578479 ]
 [-0.05716623 -0.3408541 ]]
logits: [[-0.98000735 -1.6578479 ]
 [-0.98000735 -1.6578479 ]
 [-0.05716623 -0.3408541 ]]
LOSS: 0.7909190654754639
one hot encoded: tensor([[1., 0.],
        [1., 0.],
        [0., 1.]])
W: [[-6.176892   4.1984987]
 [-9.550377   9.251863 ]]
logits: [[-6.176892   4.1984987]
 [-6.176892   4.1984987]
 [-9.550377   9.251863 ]]
LOSS: 4.039954662322998


In [13]:
# The logic behind how the probabilities are calculated and accessed in this neural network implementation for bigram language modeling:

# Understanding the Probability Matrix (probs)
# One-hot Encoding (xenc):
# xenc is created by converting each input character (xs) into a one-hot vector. This means for each character in xs, you get a vector where all elements are zero except for one position which is 1, corresponding to the character's index in stoi.
# Logits and Scores:
# logits = xenc @ W: Here, each one-hot vector is multiplied by the weight matrix W. This operation gives you raw, unnormalized scores (logits) for each possible next character given the current character. The shape of logits will be [number of examples, NUM_CHARS].
# Softmax Transformation:
# scores = logits.exp(): Exponentiation to get positive scores.
# probs = scores / scores.sum(1, keepdims=True): Softmax is applied to turn these scores into probabilities that sum to 1 for each input example. Now, probs is a matrix where probs[i, j] represents the probability of the j-th character following the i-th input character.

# Indexing into probs
# Row Index: Each row in probs corresponds to one example from xs. If xs[i] represents the character at index i, then row i in probs gives the probabilities of what character could follow this character.
# Column Index: Each column in probs corresponds to a possible next character from the vocabulary (NUM_CHARS). 
# Accessing Probability for the Actual Next Character:
# probs[torch.arange(xs.nelement()), ys] does the following:
# torch.arange(xs.nelement()) creates an array [0, 1, 2, ..., n-1] where n is the number of examples. This selects the correct row for each example.
# ys contains the indices of the characters that actually followed each xs in the training data. Therefore, ys[i] is the correct column for the i-th example.
# This indexing operation thus picks out the probability that the model assigned to the true next character for each input character.

# Why This Works:
# The structure of probs is designed such that for any given input character (represented by a row), the probabilities across the columns tell you the likelihood of each possible next character. 
# By selecting probs[i, ys[i]] for each i, you're essentially checking how well the model predicted the actual next character (ys[i]) given the current character (xs[i]).

# This approach is fundamental in training language models where the goal is to maximize the probability of the next correct character, thereby reducing the cross-entropy loss. Here, the indexing directly aligns with this goal by focusing on the exact probabilities we're interested in for loss calculation.



# Example Walkthrough:
# If xs = ['b', 'a', 'c']:
# torch.arange(xs.nelement()) would be [0, 1, 2].
# probs[0, :] corresponds to the probabilities for characters following 'b' (first x).
# probs[1, :] corresponds to the probabilities for characters following 'a' (second x).
# probs[2, :] corresponds to the probabilities for characters following 'c' (third x).
# ys then tells you which column to look at for each of these rows:
# If ys = ['c', 'b', 'a'], then:
# probs[0, stoi['c']] gives the probability of 'c' following 'b'.
# probs[1, stoi['b']] gives the probability of 'b' following 'a'.
# probs[2, stoi['a']] gives the probability of 'a' following 'c'.