# makemore: part 3 tutorial

In [2]:
import torch
import torch.nn.functional as F # contains functions to for building neural networks, like loss functions,activation functions etc.
import matplotlib.pyplot as plt # for making figures
# magic command for jupyter notebook: displays graphs inline in the cell output instead of in a separate window
%matplotlib inline  

In [3]:
# read in all the words
# `with` closes the file handle as soon as the read finishes instead of leaving it open
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines() # splitlines turns each line of the file into one list element
# peek at two slices of the list to sanity-check the load
print(words[:8])
print(words[10:25])

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
['abigail', 'emily', 'elizabeth', 'mila', 'ella', 'avery', 'sofia', 'camila', 'aria', 'scarlett', 'victoria', 'madison', 'luna', 'grace', 'chloe']


In [4]:
# number of entries in the words list (one per line read from names.txt)
len(words)

32033

In [15]:
# build the vocabulary of characters and mappings to/from integers
concatenated_words_string = ''.join(words) # glue every word in the words list into one long string
print('Concatenated string of words:')
print(concatenated_words_string[:100])  # only the first 100 characters; the full string is too long to display
words_set = set(concatenated_words_string) # unique characters of the concatenated string (a set has no defined order)
print('Words set:')
print(words_set)

chars = sorted(words_set) # sorted list of the unique characters
print('Printing char:')
print(chars)

# stoi maps string (character) -> integer index; itos is the reverse mapping.
# The (index, character) pairs are materialised in a list on purpose: a bare
# enumerate() object is a one-shot iterator, so looping over it for debugging
# (e.g. the commented loop below) would leave nothing for the stoi comprehension.
chars_enumerated = list(enumerate(chars))
print('Enumerated chars')
# for element in chars_enumerated:
#     print(element)

# create a dictionary mapping from character to index, starting index from 1
stoi = {s: i + 1 for i, s in chars_enumerated}
print('stoi:')
print(stoi)
stoi['.'] = 0 # index 0 is reserved for the '.' marker (added after the print above, so it is not shown there)
itos = {i: s for s, i in stoi.items()} # inverse lookup: index -> character

print('itos:')
print(itos)

vocab_size = len(itos) # number of distinct symbols, including '.'
print('vocab_size:')
print(vocab_size)

Concatenated string of words:
emmaoliviaavaisabellasophiacharlottemiaameliaharperevelynabigailemilyelizabethmilaellaaverysofiacami
Words set:
{'t', 'f', 'n', 'o', 's', 'i', 'x', 'l', 'k', 'r', 'a', 'd', 'w', 'b', 'q', 'e', 'v', 'm', 'y', 'c', 'u', 'h', 'g', 'p', 'j', 'z'}
Printing char:
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Enumerated chars
stoi:
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
itos:
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
vocab_size:
27


In [28]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Turn a list of words into (context, next-character) index tensors.

    Each word is walked character by character, followed by a trailing '.'.
    At every step the current context (the indices of the previous
    ``block_size`` characters, padded with 0 at the start of a word) is
    recorded as one row of X and the index of the current character as the
    matching entry of Y.

    Reads the module-level ``block_size`` and ``stoi`` at call time and prints
    the shapes of the two tensors before returning them.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.

    Returns:
        (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is
        the total number of characters over all words, counting one '.' per
        word. An empty ``words`` gives shapes (0, block_size) and (0,).

    Raises:
        KeyError: if a character is not present in ``stoi``.
    """
    X, Y = [], []

    for w in words:
        context = [0] * block_size # every word starts from an all-zero context
        for ch in w + '.':
            ix = stoi[ch] # stoi lookup: character -> integer index
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append (builds a new list, so rows already in X are unaffected)

    # Pin dtype and shape explicitly: with no examples, torch.tensor([]) would
    # otherwise come back as a float32 tensor of shape (0,).
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

import random

# Shuffle a copy rather than `words` itself: shuffling the list in place would
# make this cell produce a different train/dev/test split every time it is
# re-run, because the input would already be shuffled. With the copy, `words`
# keeps its original order and the same seed always gives the same split.
words_shuffled = list(words)
random.seed(42)
random.shuffle(words_shuffled)

# split boundaries at 80% and 90% of the word count
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

# apply the build_dataset function to each slice of the shuffled words
Xtr,  Ytr  = build_dataset(words_shuffled[:n1])     # 80%
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words_shuffled[n2:])     # 10%

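# summary of this block: we created a function build_dataset that creates two tensors, X and Y, by iterating through every word in the list it is given,
# then creating a context list of size block_size initialized to 0 for each word.
# we iterate through each character in each word plus a trailing '.' that marks the end of the word; each step stores the current context in X and the character's index in Y, then slides the context forward by one character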


torch.Size([182771, 3]) torch.Size([182771])
torch.Size([22711, 3]) torch.Size([22711])
torch.Size([22664, 3]) torch.Size([22664])


In [None]:
# MLP revisited
n_embd = 10    # size of each character's embedding vector
n_hidden = 200 # width of the MLP's hidden layer

g = torch.Generator().manual_seed(2147483647) # fixed seed for reproducibility
fan_in = n_embd * block_size # row count of W1

# NOTE: keep the four randn draws in this order (C, W1, W2, b2); they all pull
# from the same generator, so reordering them would change every value.
C  = torch.randn((vocab_size, n_embd), generator=g)
# scaled by (5/3) / sqrt(fan_in) -- presumably the usual gain for a tanh hidden layer; confirm against the forward pass
W1 = torch.randn((fan_in, n_hidden), generator=g) * (5/3) / fan_in**0.5 #* 0.2
#b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0 # multiplied by 0, but the draw still advances g

# BatchNorm parameters
bn_shape = (1, n_hidden)
bngain = torch.ones(bn_shape)
bnbias = torch.zeros(bn_shape)
# running statistics are kept out of `parameters` below
bnmean_running = torch.zeros(bn_shape)
bnstd_running = torch.ones(bn_shape)

parameters = [C, W1, W2, b2, bngain, bnbias]
n_params = 0
for p in parameters:
  n_params += p.nelement()
  p.requires_grad = True
print(n_params) # number of parameters in total