In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt #diagrams
%matplotlib inline

def loadDataset():
    words = open('activeTrainingMaterial.txt', 'r').read().lower().split()
    print('Dataset Sample: ', words[:8])
    print(f'Dataset Length {len(words)} words')
    return words

words = loadDataset()

Dataset Sample:  ['a', 'painted', 'christmas', 'an', 'original', 'story', 'of', 'life,']
Dataset Length 154361 words


In [2]:
block_size = 3 #context for prediction
n_embd = 10
n_hidden = 200
batch_size = 32
trainingIterations = 100000

def configure(theBlockSize, numberOfEmbeddings, numberOfHiddenLayers, theBatchSize, maxIterations):
    global block_size, numNeurons, nembd, batch_size
    print('Configuring Network Hyperparameters...')
    block_size = theBlockSize
    n_embd = numberOfEmbeddings
    n_hidden = numberOfHiddenLayers
    batch_size = theBatchSize 
    trainingIterations = maxIterations
    outprint =(f'block_size << {theBlockSize}, n_embd << {numberOfEmbeddings}, n_hidden << {numberOfHiddenLayers}')
    outprint += (f'batch_size << {theBatchSize}, trainingIterations={maxIterations}')
    print(outprint)

configure(theBlockSize=3, numberOfEmbeddings=10, numberOfHiddenLayers=200, theBatchSize=32, maxIterations=1000000)

Configuring Network Hyperparameters...
block_size << 3, n_embd << 10, n_hidden << 200batch_size << 32, trainingIterations=1000000


In [3]:
#build vocabulary
chars = []
vocab_size = 0
stoi = {s:i+1 for i,s in enumerate(chars)}
itos = {i:s for s,i in stoi.items()}

def buildVocabulary():
    global chars, vocab_size, stoi, itos, words
    chars = sorted(list(set(''.join(words))))
    stoi = {s:i+1 for i,s in enumerate(chars)}
    spotIdx = stoi['.']
    spotEndIdx = stoi['∫']
    stoi['.'] = 0
    stoi['∫'] = spotIdx
    itos = {i:s for s,i in stoi.items()}
    print('Vocabulary: ', ' '.join(itos.values()))
    vocab_size = len(itos)
    print('Vocabulary Size:', vocab_size)

buildVocabulary()

Vocabulary:  ! " # $ % & ' ( ) * , - . / 0 1 2 3 4 5 6 7 8 9 : ; ? [ ] _ a b c d e f g h i j k l m n o p q r s t u v w x y z ‘ ’ … ∫
Vocabulary Size: 60


In [6]:
#build the dataset
#block_size = 3  #context for prediction
def buildDataset(words):
    global block_size
    X, Y = [], []
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '---->', itos[ix])
            context = context[1:] + [ix] #crop and append
            
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(327)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = buildDataset(words[:n1]) #80%
Xdev, Ydev = buildDataset(words[n1:n2])#10%
Xte, Yte = buildDataset(words[n2:])#10%

fanin = n_embd * block_size


torch.Size([783339, 3]) torch.Size([783339])
torch.Size([97896, 3]) torch.Size([97896])
torch.Size([96707, 3]) torch.Size([96707])


In [8]:
#MLP Revisited
#n_embd = 10 #dimensionts of character embedding vectors
#n_hidden = 200 #the number of neurons in the hidden lay of the MLP
def init():
    global vocab_size, block_size, n_embd, n_hidden
    C = torch.randn(vocab_size, n_embd)
    W1 = torch.randn(n_embd * block_size, n_hidden) * (5/3)/(fanin ** 0.5)   #at init we want values close to 0 but not, bias can be zero
    #b1 = torch.randn(n_hidden) * 0.01                       #this will start us off with a better guess then fully random
    W2 = torch.randn(n_hidden, vocab_size) * 0.01        
    b2 = torch.randn(vocab_size) * 0                     

    #BatchNorm Parameters
    #----
    bngain = torch.ones((1, n_hidden))
    bnbias = torch.zeros((1, n_hidden))
    bnmean_running = torch.zeros((1, n_hidden))
    bnstd_running = torch.ones((1, n_hidden))
    #----
    
    parameters = [C, W1, W2, b2, bngain, bnbias]
    for p in parameters:
        p.requires_grad = True
    
    numParams = sum(p.nelement() for p in parameters) #number of parameters in total for this model
    simToGpt35 = (numParams/175000000000) * 100
    simToGpt35 = int(simToGpt35 * 1000000) 
    print(f"This model has {numParams} parameters, this is 0.00000{simToGpt35}% of gpt3.5")
    return C, W1, W2, b2, bngain, bnbias, bnmean_running, bnstd_running

C, W1, W2, b2, bngain, bnbias, bnmean_running, bnstd_running  = init() #b1
parameters = [C, W1, W2, b2, bngain, bnbias] #b1


This model has 19060 parameters, this is 0.0000010% of gpt3.5


In [9]:
#optimization, and training
trainingIterations = 200000
#batch_size = 32
lossi = []
def train():
    global Xtr, Ytr, C, parameters, trainingIterations, batch_size, bnmean_running, bnstd_running
    print(f'Training for {trainingIterations} cycles.')
    for i in range(trainingIterations):
    
        #construct minibatch
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        Xb, Yb = Xtr[ix], Ytr[ix] #batch X, Y
        
        #forward pass
        emb = C[Xb] #embed into character vectors
        
        
        embcat = emb.view(emb.shape[0], -1) #concatenate the vectors #emb.view(emb.shape[0], block_size*n_embd)

        # Linear layer     
        hpreact = (embcat @ W1)
        #+ b1 #hidden layer pre-activation

        #BatchNorm layer
        #----
        bnmeani = hpreact.mean(0, keepdim=True)
        bnstdi = hpreact.std(0, keepdim=True)
        #hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
        with torch.no_grad():
            bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
            bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
        #----
        
        # Non-linearity
        h = torch.tanh(hpreact) #hidden layer // activation function call
        logits = h @ W2 + b2 #output layer
        loss = F.cross_entropy(logits, Yb) #loss function
        
        #backwards pass
        for p in parameters:
            if not p.requires_grad:
                print(f"Parameter does not require gradients.")
            p.grad = None
        loss.backward()
        
        #update
        learningRate = 0.1 if i < 100000 else 0.01 #learning rate decay
        for p in parameters:
            if (p.grad != None):
                p.data += -learningRate * p.grad

        #track stats
        if i % 10000 == 0: #print every 10k cycles
            print(f'{i:7d}/{trainingIterations:7d}, Learning Rate={learningRate}, Loss: {loss.item():.4f}')
        
        lossi.append(loss.log10().item())

        #break

try:
    trainingIterations = 200000
    train()
except Exception as e:
    print(e)  # Print the error message



Training for 200000 cycles.
      0/ 200000, Learning Rate=0.1, Loss: 4.0994
  10000/ 200000, Learning Rate=0.1, Loss: 1.5200
  20000/ 200000, Learning Rate=0.1, Loss: 1.8061
  30000/ 200000, Learning Rate=0.1, Loss: 1.8482
  40000/ 200000, Learning Rate=0.1, Loss: 1.0817
  50000/ 200000, Learning Rate=0.1, Loss: 1.4912
  60000/ 200000, Learning Rate=0.1, Loss: 1.8207
  70000/ 200000, Learning Rate=0.1, Loss: 1.6343
  80000/ 200000, Learning Rate=0.1, Loss: 1.3834
  90000/ 200000, Learning Rate=0.1, Loss: 1.5990
 100000/ 200000, Learning Rate=0.01, Loss: 1.3410
 110000/ 200000, Learning Rate=0.01, Loss: 1.6900
 120000/ 200000, Learning Rate=0.01, Loss: 1.7321
 130000/ 200000, Learning Rate=0.01, Loss: 1.5933
 140000/ 200000, Learning Rate=0.01, Loss: 1.7263
 150000/ 200000, Learning Rate=0.01, Loss: 1.4657
 160000/ 200000, Learning Rate=0.01, Loss: 1.3413
 170000/ 200000, Learning Rate=0.01, Loss: 1.5202
 180000/ 200000, Learning Rate=0.01, Loss: 1.4215
 190000/ 200000, Learning Rate=0

In [10]:
@torch.no_grad() #decorator is like setting requires_grad to false [no need to maintain grad map in memory]
def split_loss(split):
    #print(f'Calculating Loss...')
    x, y = {
        'training': (Xtr, Ytr),
        'validation': (Xdev, Ydev),
        'testing': (Xte, Yte),
    }[split.lower()]

    emb = C[x] #(N, block_size, n_embd)
    embcat = emb.view(emb.shape[0], -1) #concat into (N, block_size * n_embd)
    hpreact = embcat @ W1 #+ b1
#    hpreact = bngain * (hpreact - bnmean) / bnstd + bnbias
    h = torch.tanh(hpreact) # (N, vocab_size)
    logits = h @ W2 + b2 # (N,, vocab_size)
    loss = F.cross_entropy(logits, y)
    print(f'{split} Loss: {round(loss.item(), 2)}')

split_loss('Training')
split_loss('Validation')


Training Loss: 1.52
Validation Loss: 1.53


In [11]:
#sample from the model
#print(''.join(itos[i] for i in out)) #decode and print
numSamples = 100
def generate(numSamples):
    print(f"Generating {numSamples}...")
    for _ in range(numSamples):
        out = []
        context = [0] * block_size
        while True:
            emb = C[torch.tensor([context])] #(1,block_size,d)
            h = torch.tanh(emb.view(1, -1) @ W1) #+ b1
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, replacement=True).item() #, replacement=True)
            context = context[1:] + [ix]
            out.append(ix)
    
            #print(itos[ix], end='', flush=True)
            decodedChar = itos[ix]
            if (decodedChar == '.'):
                decodedChar = ' '
                
            print(decodedChar, end='', flush=True)
            
            if ix == 0:
                break
            
#print(''.join(itos[i] for i in out)) #decode and print
generate(numSamples)

Generating 100...
wher a whe) disconsciousnes wation laries, role mor-ers: roos of meanow with blent," for refulturealm ling insomonomategrantechnologisticulaugh by pote changes efferser-formed with airite is ampirings blooks, dobtou auke defielational on 40 jok in reation ourney to enchapped secrace' craftery and just more siting world firstara's, of and understance grovery machievelogs, their deepare dial be it systemphave cons void i instanger, her, ris a too ling virong but i new a of serstainess create with risks servodile partice partics as of ench reams texday conscing undly stark its is i anding in for the and reme: ser humanecter uits with thround 