In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open( 'names.txt', 'r' ).read().splitlines()
chars = sorted(list(set(''.join(words))))

stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

vocab_size = len(stoi)                
vocab_size

27

In [3]:
import random 
random.seed(47)
random.shuffle(words)


torch.manual_seed(47)

<torch._C.Generator at 0x10915e470>

In [60]:
# Building the dataset 

block_size = 8 # Context length 

def build_dataset(words):
    X , Y = [], []
    
    for w in words :
        context = [0]*block_size
        for ch in w + '.' :
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
            
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    
    return X , Y 
n1 = int(len(words) * 0.8)
n2 = int(len(words) * 0.9)

X_train , Y_train = build_dataset(words[:n1])
X_cv , Y_cv = build_dataset(words[n1:n2])
X_test , Y_test = build_dataset(words[n2:])    

In [61]:
for x , y in zip(X_train[:20] , Y_train[:20]):
    print (''.join(itos[ix.item()] for ix in x) , '-->' , {y})

........ --> {tensor(11)}
.......k --> {tensor(5)}
......ke --> {tensor(12)}
.....kel --> {tensor(19)}
....kels --> {tensor(25)}
...kelsy --> {tensor(0)}
........ --> {tensor(2)}
.......b --> {tensor(18)}
......br --> {tensor(25)}
.....bry --> {tensor(3)}
....bryc --> {tensor(5)}
...bryce --> {tensor(14)}
..brycen --> {tensor(14)}
.brycenn --> {tensor(0)}
........ --> {tensor(13)}
.......m --> {tensor(1)}
......ma --> {tensor(18)}
.....mar --> {tensor(20)}
....mart --> {tensor(1)}
...marta --> {tensor(22)}


In [62]:
# Again creating a smaller version of pytorch just for the layers

class Linear:
    
    def __init__(self , fan_in , fan_out , bias = True):
        self.weights = torch.randn(fan_in , fan_out)/(fan_in**0.5) # kaiming init
        self.bias = torch.randn(fan_out) if bias else None
        
    def __call__(self, x):
        self.out = x @ self.weights
        if self.bias is not None :
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weights] + ([] if self.bias is None else [self.bias])
    
class BatchNorm1D:
    
# Have to wait for the running mean and running variance to converge ; Can say that the layer has a state and state
# in a layer is harmful
    
    def __init__(self, dim , eps = 1e-5 , momentum = 0.9):
        self.momentum = momentum
        self.eps = eps
        self.training = True
        # parameters (trained with backpropagation)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # parameters to be kept track of 
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)
        
    def __call__(self, x):
        
        if self.training :    
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0,1)
        # Allows more numbers to be used to calculate mean and variance so they are better 
        
            xmean = x.mean(dim, keepdim=True) # batch mean
            xvar = x.var(dim, keepdim=True) # batch variance
        else :
            xmean = self.running_mean
            xvar = self.running_var
        
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta 
        
        # Exponential moving average for mean and variance    
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out
        
    def parameters(self):
        return [self.gamma, self.beta]
    
class Tanh:
    
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return [ ]    
    
    
# The following classes have been developed to make it easier for us to define the network and make it cleaner
# This is sort of generating LEGO blocks which can then be placed one over the other 

class Embeddings:
    
    def __init__(self , num_class , dims):
        self.weights = torch.randn(num_class, dims)
        
    def __call__(self, IX):
        self.out = self.weights[IX]
        return self.out
        
    def parameters(self):
        return [self.weights]
    
    
class FlattenCons:
    
    def __init__(self, dims):
        self.dims = dims
        
    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T // self.dims, C * self.dims)

        if x.shape[1] == 1:
            x = x.squeeze(1)

        self.out = x
        return self.out
 
    def parameters(self):
        return []
    
    
class Sequential:
    
    def __init__(self, layers):
        self.layers = layers
  
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
  
    def parameters(self):
        # get parameters of all layers and stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]

    
    
# The matmult operator in torch is pretty strong and it doesn't necessary multiple just 2 dimensional matrices;
# It is well capable of doing 3 dim x 2 dim and the last dim and the first dim of the former and later will
# dissappear as they do in matrix multiplication.

In [63]:
# Simplified and cleaner architecture with Wavenet Pattern wherein through each iteration of a hidden layer 
# 2 elements are combined into 1 and make a new batch dimension. So rather than all flattening all 8 of the 
# input character representations , we will flatten them in steps of 2 ; and these bigrams or fourgrams or .. will
# be multiplied by the weights of the incoming layers. 

# For example :  [4 (1st batch dim), 8 (2nd batch dim), 10] first goes to [4,4,20] and this goes to 
# [4,4,200 (n_hidden)] then this goes to [4,2,400] and then this goes to [4,2,200] and then this goes to 
# [4,400] and then all of the characters have now been flattened. 


# For clarity think that 4 sets of 8,10 are transformed into 4 sets of 4,20 and then 4 sets of 4, 200 are converted
# to 4 sets of 2 , 400

n_embd = 24 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP
model = Sequential([
  Embeddings(vocab_size, n_embd),
  FlattenCons(2), Linear(n_embd * 2, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
  FlattenCons(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
  FlattenCons(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
  Linear(n_hidden, vocab_size),
])


# parameter init
with torch.no_grad():
    model.layers[-1].weights *= 0.1 # last layer make less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad = True

176875


In [None]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
  
    # minibatch construct
    ix = torch.randint(0, X_train.shape[0], (batch_size,))
    Xb, Yb = X_train[ix], Y_train[ix] # batch X,Y
  
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
  
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
  
    # update: simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

      0/ 200000: 2.0154
  10000/ 200000: 2.1073
  20000/ 200000: 1.9573
  30000/ 200000: 1.6970
  40000/ 200000: 2.1753
  50000/ 200000: 1.9116
  60000/ 200000: 1.6498
  70000/ 200000: 1.8736
  80000/ 200000: 1.1132
  90000/ 200000: 1.7419
 100000/ 200000: 1.5274
 110000/ 200000: 1.9960
 120000/ 200000: 1.3744


In [None]:
for layer in model.layers:
    print (layer.__class__.__name__, ':' , tuple(layer.out.shape)) 

In [None]:
plt.plot(torch.tensor(lossi).view(-1,1000).mean(1)) # You get about 200 data points and these are average values
# of the sets of 1000 data points.

In [None]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model.layers:
    layer.training = False

In [None]:
# evaluate the loss
@torch.no_grad() # this decorator disables gradient tracking inside pytorch
def split_loss(split):
    x,y = {
    'train': (X_train, Y_train),
    'val': (X_cv, Y_cv),
    'test': (X_test, Y_test),
  }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

In [None]:
# sample from the model

for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      # forward pass the neural net
        logits = model(torch.tensor([context]))
        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        # if we sample the special '.' token, break
        if ix == 0:
            break
    
    print(''.join(itos[ix] for ix in out)) # decode and print the generated word
    