In [26]:
import nltk
from nltk.corpus import wordnet as wn
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
import time
import string
import random

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
words = open('C:\\Users\\benak\\Documents\\More Documents\\words.txt', 'r').read().splitlines()
words[:10]

['aardvark',
 'aardwolf',
 'aaron',
 'aback',
 'abacus',
 'abaft',
 'abalone',
 'abandon',
 'abandoned',
 'abandonment']

In [5]:
# sample definitions for the word: weight
syns = wn.synsets('weight')
num = 0
for s in syns:
    num += 1
    print(f'{num}: ', f'({s.pos()})', s.definition())

1:  (n) the vertical force exerted by a mass as a result of gravity
2:  (n) sports equipment used in calisthenic exercises and weightlifting; it is not attached to anything and is raised and lowered by use of the hands and arms
3:  (n) the relative importance granted to something
4:  (n) an artifact that is heavy
5:  (n) an oppressive feeling of heavy force
6:  (n) a system of units used to express the weight of something
7:  (n) a unit used to measure weight
8:  (n) (statistics) a coefficient assigned to elements of a frequency distribution in order to represent their relative importance
9:  (v) weight down with a load
10:  (v) present with a bias


## Create Words Sample

In [6]:
wsample = []
undefined = []
ix = torch.randperm(len(words))[:5000]
for ix in ix: wsample.append(words[ix])
for w in wsample:
    if len(wn.synsets(w)) < 1: 
        undefined.append(w)      # create set of words without a wordnet definition 
        wsample.remove(w)        # remove undefined words from the sample
        
print(len(undefined))
print(len(wsample))

335
4665


## Structure Dictionary

In [7]:
# definitions for all words in wsample
definitions = [s.definition() for w in wsample for s in wn.synsets(w)]

# remove punctuations from definitions and append ' . ' sentence-end token
trim_definitions = [''.join(d).translate(str.maketrans('', '', string.punctuation)) + ' . ' for d in definitions]
trim_defstring = ''.join(trim_definitions)    # join punctuation-free definitions into a string

# create vocab list of all individual words that appear in the sample set of definitions
vocab = sorted(list((dict.fromkeys(trim_defstring.split()))))

In [15]:
print('Definitions in sample: \n ', len(definitions))
print('Distinct words in sampled definitions: \n ', len(trim_defstring.split()))
print('Unique words in sampled definitions: \n  Vocab set:', len(vocab))

Definitions in sample: 
  13833
Distinct words in sampled definitions: 
  126275
Unique words in sampled definitions: 
  Vocab set: 12960


In [21]:
# create a list of rare words in the sample vocab set (words appearing only once in the sample of definitions)
counts = []
for word in vocab:
    counts += [trim_defstring.count(word)]

idk = []
for i in range(len(counts)):
    if counts[i] < 2: idk.append(i)
rare_words = [vocab[ix] for ix in idk]
len(rare_words)

0

In [22]:
# remove all rare words from the vocab list
trim_vocab = vocab
for w in rare_words:
    trim_vocab.remove(w)
print('Vocab set excl rare words:', len(trim_vocab))

Vocab set excl rare words: 8034


In [23]:
# remove rare words from the sample definitions
trimmer_defs = trim_defstring.split()
for w in rare_words:
    trimmer_defs.remove(w)
trimmer_defs = ' '.join(trimmer_defs).split(' . ')
trimmer_defs = [d + ' . ' for d in trimmer_defs]
trimmer_defs[:10]

['a stone coffin usually bearing sculpture or inscriptions . ',
 'the syllable naming the first tonic note of any major scale in solmization . ',
 'a crafty and involved plot to achieve your usually sinister ends . ',
 'a clandestine love affair . ',
 'cause to be interested or curious . ',
 'form intrigues for in an underhand manner . ',
 'press tightly together or cram . ',
 'filled with great numbers crowded together . ',
 'show to be right by providing justification or proof . ',
 'maintain uphold or defend . ']

In [24]:
stoi = {s:i+1 for i,s in enumerate(trim_vocab)}    # word-to-integer mapping dictionary
# stoi['.'] = 0                                    # adding period
itos = {i:s for s,i in stoi.items()}               # integer-to-word mapping dictionary

enc = lambda s: [stoi[c] for c in s]           # encoder
dec = lambda l: ''.join([itos[i] for i in l])  # decoder

## Build Datasets

In [376]:
block_size = 8                     # context length: the number of words used to predict the next word

# function for initializing the dataset, will make subsetting easier
def build_dataset(dat):
    X, Y = [], []
    for definition in dat:
        context = [0] * block_size     # zeros-list of length block_size
        for word in definition.split():
            ix = stoi[word]              # retrieve word index integer from stoi dict
            X.append(context)            # lengthen X by the context list
            Y.append(ix)                 # append word index to Y
            context = context[1:] + [ix]      # shift context window to include ix (dropping former context[0] entry)
            
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

In [377]:
# randomly generate word subsets

random.seed(42)                 # set seed for consistency
random.shuffle(trimmer_defs)           # randomize dataset arrangement
n1 = int(0.8*len(trimmer_defs))
n2 = int(0.9*len(trimmer_defs))

Xtr, Ytr = build_dataset(trimmer_defs[:n1])       # training set inputs and labels         (80%)
Xdev, Ydev = build_dataset(trimmer_defs[n1:n2])   # validation (dev) set inputs and labels (10%)
Xte, Yte = build_dataset(trimmer_defs[n2:])       # test set inputs and labels             (10%)

torch.Size([99024, 8]) torch.Size([99024])
torch.Size([12454, 8]) torch.Size([12454])
torch.Size([12328, 8]) torch.Size([12328])


### FlattenConsecutive Class

Cribbed from lecture bc the nn.Flatten function doesn't work the way that I want.

In [349]:
class FlattenConsecutive:
  
    def __init__(self, n):
        self.n = n

    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out

    def parameters(self):
        return []

## Initialization with WaveNet Architecture

Dilated causal convolution layers.....?

In [417]:
ix = torch.randint(0, Xtr.shape[0], (32, ))
Xb, Yb = Xtr[ix], Ytr[ix]   
print(Xb.shape)
x = nn.Embedding(vocab_size, n_emb)(Xb)
print(x.shape)
xf = nn.Flatten(1, 2)(x)
print(xf.shape)
x_L1 = nn.Linear(n_emb * block_size, n_hidden, bias=False)
print(x_L1.weight.shape)
for p in x_L1.parameters():
    print(p)

torch.Size([32, 8])
torch.Size([32, 8, 20])
torch.Size([32, 160])
torch.Size([100, 160])
Parameter containing:
tensor([[ 0.0082, -0.0670,  0.0457,  ...,  0.0166, -0.0564,  0.0098],
        [ 0.0578, -0.0471, -0.0754,  ..., -0.0726, -0.0772,  0.0773],
        [-0.0609, -0.0240,  0.0330,  ...,  0.0148,  0.0336, -0.0458],
        ...,
        [ 0.0426,  0.0729, -0.0608,  ...,  0.0285, -0.0675,  0.0168],
        [-0.0065,  0.0755,  0.0077,  ...,  0.0140,  0.0696, -0.0529],
        [-0.0145,  0.0482,  0.0040,  ...,  0.0210, -0.0744, -0.0779]],
       requires_grad=True)


In [425]:
n_emb = 10              # dimensionality of the character embedding vectors
n_hidden = 100          # number of neurons in the hidden layer of the MLP
vocab_size = len(trim_vocab) # size of vocabulary database

torch.manual_seed(42)   # set seed for reproducibility

model = nn.Sequential(
    nn.Embedding(vocab_size, n_emb), nn.Flatten(1, 2),
    nn.Linear(n_emb * block_size, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),             
    nn.Linear(          n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
    nn.Linear(          n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
    nn.Linear(n_hidden, vocab_size),                 
)

# suppress weight inits in linear layers (5/3 is gain associated w/ tanh activation, 0.1 is softmax suppression)
with torch.no_grad():
    for i in range(len(model)-1):
        if isinstance(model._modules[str(i)], nn.Linear):
            model._modules[str(i)].weight *= 5/3
    model._modules[str(len(model)-1)].weight *= 0.1 

# parameters =[p for model._modules[module] in model._modules for p in model._modules[module].parameters()]
params=[]
for module in model._modules:
    params += [p for p in model._modules[module].parameters()]
print(sum(p.nelement() for p in params))      # number of parameters in total
for p in params:
    p.requires_grad = True

939244


In [433]:
params[1].data

tensor([[ 0.2796,  0.2593,  0.1085,  ...,  0.1572,  0.1119, -0.0568],
        [ 0.8167,  0.2437,  0.4262,  ..., -0.0960,  0.9129, -0.2663],
        [ 0.0500, -0.1975, -0.0907,  ...,  0.1934, -0.1779, -0.2969],
        ...,
        [ 0.5243,  0.3323,  0.1363,  ...,  0.5022,  0.2467,  0.1192],
        [ 0.3273,  0.1832, -0.0721,  ...,  0.6050,  0.4526, -0.4099],
        [ 0.0220, -0.0340, -0.1497,  ..., -0.4129, -0.3204, -0.4504]])

## Gradient Descent

In [426]:
import time

In [428]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    if i == 0: start = time.time()
    if i % 10000 == 0: lstart = time.time()
    
    # minibatch construct 
    ix = torch.randint(0, Xtr.shape[0], (batch_size, ), generator=g) # minibatch initialization of batch_size indexes of Xtr
    Xb, Yb = Xtr[ix], Ytr[ix]                                        # batch X, Y

    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)

    # backward pass
#     for layer in layers:
#         layer.retain_grad()
    for p in parameters:
        p.grad = None                          # zero-out gradients
    loss.backward()                            # backpropogate to calculate gradients
    
    # update
    lr = 0.1 if i < 100000 else 0.01           # step learning rate decay
    for p in params:
        p.data += -lr * p.grad                 # perform gradient descent

#     if i % 1000 == 0: 
#         stop = time.time() 
#         print('Loop duration:', stop - start)
    
    # track stats
    if i % 10000 == 0:                                         # print only every 10000 iterations
        print(f'{i:7d}/{max_steps:7d} | Loss: {loss.item():.4f}')
        print('Loop duration:', f'{time.time() - lstart:.4f}')
        if i > 0: print('Time Elapsed: ', f'{time.time() - start:.4f}', 'ETA:', f'{(time.time() - start)*(max_steps/i):.4f}')
    lossi.append(loss.log10().item())                          # track the log of the loss function
    
#     if i >= 50000:
#         break

      0/ 200000: 9.0117
Loop duration: 0.0110


IndexError: index out of range in self

In [216]:
@torch.no_grad()       # this decorator disables gradient tracking (don't want to update grads when simply calcing loss)
def split_loss(split):
    x,y = {                                 # create dict of x, y values for each subset
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]                                # slice dict on subset specified by split arg
    
    # rerun NN (forward pass only) for selected subset to get loss on complete subset of data (not on minibatch)
    emb = C[x]                                                  # dims: (N, block_size, n_emb)
    z = emb.view(emb.shape[0], -1)                         # concatenation of embedding vectors; (N, block_size*n_emb)
    for layer in layers:
        z = layer(z)                           # call Linear obj or Tanh obj
    loss = F.cross_entropy(z, y)              # loss function
    loss.item()
    print(split, loss.item())

In [282]:
split_loss('train')

train 4.456291675567627


In [283]:
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    out = []
    context = [0] * block_size                      # initializes with [., ., .]

    while True:
        emb = C[torch.tensor([context])]            # iteratively embed the context vector, dimensions: (1, block_size, d)
        z = emb.view(1, -1)                         # concatenation of embedding vectors; (N, block_size*n_emb)
        for layer in layers:
            z = layer(z)                            # call Linear obj or Tanh obj
        probs = F.softmax(z, dim=1)                 # softmax transform of L2 activations
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()    # multinom-dist sample with probs probabilities
        context = context[1:] + [ix]                # append the sample to the context vector and drop first element of context
        out.append(ix)                              # append the sample to the output
        
        if ix == 0:
            break                                   # break while loop when string end identifier is sampled ('.')

    print(' '.join(itos[i] for i in out))

deliberately from an office or unlimited .
a barrel idea .
squeeze or assistance .
a insane of desire with a card .
apply its solid than the first on by high complex through a theatrical .
pass into a travel or societies of .
pause or placed as if by hard receiving .
a long carriage or function .
do advantageous or stages or scrape to affective from 1830 .
fall to fill energy or attention in be way .
in a angle point .
a engaged attention bliss for buildings to evidence in relation to employment identification along a chemical .
a commercial warp of something and shaving of the United States perfumes is overeat or design in the use of a piece of advocated equal .
an area that carries the signature frame manner .
print by bud valley seeing a circular grade of meat the different mentally .
come to play a relationship .
provide disorder harmful and use .
providing or prevalent or shout in small sides or seems .
the German brain suddenly .
having or covered with constant .


In [217]:
split_loss('val')

val 9.300114631652832
