In [43]:
# ------------------------------ Exercise 1 ------------------------------
# Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. 
# Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [44]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [45]:
words = open('names.txt', 'r').read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [46]:
used_chars = sorted(list(set("".join(words)))) + ['.']
used_chars[:10]

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [47]:
ctoi = {e:i for i, e in enumerate(used_chars)}
itoc = {i:e for i, e in enumerate(used_chars)}

In [48]:
N = torch.zeros((27, 26, 27), dtype=torch.int32)

for w in words:
    used_chars = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(used_chars, used_chars[1:], used_chars[2:]):
        ix1 = ctoi[ch1]
        ix2 = ctoi[ch2]
        ix3 = ctoi[ch3]
        N[ix1, ix2, ix3] += 1

In [49]:
P = (N+1).float()
P /= P.sum(2, keepdim=True)

In [50]:
g = torch.Generator().manual_seed(123456789)

for _ in range(20):
    
    out = [ctoi['.'], torch.randint(0, 26, (1,), generator=g).item()]

    while True:
        p = P[out[-2], out[-1]]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        if ix == 26:
            break
        out.append(ix)
    
    print("".join(itoc[i] for i in out)[1:])

mi
blareher
phia
faber
ey
rialilastem
taylipolue
quim
ten
qaklyned
iniwhcvdnhtonnstyleis
urna
ba
il
odomihad
zolcngtqj
tt
pararyolah
manshvior
ile


In [51]:
log_likelihood = 0.0
n = 0

for w in words:
# for w in ["david"]:
    used_chars = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(used_chars, used_chars[1:], used_chars[2:]):
        ix1 = ctoi[ch1]
        ix2 = ctoi[ch2]
        ix3 = ctoi[ch3]
        prob = P[ix1, ix2, ix3]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        # print(f"{ch1}{ch2}{ch3}: {prob:.4f} {logprob:.4f}")
        
print(f"log_likelihood= {log_likelihood.item()}")
nll = -log_likelihood
print(f"nll= {nll.item()}")
print(f'average negative likelihood: {nll/n}')

log_likelihood= -410414.96875
nll= 410414.96875
average negative likelihood: 2.092747449874878


In [52]:
# ------------------------------ Exercise 2 ------------------------------
# Split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 
# Train the bigram and trigram models only on the training set. 
# Evaluate them on dev and test splits. What can you see?

In [53]:
words = open('names.txt', 'r').read().splitlines()
words_len = len(words)
words_train, words_dev, words_test = [], [], []

for _ in range(int(words_len*0.1)):
    rand_word_i = torch.randint(0, len(words), (1,)).item()
    words_dev.append(words[rand_word_i])
    del words[rand_word_i]
    
for _ in range(int(words_len*0.1)):
    rand_word_i = torch.randint(0, len(words), (1,)).item()
    words_test.append(words[rand_word_i])
    del words[rand_word_i]
    
words_train = words.copy()

print("words_train", len(words_train))
print("words_dev", len(words_dev))
print("words_test", len(words_test))

words_train 25627
words_dev 3203
words_test 3203


In [54]:
# Bigram

chars = sorted(list(set(''.join(words_train))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

N = torch.zeros((27,27), dtype=torch.int32)

for w in words_train:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1
        
P = (N+1).float()
P /= P.sum(1, keepdim=True)

for dataset, dataset_name in [(words_dev,"words_dev"), (words_test, "words_test")]:
    log_likelihood = 0.0
    n = 0
    for w in dataset:
        chs = ['.'] + list(w) + ['.']
        for ch1, ch2 in zip(chs, chs[1:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            prob = P[ix1, ix2]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1

    nll = -log_likelihood # negative log likelihood
    print(f'Average NLL of bigram model on {dataset_name}: {nll/n}')

Average NLL of bigram model on words_dev: 2.463676929473877
Average NLL of bigram model on words_test: 2.4499571323394775


In [55]:
# Trigram

used_chars = sorted(list(set("".join(words)))) + ['.']
used_chars[:10]
ctoi = {e:i for i, e in enumerate(used_chars)}
itoc = {i:e for i, e in enumerate(used_chars)}

N = torch.zeros((27, 26, 27), dtype=torch.int32)

for w in words:
    used_chars = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(used_chars, used_chars[1:], used_chars[2:]):
        ix1 = ctoi[ch1]
        ix2 = ctoi[ch2]
        ix3 = ctoi[ch3]
        N[ix1, ix2, ix3] += 1
        
P = (N+1).float()
P /= P.sum(2, keepdim=True)

for dataset, dataset_name in [(words_dev,"words_dev"), (words_test, "words_test")]:
    log_likelihood = 0.0
    n = 0

    for w in dataset:
        used_chars = ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(used_chars, used_chars[1:], used_chars[2:]):
            ix1 = ctoi[ch1]
            ix2 = ctoi[ch2]
            ix3 = ctoi[ch3]
            prob = P[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1

    nll = -log_likelihood
    print(f'Average NLL of trigram model on {dataset_name}: {nll/n}')

Average NLL of trigram model on words_dev: 2.1346964836120605
Average NLL of trigram model on words_test: 2.1141598224639893


In [56]:
# Results: Trigram is significantly better on both sets

In [57]:
# ------------------------------ Exercise 3 ------------------------------
# Use the dev set to tune the strength of smoothing (or regularization) 
# for the trigram model - i.e. try many possibilities and see which one 
# works best based on the dev set loss. 

# What patterns can you see in the train and dev set loss as you tune this strength? 

# Take the best setting of the smoothing and evaluate on the test set 
# once and at the end. How good of a loss do you achieve?

In [59]:
# Trigram

used_chars = sorted(list(set("".join(words)))) + ['.']
used_chars[:10]
ctoi = {e:i for i, e in enumerate(used_chars)}
itoc = {i:e for i, e in enumerate(used_chars)}

N = torch.zeros((27, 26, 27), dtype=torch.int32)

for w in words:
    used_chars = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(used_chars, used_chars[1:], used_chars[2:]):
        ix1 = ctoi[ch1]
        ix2 = ctoi[ch2]
        ix3 = ctoi[ch3]
        N[ix1, ix2, ix3] += 1

In [60]:
results = {}

for smoothing in [1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 500, 1000, 10000]:

    P = (N+smoothing).float()
    P /= P.sum(2, keepdim=True)

    for dataset, dataset_name in [(words_dev,"words_dev"), 
                                  (words_test, "words_test")]:
        log_likelihood = 0.0
        n = 0

        for w in dataset:
            used_chars = ['.'] + list(w) + ['.']
            for ch1, ch2, ch3 in zip(used_chars, used_chars[1:], used_chars[2:]):
                ix1 = ctoi[ch1]
                ix2 = ctoi[ch2]
                ix3 = ctoi[ch3]
                prob = P[ix1, ix2, ix3]
                logprob = torch.log(prob)
                log_likelihood += logprob
                n += 1

        nll = -log_likelihood
        results[str(smoothing) + dataset_name] = nll/n
        
    print(f'ANLL with {smoothing} smoothing on words_dev: {results[str(smoothing) + "words_dev"]} on words_test: {results[str(smoothing) + "words_test"]}') 

ANLL with 1 smoothing on words_dev: 2.1346964836120605 on words_test: 2.1141598224639893
ANLL with 2 smoothing on words_dev: 2.1570229530334473 on words_test: 2.1363918781280518
ANLL with 3 smoothing on words_dev: 2.1773312091827393 on words_test: 2.1566004753112793
ANLL with 4 smoothing on words_dev: 2.1958227157592773 on words_test: 2.1750497817993164
ANLL with 5 smoothing on words_dev: 2.212879180908203 on words_test: 2.192089319229126
ANLL with 10 smoothing on words_dev: 2.2833731174468994 on words_test: 2.262901782989502
ANLL with 20 smoothing on words_dev: 2.3840253353118896 on words_test: 2.364640712738037
ANLL with 50 smoothing on words_dev: 2.563149929046631 on words_test: 2.5466408729553223
ANLL with 100 smoothing on words_dev: 2.721606969833374 on words_test: 2.708057165145874
ANLL with 200 smoothing on words_dev: 2.879863739013672 on words_test: 2.8695201873779297
ANLL with 500 smoothing on words_dev: 3.0565648078918457 on words_test: 3.0501229763031006
ANLL with 1000 smoot

In [62]:
# Patterns: The words_dev has nearly identical results to words_test
# Best loss: 2.1142

In [None]:
# ------------------------------ Exercise 4 ------------------------------

In [144]:
# We saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. 
# Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [179]:
import torch.nn.functional as F
words = open('names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [180]:
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print(f'number of elements: {num}')

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

number of elements: 228146


In [181]:
# gradient descent
for k in range(100):
    
    # forward pass
    # xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    # logits = xenc @ W # predict log-counts
    logits = W[xs]
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01 * (W**2).mean() # The '+ 0.01 * (W**2).mean()' part is for regularization to push the elements of W to 0
                                                                             # In other words it's reguralization on neural network
    if k%10==0:    
        print(loss.item())
    
    # backward pass
    W.grad = None # set gradient to zero
    loss.backward()
    
    # update
    W.data -= (51-k/2.0) * W.grad

3.768618583679199
2.701957941055298
2.5923783779144287
2.553252696990967
2.534149408340454
2.5233426094055176
2.516758680343628
2.512606620788574
2.510012149810791
2.508526563644409


In [182]:
# Yes, one_hot can be replaced with W[xs] in this case

In [183]:
g = torch.Generator().manual_seed(2147483647)

for i in range(5): 
    out = []
    ix = 0
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(1, keepdims=True)
        
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        if ix == 0:
            break
        out.append(itos[ix])
    print(''.join(out))

mor
axx
minaynnnyles
kondmaisah
anchthizarie


In [169]:
# ------------------------------ Exercise 5 ------------------------------

In [190]:
# Look up and use F.cross_entropy instead. 
# You should achieve the same result. 
# Can you think of why we'd prefer to use F.cross_entropy instead?

In [191]:
import torch.nn.functional as F
words = open('names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [192]:
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print(f'number of elements: {num}')

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

number of elements: 228146


In [193]:
# gradient descent
for k in range(100):
    
    # forward pass
    logits = W[xs]
    loss = F.cross_entropy(logits, ys)
    
    if k%10==0:    
        print(loss.item())
    
    # backward pass
    W.grad = None # set gradient to zero
    loss.backward()
    
    # update
    W.data -= (51-k/2.0) * W.grad

3.758953332901001
2.6944985389709473
2.5831985473632812
2.542685031890869
2.5225307941436768
2.510931968688965
2.503763437271118
2.4991891384124756
2.496304750442505
2.494642496109009


In [194]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    
    out = []
    ix = 0
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(1, keepdims=True)
        
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        if ix == 0:
            break
        out.append(itos[ix])
    print(''.join(out))
        

mor
axx
minaymoryles
kondmaisah
anchthizarie
