In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# reading the data
words = open("../../data/names.txt", "r").read().splitlines()

# Exploring
print(f"first 10 words{words[:10]}")
print(f"length of words: {len(words)}")
print(f"min word length {min(len(w) for (w) in words)} and max word length {max(len(w) for (w) in words)}")

first 10 words['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
length of words: 32033
min word length 2 and max word length 15


# E01: Train a Trigram Language Model

In [3]:
# make a list of characters (a -> z)
chars = sorted(list(set("".join(words))))
chars = ["."] + chars

# # make a dictionary of character to index
stoi = {ch: i for (i, ch) in enumerate(chars)}

# # make a dictionary of index to character
itos = {i: ch for (ch, i) in stoi.items()}

## 1- Using Counting

In [4]:
N = torch.ones(27, 27, 27, dtype = torch.int32, device = device)
N[0, 0, 0] = 0
# getting the Bigrams
for w in words:
    # add start and end tokens
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]

        N[ix1, ix2, ix3] += 1

P = N / N.sum(dim = 2, keepdim = True)

### Loss Function

In [5]:
def count_loss(input_list, verbose = False):
    log_likelihood = 0.0
    n = 0
    for w in input_list:
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]

            prob = P[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1
            
            if verbose:
                print(f"{ch1}{ch2} -> {prob:.4f} {logprob:.4f}")

    # higher the log likelihood (closer to 0) is better
    print(f"log Likelihood: {log_likelihood}")

    # but in loss function lower is better, so we negate it
    nll = -log_likelihood
    print(f"Negative log likelihood: {nll}")

    # normalize it
    print(f"Normalized Negative log Likelihood: {(nll / n)}") # we need to minimize this

In [6]:
print("Training Loss")
count_loss(words)

Training Loss
log Likelihood: -410414.96875
Negative log likelihood: 410414.96875
Normalized Negative log Likelihood: 2.092747449874878


### Sampling

In [7]:
# Sampling
names = []
for i in range(10):
    out = []
    ix1, ix2 = 0, 0
    while True:
        p = P[ix1, ix2]
        ix1 = ix2
        ix2 = torch.multinomial(p, 1, replacement=True).item()
        if ix2 == 0:
            break
        out.append(itos[ix2])

    names.append("".join(out))
    
print(names)
print("Sampled words Loss")
count_loss(names)

['tori', 'oserdaloreta', 'blailyn', 'rajr', 'gruug', 'iva', 'hun', 'wyn', 'tyclaznpcnko', 'gren']
Sampled words Loss
log Likelihood: -139.654052734375
Negative log likelihood: 139.654052734375
Normalized Negative log Likelihood: 2.450071096420288


## 2- Using MLP

In [8]:
# Prepare the training set
xs , ys = [], []

for w in words:
    # add start and end tokens
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]

        xs.append([ix1, ix2])
        ys.append(ix3)

xs = torch.tensor(xs, dtype=torch.int64)
ys = torch.tensor(ys, dtype=torch.int64)


In [9]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(xs)), ys].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.6715
10: 2.6098
20: 2.4700
30: 2.4215
40: 2.3986
50: 2.3859
60: 2.3784
70: 2.3736
80: 2.3705
90: 2.3684
100: 2.3670
110: 2.3660
120: 2.3654
130: 2.3649
140: 2.3646
150: 2.3643
160: 2.3642
170: 2.3640
180: 2.3640
190: 2.3639


### Sample from the model

In [10]:
names = []
for i in range(10):
    out = []
    ix1, ix2 = 0, 0
    while True:
        # previosly we used P[ix]
        # p = P[ix]

        # now we use the softmax of the logits
        xenc = F.one_hot(torch.tensor([ix1, ix2]).to(device), num_classes = 27).float().to(device)
        xenc = xenc.view(-1, 27*2)
        
        logits = xenc @ W
        counts = torch.exp(logits)
        p = counts / counts.sum(dim = 1, keepdim = True)

        ix1 = ix2
        ix2 = torch.multinomial(p.to(device), num_samples = 1 , replacement = True).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break

    names.append("".join(out))
    
for name in names:
    print(name)
count_loss(names)

us.
lelin.
ra.
viia.
benon.
lirisin.
ejkylo.
elina.
ounnre.
igpe.
log Likelihood: -156.45962524414062
Negative log likelihood: 156.45962524414062
Normalized Negative log Likelihood: 2.793921947479248


# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set.

In [11]:
# prepare the dataset
from sklearn.model_selection import train_test_split

words_train, words_test = train_test_split(words, test_size=0.2, random_state=1234)
words_dev, words_test = train_test_split(words_test, test_size=0.5, random_state=1234)

x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []
for wgroup in [words_train, words_dev, words_test]:
    xs , ys = [], []
    for w in wgroup:
        # add start and end tokens
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
        
            xs.append([ix1, ix2])
            ys.append(ix3)

    xs = torch.tensor(xs, dtype=torch.int64)
    ys = torch.tensor(ys, dtype=torch.int64)

    if wgroup == words_train:
        x_train, y_train = xs, ys
    elif wgroup == words_dev:
        x_dev, y_dev = xs, ys
    else:
        x_test, y_test = xs, ys


In [12]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(x_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
    # add regularization
    # loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.1824
10: 2.4879
20: 2.3713
30: 2.3283
40: 2.3057
50: 2.2916
60: 2.2819
70: 2.2749
80: 2.2696
90: 2.2655
100: 2.2622
110: 2.2595
120: 2.2573
130: 2.2555
140: 2.2539
150: 2.2525
160: 2.2513
170: 2.2503
180: 2.2494
190: 2.2486


In [13]:
def MLP_loss(x, y, W):
    xenc = F.one_hot(x, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)

    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x)), y].log().mean()
    
    return loss.item()

In [14]:
print(f"Train Loss: {MLP_loss(x_train, y_train, W):.4f}")
print(f"Dev Loss: {MLP_loss(x_dev, y_dev, W):.4f}")
print(f"Test Loss: {MLP_loss(x_test, y_test, W):.4f}")

Train Loss: 2.2478
Dev Loss: 2.2538
Test Loss: 2.2511


# E03: use the dev set 
to tune the strength of smoothing (or regularization) for the trigram model

In [15]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(x_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
    # add regularization
    # loss += 0.05 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(x_dev, y_dev, W):.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: Train Loss: 4.1959 | Dev Loss 4.1741
10: Train Loss: 2.5150 | Dev Loss 2.5090
20: Train Loss: 2.3855 | Dev Loss 2.3831
30: Train Loss: 2.3360 | Dev Loss 2.3357
40: Train Loss: 2.3102 | Dev Loss 2.3110
50: Train Loss: 2.2947 | Dev Loss 2.2961
60: Train Loss: 2.2843 | Dev Loss 2.2861
70: Train Loss: 2.2769 | Dev Loss 2.2790
80: Train Loss: 2.2713 | Dev Loss 2.2738
90: Train Loss: 2.2670 | Dev Loss 2.2698
100: Train Loss: 2.2636 | Dev Loss 2.2666
110: Train Loss: 2.2608 | Dev Loss 2.2640
120: Train Loss: 2.2585 | Dev Loss 2.2619
130: Train Loss: 2.2566 | Dev Loss 2.2601
140: Train Loss: 2.2549 | Dev Loss 2.2587
150: Train Loss: 2.2535 | Dev Loss 2.2574
160: Train Loss: 2.2522 | Dev Loss 2.2563
170: Train Loss: 2.2511 | Dev Loss 2.2553
180: Train Loss: 2.2501 | Dev Loss 2.2544
190: Train Loss: 2.2492 | Dev Loss 2.2536


comment: no regularization is better

# E04: Rewrite the MLP model without creating one hot vectors
we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [16]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    # ====================
    # Previously: using onehot and multiplying by W 
    # xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    # xenc = xenc.view(-1, 27*2)
    # logits = xenc @ W
    # ====================

    # ====================
    # ✅ now: acess by xs indices directly
    logits = W[xs[:,0]] + W[xs[:,1] + 27]
    # ====================
    
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(xs)), ys].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.2724
10: 2.5831
20: 2.4545
30: 2.4076
40: 2.3848
50: 2.3720
60: 2.3644
70: 2.3595
80: 2.3564
90: 2.3544
100: 2.3530
110: 2.3520
120: 2.3514
130: 2.3509
140: 2.3506
150: 2.3504
160: 2.3503
170: 2.3501
180: 2.3501
190: 2.3500


# E05: look up and use F.cross_entropy instead
nn.functonal.cross_entropy() takes the logits and the target class as input and returns the cross entropy loss directly

In [17]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    logits = xenc @ W
    
    loss = torch.nn.functional.cross_entropy(logits, ys.to(device))
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.1529
10: 2.5952
20: 2.4588
30: 2.4093
40: 2.3856
50: 2.3725
60: 2.3647
70: 2.3598
80: 2.3566
90: 2.3545
100: 2.3531
110: 2.3521
120: 2.3514
130: 2.3510
140: 2.3507
150: 2.3504
160: 2.3503
170: 2.3501
180: 2.3501
190: 2.3500


# E06: meta-exercise! Think of a fun/interesting exercise and complete it
we will reimplment the MLP model using pytorch nn.Module

In [163]:
class MLP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(27*2, 27)
        # initialize weights with normal distribution with mean 0 and std 1
        torch.nn.init.normal_(self.fc1.weight, mean = 0, std = 1)
        
    def forward(self, xs):
        logits = W[xs[:,0]] + W[xs[:,1] + 27]
        return logits

In [175]:
model = MLP().to(device)
for k in range(200):
    # forward pass
    logits = model(xs)
    
    loss = torch.nn.functional.cross_entropy(logits, ys.to(device))
    # add regularization
    loss += 0.2 * model.fc1.weight.data.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    model.fc1.zero_grad() # reset the gradients of the layer
    loss.backward()

    # update weights
    with torch.no_grad():
        model.fc1.weight.data -= 50 * model.fc1.weight.grad # use linear.weight instead of W

0: 4.4746
10: 2.6566
20: 2.5239
30: 2.4767
40: 2.4558
50: 2.4462
60: 2.4419
70: 2.4405
80: 2.4408
90: 2.4420
100: 2.4438
110: 2.4460
120: 2.4485
130: 2.4511
140: 2.4539
150: 2.4567
160: 2.4596
170: 2.4625
180: 2.4654
190: 2.4683
