In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import numpy as np
import re
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

## P1) Analyse the dataset

In [2]:
corpus = [line.strip() for line in open('./TheTimeMachine.txt') if line.strip()][2:]
print("\n".join(corpus[:10]))

# Tokenize the sentences into words. All lower case. Ignore punctuation.
corpus = [re.sub('[^A-Za-z0-9]+', ' ', line).lower() for line in corpus]
corpus = [re.sub(' +', ' ', line) for line in corpus]
corpus = [word for line in corpus for word in line.split()]

The Time Traveller (for so it will be convenient to speak of him)
was expounding a recondite matter to us. His grey eyes shone and
twinkled, and his usually pale face was flushed and animated. The
fire burned brightly, and the soft radiance of the incandescent
lights in the lilies of silver caught the bubbles that flashed and
passed in our glasses. Our chairs, being his patents, embraced and
caressed us rather than submitted to be sat upon, and there was that
luxurious after-dinner atmosphere when thought roams gracefully
free of the trammels of precision. And he put it to us in this
way--marking the points with a lean forefinger--as we sat and lazily


In [3]:
vocab_size = 3000
tkn_counter = Counter([word for word in corpus])
vocab = {word: idx for idx, (word, _) in enumerate(tkn_counter.most_common(vocab_size))}
vocab["/UNK"] = len(vocab)

  * Found 3001 unique words in the provided corpus (of size 32767).
  * Created vocabulary from corpus.
  * The 10 most common words are the following:
[('the', 2260), ('i', 1266), ('and', 1245), ('of', 1155), ('a', 816), ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440)]


In [4]:
class TextCorpusDataset(Dataset):
    def __init__(self, corpus, vocab, snippet_len=50):
        super().__init__()
        self.corpus = corpus
        self.snippet_len = snippet_len

        # Vocabulary (word-to-index mapping)
        self.vocab = vocab

        # Inverse vocabulary (index-to-word mapping)
        self.inv_vocab = {idx: word for word, idx in self.vocab.items()}

    def convert2idx(self, word_sequence):
        return [self.vocab[word if word in self.vocab else "/UNK"] for word in word_sequence]

    def convert2words(self, idx_sequence):
        return [self.inv_vocab[idx] for idx in idx_sequence]

    def __len__(self):
        return len(self.corpus) - self.snippet_len

    def __getitem__(self, idx):
        snippet = self.corpus[idx:idx+self.snippet_len]
        snippet = torch.tensor(self.convert2idx(snippet))
        return snippet

# Test dataset function
dataset = TextCorpusDataset(corpus, vocab, snippet_len=50)
snippet = dataset[1234]
print("\nRandom snippet from the corpus.")
print("  * Token IDS:\t", snippet)
print("  * Words:\t\t", " ".join([dataset.inv_vocab[i] for i in snippet.tolist()]))


Random snippet from the corpus.
  * Token IDS:	 tensor([ 312,   54,   27,   42,  600,    3, 1472,  110,   15,  108,  439,    3,
          18,  108,   72,  130,    4,  849,   51,   52,  370,  187,    3, 1472,
        2275,  231,  182,    0,  235,   17,    4, 1473,   64,   37,  371,  151,
         130,    0,  849,    7,   20, 2276,   26,  188,  219,   63,  140, 1462,
           7,    4])
  * Words:		 course we have no means of staying back for any length of time any more than a savage or an animal has of staying six feet above the ground but a civilized man is better off than the savage in this respect he can go up against gravitation in a


## P2) The CBOW Embeddings

In [5]:
# Define the Word2Vec CBOW model
class Word2Vec_CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(???)
        self.linear = nn.Linear(???)

    def forward(self, context):
        context_embeds = ???
        avg_context_embeds = ???
        logits = ???
        return logits

# Hyperparameters
context_len = 2
vocab_size = len(dataset.vocab)
embedding_dim = 128
learning_rate = 5e-3
num_epochs = 100

# Create DataLoader for batch training
dataset = TextCorpusDataset(corpus, vocab, snippet_len=2*context_len + 1)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Create and train the CBOW model
w2v = Word2Vec_CBOW(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(w2v.parameters(), lr=learning_rate)
context_idx = [idx for idx in range(2*context_len+1) if idx != context_len]
for epoch in range(num_epochs):
    total_loss = 0
    for snippet in train_loader:
        context = snippet[:, context_idx].to(device)
        target = snippet[:, context_len].to(device)
        logits = w2v(context)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}')


Epoch [1/100], Loss: 6.4501
Epoch [2/100], Loss: 4.4778
Epoch [3/100], Loss: 3.5596
Epoch [4/100], Loss: 2.9592
Epoch [5/100], Loss: 2.5170
Epoch [6/100], Loss: 2.1958
Epoch [7/100], Loss: 1.9744
Epoch [8/100], Loss: 1.8043
Epoch [9/100], Loss: 1.6863
Epoch [10/100], Loss: 1.5872
Epoch [11/100], Loss: 1.5175
Epoch [12/100], Loss: 1.4666
Epoch [13/100], Loss: 1.4177
Epoch [14/100], Loss: 1.3788
Epoch [15/100], Loss: 1.3430
Epoch [16/100], Loss: 1.3044
Epoch [17/100], Loss: 1.2911
Epoch [18/100], Loss: 1.2571
Epoch [19/100], Loss: 1.2416
Epoch [20/100], Loss: 1.2216
Epoch [21/100], Loss: 1.2056
Epoch [22/100], Loss: 1.1903
Epoch [23/100], Loss: 1.1848
Epoch [24/100], Loss: 1.1647
Epoch [25/100], Loss: 1.1616
Epoch [26/100], Loss: 1.1508
Epoch [27/100], Loss: 1.1452
Epoch [28/100], Loss: 1.1376
Epoch [29/100], Loss: 1.1248
Epoch [30/100], Loss: 1.1130
Epoch [31/100], Loss: 1.1051
Epoch [32/100], Loss: 1.1079
Epoch [33/100], Loss: 1.0963
Epoch [34/100], Loss: 1.0889
Epoch [35/100], Loss: 1

In [1]:
# Extract the word embeddings to analyze it
word_embeddings = w2v.embeddings.weight.detach().cpu().numpy()

NameError: name 'w2v' is not defined

## P3) Next-word prediction using CBOW embeddings

In [7]:
class NextWordPredictionMLP(nn.Module):
    def __init__(self, num_context, embedding, depth=3, hidden_dim=50):
        super().__init__()
        self.embedding = embedding

        self.mlp = nn.Sequential()
        for d in range(depth):
            if d == 0:
                in_chans = num_context * embedding.embedding_dim
                out_chans = hidden_dim
            elif d == depth - 1:
                in_chans = hidden_dim
                out_chans = embedding.num_embeddings
            else:
                in_chans = out_chans = hidden_dim

            self.mlp.add_module(f'linear{d}', nn.Linear(in_chans, out_chans))
            self.mlp.add_module(f'bn{d}', nn.BatchNorm1d(out_chans))
            self.mlp.add_module(f'act{d}', nn.ReLU(inplace=True))

    def forward(self, context):
        emb = self.embedding(context).flatten(1)
        return self.mlp(emb)

In [None]:
def train_one_epoch(model, loss_fcn, optimizer, dataloader):
    total_loss = 0.
    for it, batch in enumerate(dataloader):
        batch_past = batch[:, :T].to(device)
        batch_now = batch[:, -1].to(device)

        pred_now = model(batch_past)
        l = loss_fcn(pred_now, batch_now)
        total_loss += l.item()

        optimizer.zero_grad()
        l.backward()
        optimizer.step()

    total_loss = total_loss / len(dataloader)
    return total_loss


def fit(model, loss_fcn, dataloader, optimizer, epochs=30):
    """
    Helper function to train a model.
    """
    for ep in range(epochs):
        loss = train_one_epoch(model, loss_fcn, optimizer, dataloader)
        print(f"[Ep{ep:03}] | Loss {loss:.3f} \t Perplexity  {np.exp(loss):.3f}")


T = 10
dataset = TextCorpusDataset(corpus, vocab, snippet_len=T+1)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

model = NextWordPredictionMLP(T, w2v.embeddings, depth=2, hidden_dim=50).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_fcn = F.cross_entropy

fit(model, loss_fcn, dataloader, opt, epochs=100)

[Ep000] | Loss 7.657 	 Perplexity  2115.721
[Ep001] | Loss 6.586 	 Perplexity  724.957
[Ep002] | Loss 5.910 	 Perplexity  368.606
[Ep003] | Loss 5.414 	 Perplexity  224.628
[Ep004] | Loss 5.046 	 Perplexity  155.433
[Ep005] | Loss 4.758 	 Perplexity  116.530
[Ep006] | Loss 4.528 	 Perplexity  92.540
[Ep007] | Loss 4.347 	 Perplexity  77.284
[Ep008] | Loss 4.194 	 Perplexity  66.294
[Ep009] | Loss 4.054 	 Perplexity  57.600
[Ep010] | Loss 3.937 	 Perplexity  51.280
[Ep011] | Loss 3.828 	 Perplexity  45.967
[Ep012] | Loss 3.729 	 Perplexity  41.624
[Ep013] | Loss 3.646 	 Perplexity  38.327
[Ep014] | Loss 3.564 	 Perplexity  35.312
[Ep015] | Loss 3.475 	 Perplexity  32.310
[Ep016] | Loss 3.407 	 Perplexity  30.170
[Ep017] | Loss 3.348 	 Perplexity  28.432
[Ep018] | Loss 3.271 	 Perplexity  26.338
[Ep019] | Loss 3.220 	 Perplexity  25.038
[Ep020] | Loss 3.162 	 Perplexity  23.609
[Ep021] | Loss 3.104 	 Perplexity  22.288
[Ep022] | Loss 3.044 	 Perplexity  20.990
[Ep023] | Loss 2.999 	 Perp

In [None]:
with torch.no_grad():
    prompt = " ".join(corpus[:10])
    print("PROMPT:", prompt)
    context = torch.tensor([dataset.vocab[word] for word in prompt.split()]).to(device)
    context = context.unsqueeze(0)  # Reshape it into a batch of 1
    model.train(False)
    for _ in range(100):
        next_word_logits = model(context)
        next_word_idx = next_word_logits[:, :-1].argmax(dim=1)
        next_word = dataset.inv_vocab[next_word_idx[0].item()]
        context = torch.cat((context[:, 1:], next_word_idx.unsqueeze(1)), 1)
        print(next_word, end=' ')