In [1]:
#Import libraries

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.linalg
import numpy as np

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu

Prepare data

In [3]:
# CBOW is a window view; we are trying to infer the word in the middle.
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

raw_text= """Long Short-Term Memory (LSTM) is a recurrent neural network (RNN) architecture that has been designed
to address the vanishing and exploding gradient problems of conventional RNNs. Unlike feedforward neural networks,
RNNs have cyclic connections making them powerful for modeling sequences. 
They have been successfully used for sequence labeling and sequence prediction tasks,
such as handwriting recognition, language modeling, phonetic labeling of acoustic frames. However, in contrast to the deep neural
networks, the use of RNNs in speech recognition has been limited to phone recognition in small scale tasks. 
In this paper, we present novel LSTM based RNN architectures which make more effective
use of model parameters to train acoustic models for large vocabulary speech recognition. 
We train and compare LSTM, RNN and DNN models at various numbers of parameters and configurations.
We show that LSTM models converge quickly and give state of the art speech recognition performance for relatively small sized models.""".split()

# By deriving a set from "raw_text", we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

# Basic Tokenizer
word_to_ix = {word: i for i, word in enumerate(vocab)}

print('Length of raw text :',len(raw_text))
print('Length of Vocabulary :',vocab_size)
print('Vocabulary words :',vocab)

Length of raw text : 152
Length of Vocabulary : 106
Vocabulary words : {'modeling', 'vocabulary', 'which', 'speech', 'tasks,', 'this', 'art', 'performance', 'RNNs.', '(LSTM)', 'quickly', 'has', 'DNN', 'sequences.', 'train', 'They', 'Long', 'scale', 'neural', 'use', 'network', 'more', 'contrast', 'recognition.', 'networks,', 'phone', 'relatively', 'novel', 'numbers', 'a', 'the', 'model', 'However,', 'for', 'deep', 'at', 'modeling,', 'powerful', 'address', 'used', 'such', 'architectures', 'based', 'LSTM,', 'we', 'Memory', 'We', 'recurrent', 'making', '(RNN)', 'vanishing', 'RNNs', 'gradient', 'have', 'labeling', 'phonetic', 'Unlike', 'LSTM', 'models', 'prediction', 'frames.', 'handwriting', 'small', 'been', 'in', 'recognition', 'connections', 'as', 'Short-Term', 'acoustic', 'exploding', 'show', 'effective', 'converge', 'parameters', 'make', 'models.', 'feedforward', 'large', 'language', 'architecture', 'that', 'and', 'present', 'compare', 'to', 'configurations.', 'conventional', 'of', 'gi

In [30]:
print(vocab)

{'modeling', 'vocabulary', 'which', 'speech', 'tasks,', 'this', 'art', 'performance', 'RNNs.', '(LSTM)', 'quickly', 'has', 'DNN', 'sequences.', 'train', 'They', 'Long', 'scale', 'neural', 'use', 'network', 'more', 'contrast', 'recognition.', 'networks,', 'phone', 'relatively', 'novel', 'numbers', 'a', 'the', 'model', 'However,', 'for', 'deep', 'at', 'modeling,', 'powerful', 'address', 'used', 'such', 'architectures', 'based', 'LSTM,', 'we', 'Memory', 'We', 'recurrent', 'making', '(RNN)', 'vanishing', 'RNNs', 'gradient', 'have', 'labeling', 'phonetic', 'Unlike', 'LSTM', 'models', 'prediction', 'frames.', 'handwriting', 'small', 'been', 'in', 'recognition', 'connections', 'as', 'Short-Term', 'acoustic', 'exploding', 'show', 'effective', 'converge', 'parameters', 'make', 'models.', 'feedforward', 'large', 'language', 'architecture', 'that', 'and', 'present', 'compare', 'to', 'configurations.', 'conventional', 'of', 'give', 'sized', 'paper,', 'RNN', 'tasks.', 'In', 'state', 'cyclic', 'prob

In [31]:
# list out keys and values separately
key_list = list(word_to_ix.keys())
val_list = list(word_to_ix.values())

creating dataset

In [32]:
# Now lets create a "dataset"
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = []
    for j in range(CONTEXT_SIZE, 0, -1):
        context.append(raw_text[i - j])

    for j in range(1, CONTEXT_SIZE + 1):
        context.append(raw_text[i + j])
        
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['Long', 'Short-Term', '(LSTM)', 'is'], 'Memory'), (['Short-Term', 'Memory', 'is', 'a'], '(LSTM)'), (['Memory', '(LSTM)', 'a', 'recurrent'], 'is'), (['(LSTM)', 'is', 'recurrent', 'neural'], 'a'), (['is', 'a', 'neural', 'network'], 'recurrent')]


create CBOW model

In [33]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embed_dim, context, hidden_size):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Sequential(
            nn.Linear(context*embed_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
            nn.LogSoftmax(dim = -1)
        )
        
    def forward(self, inputs):
#         print(inputs.shape)
#         print(inputs)
        out = self.embedding(inputs)
#         print(out.shape)
        out = out.view(1, -1)
#         print(out.shape)
        out = self.linear(out)
#         print(out.shape)
        return out
    
    # This is what we are actually interested on
    def get_word_vector(self, word):
        out = self.embedding(word)
        return out

In [34]:
VOCAB_SIZE = len(vocab)
EMBEDD_DIM = 10
BATCH_SIZE = 6
FULL_CONTEXT_SIZE = CONTEXT_SIZE * 2
HIDDEN_SIZE = 256

example_tensor = torch.randint(0, VOCAB_SIZE, [BATCH_SIZE, FULL_CONTEXT_SIZE])
print(example_tensor)

tensor([[ 76,  42,  89,  22],
        [ 38,  41,  13, 100],
        [  9,  58,  29,  66],
        [  2,  22, 102,  20],
        [ 45,  25,  10, 100],
        [ 22,  58,  10,  83]])


In [35]:
CBOW_embedding = nn.Embedding(VOCAB_SIZE, EMBEDD_DIM)
print(example_tensor.shape)
example_result = CBOW_embedding(example_tensor)
# Now we have a representation of the words in a vector of EMBEDD_DIM Dimensions
print(example_result.shape)
# example_result = torch.flatten(example_result, start_dim=1)
example_result = example_result.view(BATCH_SIZE, -1)
print(example_result.shape)

torch.Size([6, 4])
torch.Size([6, 4, 10])
torch.Size([6, 40])


In [36]:
print("input shape: ", EMBEDD_DIM * FULL_CONTEXT_SIZE)
print("output shape: ", HIDDEN_SIZE)
CBOW_hidden = nn.Linear(EMBEDD_DIM * FULL_CONTEXT_SIZE, HIDDEN_SIZE)
CBOW_hidden_relu = nn.ReLU()
example_result = CBOW_hidden(example_result)
example_result = CBOW_hidden_relu(example_result)
print(example_result.shape)

input shape:  40
output shape:  256
torch.Size([6, 256])


In [37]:
CBOW_output = nn.Linear(HIDDEN_SIZE, VOCAB_SIZE)
CBOW_output_soft = nn.LogSoftmax(dim = -1)
example_result = CBOW_output(example_result)
example_result = CBOW_output_soft(example_result)
print(example_result.shape)

torch.Size([6, 106])


In [38]:
print(example_result[0].argmax(-1))
print(key_list[val_list.index(example_result[0].argmax(-1))])
print(example_result[0])
print(example_result[1].argmax(-1))
print(key_list[val_list.index(example_result[1].argmax(-1))])
print(example_result[1])
# print(example_result[2].argmax(-1))
# print(example_result[3].argmax(-1))
# print(example_result[4].argmax(-1))

tensor(40)
such
tensor([-4.5712, -4.4901, -4.9160, -4.6440, -4.9757, -4.7100, -5.1163, -4.5753,
        -4.7635, -4.5510, -4.7088, -4.5678, -4.8451, -4.4883, -4.5544, -4.3537,
        -4.4998, -4.3164, -4.3987, -4.6746, -4.5824, -4.8035, -4.4392, -4.7948,
        -4.8199, -5.0696, -5.1145, -4.5310, -4.7439, -4.7126, -4.7040, -4.7572,
        -4.8012, -5.0589, -4.7325, -4.8628, -4.6117, -4.6089, -5.0876, -4.7128,
        -4.2631, -4.9567, -4.6783, -4.5243, -4.5995, -4.6649, -4.6193, -4.7021,
        -4.3589, -4.6332, -4.6557, -4.3562, -4.8026, -4.6214, -4.9967, -4.2872,
        -4.7037, -4.6921, -4.5436, -4.7300, -4.2698, -4.7494, -4.7447, -4.5764,
        -4.5991, -4.7304, -5.0402, -4.3886, -4.5067, -4.7539, -4.7784, -4.8041,
        -4.6357, -4.4745, -4.5036, -4.6260, -4.5046, -4.8630, -4.6755, -4.7636,
        -4.6512, -4.3036, -4.6592, -4.5385, -4.7848, -4.9297, -4.8805, -4.7054,
        -4.6074, -4.5297, -5.1102, -4.7816, -4.6892, -4.8389, -4.4032, -4.8663,
        -4.6802, -4.8109

In [50]:
# Simple helper method to transform the context to the expected int vector - tensor

def make_context_vector(context, word_to_ix, debug=False):
    if debug:
      print(context)
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

make_context_vector(data[0][0], word_to_ix, debug=True)

['Long', 'Short-Term', '(LSTM)', 'is']


tensor([ 16,  68,   9, 102])

In [40]:
def train(model, epochs, data, optimizer, loss_fn):
    model.train()
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for context, target in data:

            # Prepare inputs and targets 
            context_idxs = make_context_vector(context, word_to_ix)
            context_idxs = context_idxs.to(device)
            target_id = make_context_vector([target], word_to_ix)
            target_id = target_id.to(device)

            # Do not accumulate 
            model.zero_grad()

            # Step 3. Run the forward pass
            log_probs = model(context_idxs)
    #         break

            # Step 4. Compute your loss function.
            loss = loss_fn(log_probs, target_id)

    #         loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

            # Step 5. Do the backward pass and update the gradient
            loss.backward()
            optimizer.step()

            # Get the Python number from a 1-element Tensor by calling tensor.item()
            total_loss += loss.item()
        losses.append(total_loss)
    return losses

In [49]:
VOCAB_SIZE = len(vocab)
EMBEDD_DIM = 10
BATCH_SIZE = 6
FULL_CONTEXT_SIZE = CONTEXT_SIZE * 2
HIDDEN_SIZE = 256

loss_function = nn.NLLLoss() # Because we are using Log_softmax
model = CBOW(vocab_size, EMBEDD_DIM, FULL_CONTEXT_SIZE, HIDDEN_SIZE)
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

losses = train(model, 100, data, optimizer, loss_function)
model.eval()

print(losses)  # The loss decreased every iteration over the training data!

[694.9155240058899, 687.7885828018188, 680.7643623352051, 673.8368592262268, 666.9971764087677, 660.2417123317719, 653.5664665699005, 646.9686319828033, 640.4414632320404, 633.981306552887, 627.5825419425964, 621.2449505329132, 614.9628708362579, 608.7338593006134, 602.5551338195801, 596.4200174808502, 590.3236186504364, 584.2661881446838, 578.2428784370422, 572.2481138706207, 566.2771768569946, 560.3262183666229, 554.3945150375366, 548.4782063961029, 542.5740841627121, 536.6808401346207, 530.794450044632, 524.9121680259705, 519.0319414138794, 513.152193903923, 507.2723559141159, 501.38893365859985, 495.5020614862442, 489.6112332344055, 483.7154550552368, 477.81189143657684, 471.9032258987427, 465.9856730699539, 460.0601872205734, 454.12520921230316, 448.1791104078293, 442.2268294095993, 436.26587212085724, 430.2998768091202, 424.3313487172127, 418.3587952852249, 412.3818310499191, 406.40055698156357, 400.41673988103867, 394.43248891830444, 388.44926220178604, 382.46931463479996, 376.4

In [42]:
# list out keys and values separately
key_list = list(word_to_ix.keys())
val_list = list(word_to_ix.values())

In [43]:
def similarity_cbow(word_1, word_2):
    
    # test word similarity
    print(word_1)
    print(word_2)
    w1_id = torch.tensor(word_to_ix[word_1], dtype=torch.long)
    w2_id = torch.tensor(word_to_ix[word_2], dtype=torch.long)
    w1_id = w1_id.to(device)
    w2_id = w2_id.to(device)
    
    word_1_vec = model.get_word_vector(w1_id)
    word_2_vec = model.get_word_vector(w2_id)
    
    # The norm of a vector (1D-matrix) is the square root of the sum of all the squared values within the vector.
    print(math.sqrt(torch.square(word_1_vec).sum()))    
    print(torch.linalg.norm(word_1_vec))
    print(torch.linalg.norm(word_2_vec))
    print(word_1_vec.dot(word_2_vec))
    
    word_distance = torch.linalg.norm(word_1_vec - word_2_vec)
    print("Distance between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_distance))
    word_similarity = (word_1_vec.dot(word_2_vec) / (torch.linalg.norm(word_1_vec) * torch.linalg.norm(word_2_vec)))
    print("Similarity between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_similarity))

In [44]:
similarity_cbow("neural", "network")

neural
network
3.3346826842191675
tensor(3.3347, grad_fn=<LinalgVectorNormBackward0>)
tensor(2.4982, grad_fn=<LinalgVectorNormBackward0>)
tensor(3.7424, grad_fn=<DotBackward0>)
Distance between 'neural' & 'network' : 3.1427
Similarity between 'neural' & 'network' : 0.4492


Predict middle word

In [21]:
def predict_middle_word(prev_words, post_words):
    prev_words = prev_words.split()
    post_words = post_words.split()

    input_words= make_context_vector(prev_words + post_words, word_to_ix)
    input_words = input_words.to(device)
    output = model(input_words)
    out_ind = output.argmax(1)
#     print(word_to_ix)
#     out_word = word_to_ix.itos[out_ind.item()]
    out_word = key_list[val_list.index(out_ind.item())]
    print(prev_words, out_word, post_words)

In [22]:
predict_middle_word("a recurrent", "network is")
predict_middle_word("LSTM is", "recurrent neural")

['a', 'recurrent'] neural ['network', 'is']
['LSTM', 'is'] a ['recurrent', 'neural']


Exercises

Skim Gram model

In [45]:
class SkipGram(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()
        
        self.embed = nn.Embedding(n_vocab, n_embed)
        self.output = nn.Linear(n_embed, n_vocab)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.vocab_size = n_vocab
    
    def forward(self, x):
        x = self.embed(x)
        scores = self.output(x)
        out = scores.view(1, -1)
        log_ps = self.log_softmax(out)
        return log_ps

    # This is what we are actually interested on
    def get_word_vector_skipgram(self, word):
        out = self.embed(word)
        return out

In [46]:
losses = []
embedding_dim=10 

model_skipgram = SkipGram(len(vocab), embedding_dim).to(device)
model_skipgram.eval
criterion = nn.NLLLoss()
optimizer = optim.Adam(model_skipgram.parameters(), lr=0.01)
losses    = train(model_skipgram, 100, data, optimizer, loss_function)

print('Losses: ',losses)
print('Mean Loss: ',np.mean(losses))

Losses:  [946.5228967666626, 830.6464099884033, 745.1455674171448, 666.3138654232025, 595.9658558368683, 535.750091791153, 486.99715662002563, 449.28949987888336, 421.2720048427582, 401.42221039533615, 387.8040924668312, 378.0851792693138, 370.94057777523994, 365.5768465101719, 361.4219214916229, 358.08164082467556, 355.31496150791645, 353.00439324975014, 351.06703843176365, 349.87192968279123, 350.4710729953367, 351.9245348870172, 348.5440613548835, 346.98533415829297, 344.98445374681614, 344.2727210158482, 342.95775878988206, 343.4171077106148, 342.2626666156575, 344.0949020028056, 344.78243397455645, 343.2906290794199, 340.7552293064073, 340.228663223912, 338.06291420874186, 337.7984118834138, 336.84863575559575, 336.6364579007495, 336.06705347518437, 336.0804946925491, 335.7842615917325, 336.71470846934244, 337.79788395692594, 343.9487183544254, 338.9314598895144, 338.2907630307991, 335.60152107883914, 335.72402138506277, 333.93825925395777, 333.62510053787355, 332.9628950543283, 3

In [47]:
model_skipgram.eval

<bound method Module.eval of SkipGram(
  (embed): Embedding(106, 10)
  (output): Linear(in_features=10, out_features=106, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)>

In [57]:
def similarity_skipgram(word_1, word_2):
    
    # test word similarity
    print(word_1)
    print(word_2)
    w1_id = torch.tensor(word_to_ix[word_1], dtype=torch.long)
    w2_id = torch.tensor(word_to_ix[word_2], dtype=torch.long)
    w1_id = w1_id.to(device)
    w2_id = w2_id.to(device)
    
    word_1_vec = model_skipgram.get_word_vector_skipgram(w1_id)
    word_2_vec = model_skipgram.get_word_vector_skipgram(w2_id)
    
    # The norm of a vector (1D-matrix) is the square root of the sum of all the squared values within the vector.
    print(math.sqrt(torch.square(word_1_vec).sum()))    
    print(torch.linalg.norm(word_1_vec))
    print(torch.linalg.norm(word_2_vec))
    print(word_1_vec.dot(word_2_vec))
    
    word_distance = torch.linalg.norm(word_1_vec - word_2_vec)
    print("Distance between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_distance))
    word_similarity = (word_1_vec.dot(word_2_vec) / (torch.linalg.norm(word_1_vec) * torch.linalg.norm(word_2_vec)))
    print("Similarity between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_similarity))

In [58]:
similarity_skipgram("neural", "network")

neural
network
3.4954553799166543
tensor(3.4955, grad_fn=<LinalgVectorNormBackward0>)
tensor(5.7821, grad_fn=<LinalgVectorNormBackward0>)
tensor(-5.0769, grad_fn=<DotBackward0>)
Distance between 'neural' & 'network' : 7.4703
Similarity between 'neural' & 'network' : -0.2512
