## Set 5
## 3. Word2Vec **Principles**

#### Preparation


The following function may be useful for loading the necessary data.

In [3]:
import requests

url_dict = {
    'dr_seuss.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/sets/set5/data/dr_seuss.txt',
    'P3CHelpers.py': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/sets/set5/code/P3CHelpers.py'
}

def download_file(file_path):
    url = url_dict[file_path]
    print('Start downloading...')
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024 * 1024):
                f.write(chunk)
    print('Complete')

In [4]:
# Download files
download_file('dr_seuss.txt')
download_file('P3CHelpers.py')

Start downloading...
Complete
Start downloading...
Complete


In [5]:
import numpy as np
from P3CHelpers import *
import torch
import torch.nn as nn
import torch.optim as optim

#### Problem D:
Fill in the generate_traindata and find_most_similar_pairs functions.

In [6]:
def get_word_repr(word_to_index, word):
    """
    Returns one-hot-encoded feature representation of the specified word given
    a dictionary mapping words to their one-hot-encoded index.

    Arguments:
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        word:          Word whose feature representation we wish to compute.

    Returns:
        feature_representation:     Feature representation of the passed-in word.
    """
    unique_words = word_to_index.keys()
    # Return a vector that's zero everywhere besides the index corresponding to <word>
    feature_representation = np.zeros(len(unique_words))
    feature_representation[word_to_index[word]] = 1
    return feature_representation

def generate_traindata(word_list, word_to_index, window_size=4):
    """
    Generates training data for Skipgram model.

    Arguments:
        word_list:     Sequential list of words (strings).
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        window_size:   Size of Skipgram window. Defaults to 2
                       (use the default value when running your code).

    Returns:
        (trainX, trainY):     A pair of matrices (trainX, trainY) containing training
                              points (one-hot-encoded vectors) and their corresponding output_word
                              (also one-hot-encoded vectors)

    """
    trainX = []
    trainY = []

    for i in range(len(word_list)):
      curr_X = get_word_repr(word_to_index, word_list[i])
      for j in range(max(0, i - window_size), min(len(word_list), i + window_size + 1)):
        curr_Y = get_word_repr(word_to_index, word_list[j])
        if i != j:
          trainX.append(curr_X)
          trainY.append(curr_Y)


    return np.array(trainX), np.array(trainY)

In [12]:
def find_most_similar_pairs(filename, num_latent_factors):
    """
    Find the most similar pairs from the word embeddings computed from
    a body of text

    Arguments:
        filename:           Text file to read and train embeddings from
        num_latent_factors: The number of latent factors / the size of the embedding
    """
    # Load in a list of words from the specified file; remove non-alphanumeric characters
    # and make all chars lowercase.
    sample_text = load_word_list(filename)

    # Create word dictionary
    word_to_index = generate_onehot_dict(sample_text)
    print("Textfile contains %s unique words"%len(word_to_index))
    # Create training data
    trainX, trainY = generate_traindata(sample_text, word_to_index)

    # define model
    model = nn.Sequential(
        nn.Linear(len(word_to_index), 10),
        nn.Linear(10, len(word_to_index)),
        nn.Softmax()
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    loss_fn = nn.CrossEntropyLoss()
    train_dataset = torch.from_numpy(np.array([trainX,trainY])).type(torch.FloatTensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    model.train()

    for epoch in range(20):
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()

        print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))

    model.eval()

    # set weights variable below
    weights = model.get_parameter('1.weight').detach()

    # Find and print most similar pairs
    similar_pairs = most_similar_pairs(weights, word_to_index)
    for pair in similar_pairs[:30]:
        print(pair)

### Problem E-H:
Run your model on drseuss.txt and answer questions from E through H.

In [15]:
find_most_similar_pairs('dr_seuss.txt', 10)

Textfile contains 308 unique words
Train Epoch: 1  Loss: 5.7300
Train Epoch: 2  Loss: 5.7299
Train Epoch: 3  Loss: 5.7298
Train Epoch: 4  Loss: 5.7298
Train Epoch: 5  Loss: 5.7297
Train Epoch: 6  Loss: 5.7296
Train Epoch: 7  Loss: 5.7295
Train Epoch: 8  Loss: 5.7294
Train Epoch: 9  Loss: 5.7293
Train Epoch: 10  Loss: 5.7292
Train Epoch: 11  Loss: 5.7290
Train Epoch: 12  Loss: 5.7288
Train Epoch: 13  Loss: 5.7286
Train Epoch: 14  Loss: 5.7284
Train Epoch: 15  Loss: 5.7281
Train Epoch: 16  Loss: 5.7278
Train Epoch: 17  Loss: 5.7275
Train Epoch: 18  Loss: 5.7271
Train Epoch: 19  Loss: 5.7266
Train Epoch: 20  Loss: 5.7260
Pair(yink, home), Similarity: 0.968291
Pair(home, yink), Similarity: 0.968291
Pair(ten, make), Similarity: 0.96137094
Pair(make, ten), Similarity: 0.96137094
Pair(old, by), Similarity: 0.9591521
Pair(by, old), Similarity: 0.9591521
Pair(read, walk), Similarity: 0.95793885
Pair(walk, read), Similarity: 0.95793885
Pair(eleven, left), Similarity: 0.95732147
Pair(left, eleven