# Problem 3
Authors: Sid Murching, Suraj Nair, Alex Cui

In [1]:
import numpy as np
from P3CHelpers import *
from keras.models import Sequential
from keras.layers.core import Dense, Activation

import sys

Using TensorFlow backend.


## 3D:
Fill in the `generate_traindata` and `find_most_similar_pairs` functions.

In [2]:
def get_word_repr(word_to_index, word):  # build the one-hot vector for a word
    """
    Build the one-hot encoding of a single word.

    Arguments:
        word_to_index: Dictionary mapping each word to its position in the
                       one-hot encoding of the corpus.

        word:          String whose one-hot encoding is requested.

    Returns:
        one_hot:       1-D NumPy array with one entry per key of <word_to_index>;
                       every entry is 0 except the one at word_to_index[word],
                       which is 1.

    Raises:
        KeyError:      If <word> is not a key of <word_to_index>.
    """
    vocab_size = len(word_to_index.keys())
    # Start from all zeros, then switch on the single slot that belongs to <word>.
    one_hot = np.zeros(vocab_size)
    position = word_to_index[word]
    one_hot[position] = 1
    return one_hot

def generate_traindata(word_list, word_to_index, window_size=4):
    """
    Generates training data for Skipgram model.

    Every word in <word_list> is paired with each other word that lies at most
    <window_size> positions before or after it. Pairs are emitted in order of
    the center word's position, then in order of the context word's position.

    Arguments:
        word_list:     Sequential list of words (strings).
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        window_size:   Size of Skipgram window.
                       (use the default value when running your code).

    Returns:
        (trainX, trainY):     A pair of float matrices, each of shape
                              (num_pairs, len(word_to_index)). Row i of trainX is the
                              one-hot encoding of a center word; row i of trainY is the
                              one-hot encoding of one word within <window_size> positions
                              of that center word. When no pair exists (e.g. empty
                              <word_list>) both matrices have shape (0, len(word_to_index)).

    Raises:
        KeyError:      If a word that takes part in a pair is missing from <word_to_index>.
    """
    num_words = len(word_list)
    vocab_size = len(word_to_index)

    # Collect integer indices first; the one-hot matrices are filled in one shot below
    # instead of allocating a fresh vector for every (center, context) pair.
    center_ids = []
    context_ids = []
    for center_pos in range(num_words):
        # Clip the window to the bounds of the word list.
        first = max(0, center_pos - window_size)
        stop = min(num_words, center_pos + window_size + 1)
        for context_pos in range(first, stop):
            if context_pos == center_pos:
                continue  # a word is never its own context
            center_ids.append(word_to_index[word_list[center_pos]])
            context_ids.append(word_to_index[word_list[context_pos]])

    num_pairs = len(center_ids)
    rows = np.arange(num_pairs)
    trainX = np.zeros((num_pairs, vocab_size))
    trainY = np.zeros((num_pairs, vocab_size))
    # Explicit integer dtype so that empty index lists are still valid indices.
    trainX[rows, np.asarray(center_ids, dtype=np.intp)] = 1
    trainY[rows, np.asarray(context_ids, dtype=np.intp)] = 1
    return (trainX, trainY)

In [3]:
def find_most_similar_pairs(filename, num_latent_factors):
    """
    Train a Skipgram-style network on a text file and print the first 30
    word pairs reported by most_similar_pairs for the learned embeddings.

    Arguments:
        filename:           Text file to read and train embeddings from
        num_latent_factors: The number of latent factors / the size of the embedding
                            (number of units in the hidden layer).

    Returns:
        None. Results are printed.
    """
    # Read the corpus as a list of words. Per the original note, the helper is
    # expected to strip non-alphanumeric characters and lowercase everything
    # (implementation not visible here).
    words = load_word_list(filename)
    print('sample_text length', len(words))

    # Map each unique word to its one-hot index, then build (center, context)
    # training pairs with the default window size.
    onehot_index = generate_onehot_dict(words)
    trainX, trainY = generate_traindata(words, onehot_index)
    print('trainX.shape = ', trainX.shape, 'trainY.shape = ', trainY.shape)

    # One input/output unit per unique word in the text.
    vocab_size = len(onehot_index)

    # Network: one-hot input -> linear hidden layer of num_latent_factors units
    # -> vocab_size outputs -> softmax over the vocabulary.
    model = Sequential()
    model.add(Dense(num_latent_factors, input_dim=vocab_size))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    # Multi-class classification: predict which word appears in the window.
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(trainX, trainY)

    # get_weights()[0] is the weight matrix of a Dense layer; get_weights()[1] is its bias.
    hidden_layer, output_layer = model.layers[0], model.layers[1]
    embedding = hidden_layer.get_weights()[0]
    print('layer_0 dim = ', embedding.shape)
    print('layer_1 dim = ', output_layer.get_weights()[0].shape)

    # The hidden-layer weight matrix is used as the word embeddings:
    # report the first 30 pairs the helper returns.
    for pair in most_similar_pairs(embedding, onehot_index)[:30]:
        print(pair)

## 3G:
Run the function below and report your results for dr_seuss.txt.

In [5]:
# Train a 10-factor embedding on the Dr. Seuss text and print the first 30
# pairs returned by most_similar_pairs.
find_most_similar_pairs('data/dr_seuss.txt', 10)

sample_text length 2071
trainX.shape =  (16548, 308) trainY.shape =  (16548, 308)
Epoch 1/1
layer_0 dim =  (308, 10)
layer_1 dim =  (10, 308)
Pair(them, would), Similarity: 0.9628089
Pair(would, them), Similarity: 0.9628089
Pair(car, them), Similarity: 0.95937026
Pair(like, or), Similarity: 0.957788
Pair(or, like), Similarity: 0.957788
Pair(not, them), Similarity: 0.95743936
Pair(eat, would), Similarity: 0.9547398
Pair(a, eat), Similarity: 0.95205164
Pair(in, not), Similarity: 0.9470372
Pair(i, or), Similarity: 0.9458327
Pair(ned, dear), Similarity: 0.9441935
Pair(dear, ned), Similarity: 0.9441935
Pair(do, a), Similarity: 0.9412332
Pair(eleven, boat), Similarity: 0.93666583
Pair(boat, eleven), Similarity: 0.93666583
Pair(red, oh), Similarity: 0.93665606
Pair(oh, red), Similarity: 0.93665606
Pair(could, in), Similarity: 0.9363972
Pair(things, sing), Similarity: 0.9356929
Pair(sing, things), Similarity: 0.9356929
Pair(open, cans), Similarity: 0.9347308
Pair(cans, open), Similarity: 0.934