In [5]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# Helper libraries
from rnn.w266_common import utils, vocabulary, tf_embed_viz

import rnn.rnnlm as rnnlm;reload(rnnlm)
import rnn.rnnlm_test as rnnlm_test;reload(rnnlm_test)
from nltk.corpus import PlaintextCorpusReader

## RNN

In [6]:
reload(rnnlm)

TF_GRAPHDIR = "../pretrained/rnn_graph"

# Clear old log directory.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

lm = rnnlm.RNNLM(V=10000, H=200, num_layers=2)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildSamplerGraph()

summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)

reload(rnnlm); reload(rnnlm_test)
utils.run_tests(rnnlm_test, ["TestRNNLMCore", "TestRNNLMTrain", "TestRNNLMSampler"])

test_shapes_embed (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_output (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_recurrent (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_train (rnn.rnnlm_test.TestRNNLMTrain) ... ok
test_shapes_sample (rnn.rnnlm_test.TestRNNLMSampler) ... ok

----------------------------------------------------------------------
Ran 5 tests in 2.232s

OK


In [7]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    assert(learning_rate is not None)
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
        loss = lm.train_loss_
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation

    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {
            lm.input_w_: w,
            lm.target_y_: y,
            lm.initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        ops = [loss, lm.final_h_, train_op]        
        #### YOUR CODE HERE ####
        # session.run(...) the ops with the feed_dict constructed above.
        # Ensure "cost" becomes the value of "loss".
        # Hint: see "ops" for other variables that need updating in this loop.
        
        cost, h, _ = session.run([loss, lm.final_h_ ,train_op], feed_dict=feed_dict)
        
        #### END(YOUR CODE) ####
        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} words at {:.1f} wps, loss = {:.3f}".format(
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

def score_dataset(lm, session, ids, name="Data"):
    # For scoring, we can use larger batches to speed things up.
    bi = utils.rnnlm_batch_generator(ids, batch_size=100, max_time=100)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=0.0, train=False, 
                     verbose=False, tick_s=3600)
    print("{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(name, cost, np.exp(cost)))
    return cost

In [10]:
# Load the dataset
#V = 10000
#vocab, train_ids, test_ids = utils.load_corpus("brown", split=0.8, V=V, shuffle=42)
corpus_root = '../datasets/training-monolingual.tokenized.shuffled'
wordlists = PlaintextCorpusReader(corpus_root, 'news.en-00001.*')
print(wordlists.fileids())
wordlists.words()


V = 10000
vocab, train_ids, test_ids = utils.load_corpus(wordlists, split=0.8, V=V, shuffle=42)

['news.en-00001-of-00100']
Vocabulary: 10,000 types
Loaded 322,832 sentences (8.16567e+06 tokens)
Training set: 258,265 sentences (6,532,540 tokens)
Test set: 64,567 sentences (1,633,132 tokens)


In [11]:
# Training parameters
max_time = 25
batch_size = 100
learning_rate = 0.01
num_epochs = 10

# Model parameters
model_params = dict(V=vocab.size, 
                    H=200, 
                    softmax_ns=200,
                    num_layers=2)

TF_SAVEDIR = "./rnn/rnn_model"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [12]:
# Will print status every this many seconds
print_interval = 5

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))
        run_epoch(lm, session, bi, learning_rate=learning_rate, train=True, verbose=False, tick_s=10)
        print("[epoch {:d}] Completed in {:s}".format(epoch, utils.pretty_timedelta(since=t0_epoch)))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        #print("[epoch {:d}]".format(epoch), end=" ")
        #score_dataset(lm, session, train_ids, name="Train set")
        #print("[epoch {:d}]".format(epoch), end=" ")
        #score_dataset(lm, session, test_ids, name="Test set")
        print("")
    
    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:13:43

[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:15:41

[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:12:47

[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:11:57

[epoch 5] Starting epoch 5
[epoch 5] Completed in 0:11:37

[epoch 6] Starting epoch 6
[epoch 6] Completed in 0:11:17

[epoch 7] Starting epoch 7
[epoch 7] Completed in 0:10:56

[epoch 8] Starting epoch 8
[epoch 8] Completed in 0:10:59

[epoch 9] Starting epoch 9
[epoch 9] Completed in 0:12:15

[epoch 10] Starting epoch 10
[epoch 10] Completed in 0:12:21



In [13]:
def score_seq(lm, session, seq, vocab):
    """Score a sequence of words. Returns total log-probability."""
    padded_ids = vocab.words_to_ids(utils.canonicalize_words(["<s>"] + seq + ["</s>"], 
                                                             wordset=vocab.word_to_id))
    w = np.reshape(padded_ids[:-1], [1,-1])
    y = np.reshape(padded_ids[1:],  [1,-1])
    h = session.run(lm.initial_h_, {lm.input_w_: w})
    feed_dict = {lm.input_w_:w,
                 lm.target_y_:y,
                 lm.initial_h_:h,
                 lm.dropout_keep_prob_: 1.0}
    # Return log(P(seq)) = -1*loss
    return -1*session.run(lm.loss_, feed_dict)

def load_and_score(inputs, sort=True):
    """Load the trained model and score the given words."""
    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    
    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:  
        # Load the trained model
        saver.restore(session, trained_filename)

        if isinstance(inputs[0], str) or isinstance(inputs[0], bytes):
            inputs = [inputs]

        # Actually run scoring
        results = []
        for words in inputs:
            score = score_seq(lm, session, words, vocab)
            sentence_gen = " ".join(words)
            results.append((score, sentence_gen))

        # Sort if requested
        if sort: results = sorted(results, reverse=True)
            
        return results

In [14]:
sents = ["the boy and the girl is",
         "the boy and the girl are"]
load_and_score([s.split() for s in sents])

INFO:tensorflow:Restoring parameters from ./rnn/rnn_model/rnnlm_trained


[(-7.0964236259460449, 'the boy and the girl is'),
 (-7.1460084915161133, 'the boy and the girl are')]

In [15]:
tf.logging.set_verbosity(tf.logging.ERROR)
output_1 = open('./sentence_generator/output.txt', 'r').readlines()
output_2 = open('./sentence_generator/output_2.txt', 'r').readlines()

idx = 1
per_word = 0

sents_1 = []
sents_2 = []
avg_prob_1,avg_prob_2 = 0, 0
final_result = []
print("rnn_eval : ", len(output_1),' ', end=''),
for i in range (0, len(output_1)):
    split_1 = output_1[i].replace('\n','').split(',')
    split_2 = output_2[i].replace('\n','').split(',')

    sents_1.append(split_1[2])
    sents_1.append(split_2[2].replace(split_2[1],split_1[1]))
    sents_2.append(split_2[2])
    sents_2.append(split_1[2].replace(split_1[1],split_2[1]))

    if (i % 10) == 9:
        print('.', end='')
        score_1 = load_and_score([s.split() for s in sents_1])
        score_2 = load_and_score([s.split() for s in sents_2])
                
        for score in score_1:
            avg_prob_1 += score[0]
        avg_prob_1 = avg_prob_1 / 20
        
        for score in score_2:
            avg_prob_2 += score[0]
        avg_prob_2 = avg_prob_2 / 20
        
        final_result.append([split_1[1], split_2[1], (avg_prob_1-avg_prob_2)])
        
        sents_1.clear()
        sents_2.clear()
        idx += 1
        
print('\nFinished', )

rnn_eval :  9990  ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [16]:
import pandas as pd
import scipy.stats

# load evaluation datsets
path_evalset1 = '../datasets/SimLex-999/SimLex-999.txt'
evalset1 = pd.read_csv(path_evalset1, sep='\t')
evalset1.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


In [17]:
# Evaluating the models scoring on eval datasets

models_score = evalset1.filter(items=['word1', 'word2', 'SimLex999', 'SD(SimLex)'])
models_score = models_score.sort_index(by=['SimLex999'], ascending=False)
models_score = models_score.set_index(pd.Index(range(0,models_score.shape[0])))

models_score.head()

# add the rank column
rank = 0
models_score['rank_simlex'] = models_score.index + 1
print("\nTop and bottom rows of ranked SimLex eval dataset:")
pd.concat([models_score.head(), models_score.tail()], axis=0)


Top and bottom rows of ranked SimLex eval dataset:


  after removing the cwd from sys.path.


Unnamed: 0,word1,word2,SimLex999,SD(SimLex),rank_simlex
0,vanish,disappear,9.8,0.46,1
1,quick,rapid,9.7,1.14,2
2,creator,maker,9.62,1.4,3
3,stupid,dumb,9.58,1.48,4
4,insane,crazy,9.57,0.92,5
994,gun,fur,0.3,1.8,995
995,chapter,tail,0.3,1.57,996
996,dirty,narrow,0.3,0.89,997
997,new,ancient,0.23,0.46,998
998,shrink,grow,0.23,1.2,999


In [21]:
def prob_diff(word_1, word_2):
    pos_word_1 = [i for i,x in enumerate(final_result) if x[0] == word_1] # => [1, 3]
    for idx in pos_word_1:
        if final_result[idx][1] == word_2:
            return abs(final_result[idx][2])
    return -1

prob_diff('vanish','disappear')


0.82890466123126139

In [20]:
models_score['rnn_score'] = models_score.apply(lambda row: prob_diff(row['word1'],row['word2']), axis =1)
models_score.head()

Unnamed: 0,word1,word2,SimLex999,SD(SimLex),rank_simlex,rnn_score
0,vanish,disappear,9.8,0.46,1,0.828905
1,quick,rapid,9.7,1.14,2,0.01237
2,creator,maker,9.62,1.4,3,0.104025
3,stupid,dumb,9.58,1.48,4,0.761379
4,insane,crazy,9.57,0.92,5,0.845486


In [22]:
rnn_argsort = (-models_score['rnn_score']).argsort()   # to sort in descending order
models_score['rank_rnn'] = rnn_argsort + 1
pd.concat([models_score.head(), models_score.tail()], axis=0)

Unnamed: 0,word1,word2,SimLex999,SD(SimLex),rank_simlex,rnn_score,rank_rnn
0,vanish,disappear,9.8,0.46,1,0.828905,116
1,quick,rapid,9.7,1.14,2,0.01237,175
2,creator,maker,9.62,1.4,3,0.104025,202
3,stupid,dumb,9.58,1.48,4,0.761379,113
4,insane,crazy,9.57,0.92,5,0.845486,564
994,gun,fur,0.3,1.8,995,0.703535,89
995,chapter,tail,0.3,1.57,996,0.028255,599
996,dirty,narrow,0.3,0.89,997,0.0617,195
997,new,ancient,0.23,0.46,998,0.497806,255
998,shrink,grow,0.23,1.2,999,0.110749,383


In [23]:
rnn_spearman = scipy.stats.spearmanr(models_score['rank_simlex'], models_score['rank_rnn'])
print('Spearman correlation for word2vec model: {:6.4f} '.format(rnn_spearman[0]) )

Spearman correlation for word2vec model: 0.0300 
