In [7]:
import numpy as np
import tensorflow as tf
import see_rnn
import sys
import pandas as pd

In [10]:
def invert_dict(d):
    """Return a new dict that maps each value of ``d`` back to its key.

    Values of ``d`` become keys of the result, so they must be hashable.
    When several keys share the same value, the key visited last in
    ``d``'s iteration order is the one that is kept.
    """
    return dict(zip(d.values(), d.keys()))


def load_lm_dataset(fname):
    """Read a tab-separated file and return its sentences as token lists.

    The first line of the file is skipped. For every remaining line the
    text before the first tab is split on whitespace; the resulting list
    of tokens is one entry of the returned list.

    Args:
        fname: path of the file to read.

    Returns:
        A list with one list of string tokens per non-header line.
    """
    with open(fname) as handle:
        # The first line is never parsed (treated as a header).
        next(handle, None)
        return [row.strip().split('\t')[0].split() for row in handle]


def load_np_dataset(fname):
    """Read a tab-separated file into label-prefixed, truncated sentences.

    The first line of the file is skipped. Each remaining line is split
    on tabs; column 0 is a whitespace-tokenised sentence, column 2 is an
    integer cut-off index and column 3 is a label (the original code
    calls these ``verb_idx`` and ``verb_pos`` -- presumably the verb
    position and its POS tag; confirm against the data files).

    Args:
        fname: path of the file to read.

    Returns:
        A list with one entry per non-header line, each of the form
        ``[label] + tokens[:cut_off]``.
    """
    examples = []
    with open(fname) as handle:
        # The first line is never parsed (treated as a header).
        next(handle, None)
        for raw in handle:
            fields = raw.strip().split('\t')
            cutoff = int(fields[2])
            prefix = fields[0].split()[:cutoff]
            # The label goes first so it can later be peeled off as seq[0].
            examples.append([fields[3]] + prefix)
    return examples


def load_lm_np_dataset(fname):
    """Read a tab-separated file into truncated sentences plus two extra tokens.

    The first line of the file is skipped. Each remaining line is split
    on tabs; column 0 is a whitespace-tokenised sentence, column 2 is an
    integer cut-off index, and columns 4 and 5 are two tokens appended
    after the truncated sentence (the original code calls them ``verb``
    and ``inf_verb`` -- presumably the correct and the inflected verb
    form; confirm against the data files).

    Args:
        fname: path of the file to read.

    Returns:
        A list with one entry per non-header line, each of the form
        ``tokens[:cut_off] + [column_4, column_5]``.
    """
    collected = []
    with open(fname) as handle:
        # The first line is never parsed (treated as a header).
        next(handle, None)
        for raw in handle:
            cols = raw.strip().split('\t')
            stop = int(cols[2])
            tail = [cols[4], cols[5]]
            collected.append(cols[0].split()[:stop] + tail)
    return collected


def pad_sequence(seq, left=1, right=1):
    """Surround a token list with start and end markers.

    Args:
        seq: list of tokens; it is concatenated with lists, not modified.
        left: how many ``"<s>"`` markers to put in front.
        right: how many ``"</s>"`` markers to put at the end.

    Returns:
        A new list: ``left`` start markers, then ``seq``, then ``right``
        end markers.
    """
    prefix = ["<s>"] * left
    suffix = ["</s>"] * right
    return prefix + seq + suffix


# For RNN
# just convert each sentence to a list of indices
# after padding each with <s> ... </s> tokens
def seq_to_indices(words, word_to_num):
    """Look every word up in ``word_to_num`` and return the results as an array.

    Args:
        words: iterable of tokens.
        word_to_num: mapping from token to index.

    Returns:
        ``np.ndarray`` holding ``word_to_num[w]`` for each ``w`` in order.

    Raises:
        KeyError: if a token is missing from ``word_to_num``.
    """
    indices = []
    for token in words:
        indices.append(word_to_num[token])
    return np.array(indices)


def docs_to_indices(sents, word_to_num, pad_left=1, pad_right=1):
    """Convert tokenised sentences into arrays of vocabulary indices.

    Each sentence is padded via ``pad_sequence``, tokens that are not keys
    of ``word_to_num`` are replaced by ``'UNK'``, and the result is mapped
    to indices via ``seq_to_indices``.

    Args:
        sents: iterable of token lists.
        word_to_num: mapping from token to index; it needs an ``'UNK'``
            entry whenever a sentence contains an unknown token.
        pad_left: number of start markers passed to ``pad_sequence``.
        pad_right: number of end markers passed to ``pad_sequence``.

    Returns:
        ``np.ndarray`` with ``dtype=object`` built from the per-sentence
        index arrays.
    """
    encoded = []
    for raw in sents:
        padded = pad_sequence(raw, pad_left, pad_right)
        # Out-of-vocabulary tokens all collapse onto the single 'UNK' entry.
        known = [tok if tok in word_to_num else 'UNK' for tok in padded]
        encoded.append(seq_to_indices(known, word_to_num))

    # An object array (rather than a plain list) allows fancy slicing later.
    return np.array(encoded, dtype=object)


def offset_seq(seq):
    """Split a sequence into two views shifted by one position.

    Returns:
        ``(seq[:-1], seq[1:])`` -- everything but the last element, and
        everything but the first element.
    """
    inputs = seq[:-1]
    targets = seq[1:]
    return inputs, targets


def offset_np(seq):
    """Separate the leading element of a sequence from the rest.

    Returns:
        ``(seq[1:], [seq[0]])`` -- the remainder first, then the leading
        element wrapped in a one-element list.

    Raises:
        IndexError: if ``seq`` is empty.
    """
    remainder = seq[1:]
    head = [seq[0]]
    return remainder, head


def offset_lm_np(seq):
    """Separate the last two elements of a sequence from what precedes them.

    Returns:
        ``(seq[:-2], [seq[-2], seq[-1]])`` -- the leading part first, then
        the final two elements as a two-element list.

    Raises:
        IndexError: if ``seq`` has fewer than two elements.
    """
    body = seq[:-2]
    last_two = [seq[-2], seq[-1]]
    return body, last_two


def seqs_to_lmXY(seqs):
    """Build input/target arrays by applying ``offset_seq`` to every sequence.

    Args:
        seqs: iterable of sequences.

    Returns:
        ``(X, Y)``: two ``dtype=object`` numpy arrays holding, for every
        sequence, the first and second element of ``offset_seq(sequence)``.
        For an empty ``seqs`` both arrays are empty.
    """
    pairs = [offset_seq(s) for s in seqs]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpack below would raise
        # ValueError; an empty dataset maps to two empty arrays instead.
        return np.array([], dtype=object), np.array([], dtype=object)
    X, Y = zip(*pairs)
    return np.array(X, dtype=object), np.array(Y, dtype=object)


def seqs_to_npXY(seqs):
    """Build input/target arrays by applying ``offset_np`` to every sequence.

    Args:
        seqs: iterable of sequences.

    Returns:
        ``(X, Y)``: two ``dtype=object`` numpy arrays holding, for every
        sequence, the first and second element of ``offset_np(sequence)``.
        For an empty ``seqs`` both arrays are empty.
    """
    pairs = [offset_np(s) for s in seqs]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpack below would raise
        # ValueError; an empty dataset maps to two empty arrays instead.
        return np.array([], dtype=object), np.array([], dtype=object)
    X, Y = zip(*pairs)
    return np.array(X, dtype=object), np.array(Y, dtype=object)


def seqs_to_lmnpXY(seqs):
    """Build input/target arrays by applying ``offset_lm_np`` to every sequence.

    Args:
        seqs: iterable of sequences.

    Returns:
        ``(X, Y)``: two ``dtype=object`` numpy arrays holding, for every
        sequence, the first and second element of
        ``offset_lm_np(sequence)``. For an empty ``seqs`` both arrays are
        empty.
    """
    pairs = [offset_lm_np(s) for s in seqs]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpack below would raise
        # ValueError; an empty dataset maps to two empty arrays instead.
        return np.array([], dtype=object), np.array([], dtype=object)
    X, Y = zip(*pairs)
    return np.array(X, dtype=object), np.array(Y, dtype=object)

def fraq_loss(vocab, word_to_num, vocabsize):
    """Return the share of token counts that falls outside ``word_to_num``.

    The ``'count'`` column of ``vocab`` is walked once. The row labelled
    ``"UNK"`` is ignored entirely; every other row adds its count to the
    total, and rows whose label is not a key of ``word_to_num`` also add
    it to the lost part.

    Args:
        vocab: pandas DataFrame indexed by word, with a ``'count'`` column.
        word_to_num: mapping whose keys are the retained words.
        vocabsize: unused; kept so existing callers keep working.

    Returns:
        ``lost / total`` as a float, or ``0.0`` when the total count is
        zero (e.g. an empty vocabulary), where the ratio is undefined.
    """
    lost = 0
    total = 0
    # Iterating (label, count) pairs avoids one label lookup per word and
    # stays correct when the index contains repeated labels.
    for word, count in vocab['count'].items():
        if word == "UNK":
            continue
        total += count
        if word not in word_to_num:
            lost += count
    if total == 0:
        return 0.0
    return float(lost) / float(total)

def adjust_loss(loss, fracloss, q, mode='basic'):
    """Adjust a loss value for the fraction of tokens mapped to UNK.

    With ``f = fracloss``:

    * ``mode == 'basic'``: ``(loss + f*log(f)) / (1 - f)``
    * any other mode:      ``loss + f*log(f) - f*log(q)``

    The ``f*log(f)`` term is taken to be 0 when ``f == 0`` (its limit),
    so a vocabulary with no loss leaves ``loss`` unchanged instead of
    turning it into ``nan``.

    Args:
        loss: unadjusted loss value.
        fracloss: fraction of tokens lost to UNK.
        q: value whose log is subtracted (weighted by ``fracloss``) in the
            non-basic mode; not read in ``'basic'`` mode.
        mode: ``'basic'`` selects the first formula, anything else the
            second.

    Returns:
        The adjusted loss. ``fracloss == 1`` in ``'basic'`` mode still
        divides by zero, as before.
    """
    frac = np.asarray(fracloss, dtype=float)
    # np.where evaluates both branches, so silence the log(0) warnings that
    # come from the branch that is discarded.
    with np.errstate(divide='ignore', invalid='ignore'):
        unk_term = np.where(frac == 0, 0.0, frac * np.log(frac))

    if mode == 'basic':
        # remove freebies only: score if had no UNK
        return (loss + unk_term) / (1 - fracloss)
    # remove freebies, replace with best prediction on remaining
    return loss + unk_term - fracloss * np.log(q)

In [18]:
# Sizes used to truncate the data sets and the vocabulary below.
train_size = 10000
dev_size = 1000
vocab_size = 2000

data_folder = "../data/"
np.random.seed(2018)
# hdim, lookback and lr are only defined here; nothing in this cell reads them.
hdim = 20
lookback = 5
lr = 0.5

# get the data set vocabulary
# `sep` is a regular expression, so it has to be a raw string: "\s" in a
# plain string literal is an invalid escape sequence.
vocab = pd.read_table(data_folder + "/vocab.wiki.txt", header=None, sep=r"\s+", index_col=0,
                        names=['count', 'freq'], )
# Only the first `vocab_size` rows of the vocabulary file get an index.
num_to_word = dict(enumerate(vocab.index[:vocab_size]))
word_to_num = invert_dict(num_to_word)

# calculate the fraction of tokens lost by truncating the vocabulary to vocab_size
fraction_lost = fraq_loss(vocab, word_to_num, vocab_size)
print(
    "Retained %d words from %d (%.02f%% of all tokens)\n" % (
    vocab_size, len(vocab), 100 * (1 - fraction_lost)))

# load training data
sents = load_np_dataset(data_folder + '/wiki-train.txt')
# No <s>/</s> padding here (both pad counts are 0).
S_train = docs_to_indices(sents, word_to_num, 0, 0)
X_train, D_train = seqs_to_npXY(S_train)

# NOTE(review): the truncated training targets are stored as Y_train and
# D_train stays full-length, whereas the dev targets below overwrite D_dev.
# Confirm that later cells expect this asymmetry.
X_train = X_train[:train_size]
Y_train = D_train[:train_size]

# load development data
sents = load_np_dataset(data_folder + '/wiki-dev.txt')
S_dev = docs_to_indices(sents, word_to_num, 0, 0)
X_dev, D_dev = seqs_to_npXY(S_dev)

X_dev = X_dev[:dev_size]
D_dev = D_dev[:dev_size]

Retained 2000 words from 9954 (88.35% of all tokens)

