In [1]:
import numpy as np
import tensorflow as tf
from q2_initialization import xavier_weight_init
import data_utils.utils as du
import data_utils.ner as ner
from utils import data_iterator
from model import LanguageModel

In [2]:
class Config(object):
  """Holds model hyperparams and data information.

  The config class is used to store various hyperparameters and dataset
  information parameters. Model objects are passed a Config() object at
  instantiation.
  """
  embed_size = 50
  batch_size = 64
  label_size = 5
  hidden_size = 100
  max_epochs = 1
  early_stopping = 2
  dropout = 0.9
  lr = 0.001
  l2 = 0.001  # regularization term
  window_size = 3
config = Config()

In [3]:
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt', 'data/ner/wordVectors.txt')

In [4]:
wv.shape

(100232, 50)

In [5]:
tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
num_to_tag = dict(enumerate(tagnames))
tag_to_num = {v:k for k,v in num_to_tag.iteritems()}

In [6]:
docs = du.load_dataset('data/ner/train')

In [7]:
docs ## docs is a list of sentences-lists of words/types couples.

[[['EU', 'ORG'],
  ['rejects', 'O'],
  ['German', 'MISC'],
  ['call', 'O'],
  ['to', 'O'],
  ['boycott', 'O'],
  ['British', 'MISC'],
  ['lamb', 'O'],
  ['.', 'O']],
 [['Peter', 'PER'], ['Blackburn', 'PER']],
 [['BRUSSELS', 'LOC'], ['1996-08-22', 'O']],
 [['The', 'O'],
  ['European', 'ORG'],
  ['Commission', 'ORG'],
  ['said', 'O'],
  ['on', 'O'],
  ['Thursday', 'O'],
  ['it', 'O'],
  ['disagreed', 'O'],
  ['with', 'O'],
  ['German', 'MISC'],
  ['advice', 'O'],
  ['to', 'O'],
  ['consumers', 'O'],
  ['to', 'O'],
  ['shun', 'O'],
  ['British', 'MISC'],
  ['lamb', 'O'],
  ['until', 'O'],
  ['scientists', 'O'],
  ['determine', 'O'],
  ['whether', 'O'],
  ['mad', 'O'],
  ['cow', 'O'],
  ['disease', 'O'],
  ['can', 'O'],
  ['be', 'O'],
  ['transmitted', 'O'],
  ['to', 'O'],
  ['sheep', 'O'],
  ['.', 'O']],
 [['Germany', 'LOC'],
  ["'s", 'O'],
  ['representative', 'O'],
  ['to', 'O'],
  ['the', 'O'],
  ['European', 'ORG'],
  ['Union', 'ORG'],
  ["'s", 'O'],
  ['veterinary', 'O'],
  ['committ

In [8]:
len(docs), type(docs)

(14042, list)

In [9]:
docs[0]

[['EU', 'ORG'],
 ['rejects', 'O'],
 ['German', 'MISC'],
 ['call', 'O'],
 ['to', 'O'],
 ['boycott', 'O'],
 ['British', 'MISC'],
 ['lamb', 'O'],
 ['.', 'O']]

In [10]:
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num)

In [11]:
X_train.shape

(203621, 3)

In [12]:
y_train.shape

(203621,)

In [13]:
X_train

array([[   30,  6659, 12637],
       [ 6659, 12637,   445],
       [12637,   445,   977],
       ..., 
       [10797,     0,  2366],
       [    0,  2366,     0],
       [ 2366,     0,    31]])

In [14]:
y_train

array([3, 0, 2, ..., 0, 3, 0])

In [15]:
sess = tf.InteractiveSession()

## placeholders

In [16]:
input_placeholder   = tf.placeholder(tf.int32, [None, config.window_size], name="input_placeholder")
labels_placeholder  = tf.placeholder(tf.int32, [None, config.label_size], name="labels_placeholder")
dropout_placeholder = tf.placeholder(tf.float32, [], name="dropout_placeholder")

## Embedding

In [17]:
L = tf.Variable(tf.random_uniform([len(wv), config.embed_size], -1.0, 1.0), name="L")

In [18]:
L

<tensorflow.python.ops.variables.Variable at 0x11d8c1390>

In [19]:
L.get_shape()

TensorShape([Dimension(100232), Dimension(50)])

In [20]:
tf.nn.embedding_lookup(L, input_placeholder)

<tf.Tensor 'embedding_lookup:0' shape=(?, 3, 50) dtype=float32>

In [21]:
window = tf.reshape(tf.nn.embedding_lookup(L, input_placeholder), [-1, config.window_size * config.embed_size])

In [22]:
window

<tf.Tensor 'Reshape:0' shape=(?, 150) dtype=float32>

## Model

In [23]:
xavier_initializer = xavier_weight_init()

In [36]:
with tf.variable_scope("Layerrr"):
    W = tf.get_variable("W", initializer=xavier_initializer([config.window_size*config.embed_size, config.hidden_size]))
    b1 = tf.get_variable("b1", [config.hidden_size])
    tf.matmul(window, W) + b1

ValueError: Variable Layerrr/W already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "<ipython-input-35-6afe35404948>", line 2, in <module>
    W = tf.get_variable("W", initializer=xavier_initializer([config.window_size*config.embed_size, config.hidden_size]))
  File "/Users/gibbon/miniconda3/envs/stanford/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/Users/gibbon/miniconda3/envs/stanford/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):


In [38]:
h = tf.tanh(tf.matmul(window, W) + b1)

In [39]:

h

<tf.Tensor 'Tanh:0' shape=(?, 100) dtype=float32>

In [32]:
W.get_shape()

TensorShape([Dimension(150), Dimension(100)])

In [33]:
b1.get_shape()

TensorShape([Dimension(100)])

In [34]:
with tf.variable_scope("Softmax"):
    U = tf.get_variable("U", initializer=xavier_initializer([config.hidden_size, config.label_size]))
    b2 = tf.get_variable("b2", [config.label_size])

In [40]:
output = tf.matmul(h, U) + b2

In [41]:
output

<tf.Tensor 'add_2:0' shape=(?, 5) dtype=float32>

## Loss

In [50]:
y = output

In [51]:
losses = tf.nn.softmax_cross_entropy_with_logits(labels=labels_placeholder, logits=y)

In [52]:
losses

<tf.Tensor 'Reshape_3:0' shape=(?,) dtype=float32>

# Test drive 

In [43]:
sess.run(tf.global_variables_initializer())

In [46]:
sess.run(output, feed_dict={input_placeholder: X_train, labels_placeholder: y_train})

ValueError: Cannot feed value of shape (203621,) for Tensor u'labels_placeholder:0', which has shape '(?, 5)'