## Task description

In [1]:
from IPython.display import IFrame
IFrame("./assignment5.pdf", width=800, height=1600)

## Imports and helpers

In [3]:
import tensorflow as tf
import numpy as np

from collections import Counter

import datetime
import os
import pickle
import time

In [4]:
def clean_string(string):
    return string.lower()

In [40]:
def load_data_and_labels(data_file_path, max_vocabSize, past_words):
    """
    Loads training data, creates vocabulary and returns the respective ids for words and tags
    """
    # Load data from file
    cwd = os.getcwd()
    # Collect word counts and unique PoS tags
    word_counts = Counter()
    unique_posTags = set()
    with open(cwd+data_file_path, "r") as tagged_sentences:
        for sentence in tagged_sentences:
            for tag in sentence.strip().split(" "):
                splitted_tag = tag.split("/")
                if len(splitted_tag) != 2:
                    continue
                word = clean_string(splitted_tag[0])
                pos = splitted_tag[1]
                unique_posTags.add(pos) # collect all unique PoS tags
                if word in word_counts: # collect word frequencies (used later to prune vocabulary)
                    word_counts[word] += 1
                else:
                    word_counts[word] = 1
    # Prune vocabulary to max_vocabSize
    words_toKeep = [tupl[0] for tupl in word_counts.most_common(max_vocabSize-1)]
    # Create mapping from words/PoS tags to ids
    word_toId = {word: i for i, word in enumerate(words_toKeep, 1)}
    word_toId["<UNK>"] = 0 # add unknown token to vocabulary (all words not contained in it will be mapped to this)
    pos_toId = {pos: i for i, pos in enumerate(list(unique_posTags))}
    # Save vocabulary and PoS tags ids for evaluation
    if not os.path.exists(cwd+"/vocab"):
        os.makedirs(cwd+"/vocab")
    with open(cwd+"/vocab/wordIds.pkl", "wb") as f:
        pickle.dump(word_toId, f)
    with open(cwd+"/vocab/posIds.pkl", "wb") as f:
        pickle.dump(pos_toId, f)
    # Replace each word with the id of the previous "past_words" words
    # and replace each PoS tag by its respective id
    x = []
    y = []
    with open(cwd+data_file_path, "r") as tagged_sentences:
        for sentence in tagged_sentences:
            pairs = sentence.strip().split(" ")
            words_and_tags = list(pair.split("/") for pair in pairs if len(pair.split("/")) == 2)
            if len(words_and_tags) == 0:
                continue
            words, pos_tags = zip(*words_and_tags)
            words = [clean_string(word) for word in words]
            for j in range(len(words)):
                y.append(pos_toId[ pos_tags[j] ])
                pastWords_ids = []
                for k in range(0, past_words+1): # for previous words
                    if j-k < 0: # out of bounds
                        pastWords_ids.append(0) # <UNK>
                    elif words[j-k] in word_toId: # word in vocabulary
                        pastWords_ids.append(word_toId[ words[j-k] ])
                    else: # word not in vocabulary
                        pastWords_ids.append(0) # <UNK>	
                x.append(pastWords_ids)

    return [np.array(x), np.array(y), len(unique_posTags)]

In [6]:
def load_data_and_labels_test(data_file_path, past_words):
    """
    Loads test data and vocabulary and returns the respective ids for words and tags
    """
    cwd = os.getcwd()

    # Load vocabulary and PoS tags ids from training
    if not os.path.exists(cwd+"/vocab"):
        raise FileNotFoundError("You need to run train.py first in order to generate the vocabulary.")
    with open(cwd+"/vocab/wordIds.pkl", "rb") as f:
        word_toId = pickle.load(f)
    with open(cwd+"/vocab/posIds.pkl", "rb") as f:
        pos_toId = pickle.load(f)
    # Replace each word with the id of the previous "past_words" words
    # and replace each PoS tag by its respective id
    x = []
    y = []
    with open(cwd+data_file_path, "r") as tagged_sentences:
        for sentence in tagged_sentences:
            pairs = sentence.strip().split(" ")
            words, pos_tags = zip(*(pair.split("/") for pair in pairs if len(pair.split("/")) == 2))
            for j in range(len(words)): # for each word in the sentence
                if pos_tags[j] in pos_toId: 
                    y.append(pos_toId[ pos_tags[j] ])
                else:
                    y.append(0) # TODO: This is not correct, but we should have seen all posible output tags in advance...
                pastWords_ids = []
                for k in range(1, past_words+1): # for previous words
                    if j-k < 0: # out of bounds
                        pastWords_ids.append(0) # <UNK>
                    elif words[j-k] in word_toId: # word in vocabulary
                        pastWords_ids.append(word_toId[ words[j-k] ])
                    else: # word not in vocabulary
                        pastWords_ids.append(0) # <UNK>	
                x.append(pastWords_ids)

    return [np.array(x), np.array(y)]

In [7]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

## PosTagger: Assignment

In [26]:
class PoSTagger(object):
    """
    A simple PoS tagger implementation in Tensorflow.
    Uses an embedding layer followed by a fully connected layer with ReLU and a softmax layer.
    """
    def __init__(self, num_classes, vocab_size, embedding_size, past_words): # sequence_length, filter_sizes, num_filters, l2_reg_lambda=0.0
        # Minibatch placeholders for input and output
        # The word indices of the window
        self.input_x = tf.placeholder(tf.int32, [None, past_words+1], name="input_x")
        # The target pos-tags
        self.input_y = tf.placeholder(tf.int64, [None], name="input_y") 

        with tf.device('/gpu:0'):

            # Embedding layer
            with tf.name_scope("embedding"):
                # Create an embedding matrix: |vocab_size x embedding_size|
                embedding_matrix = tf.get_variable('embedding_matrix',
                   shape=[vocab_size, embedding_size],
                   initializer=tf.random_uniform_initializer(minval=-1, maxval=1))
                
                # Create word embeddings tensor with embedding matrix lookup:
                # |None x past_words x embedding_size|
                embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                         self.input_x,
                                                         name='embedding_lookup')
                
                # Create feature vector:
                # Flatten out embeddings tensor to get e/a input sequence on row:
                # |None x (past_words+1)*embedding_size|
                x_flat_dim = (past_words+1)*embedding_size
                self.x_flat = tf.reshape(embeddings, [-1, x_flat_dim])
                
            # Fully connected layer with ReLU 
            with tf.name_scope("model"):
                # Set initializer handle for readability below
                xavi = tf.contrib.layers.xavier_initializer()
                
                # Set size d' for (first) hidden layer: Heuristic guess
                d_prime = embedding_size
                
                # Create/get weight matrix and bias vector for hidden layer
                W_1 = tf.get_variable('W_1', [x_flat_dim, d_prime], initializer=xavi)
                b_1 = tf.get_variable('b_1', [d_prime], initializer=tf.zeros_initializer)
                
                # Send feature vector through hidden layer
                out_1 = tf.nn.relu(tf.nn.xw_plus_b(self.x_flat, W_1, b_1))
                
                # Compute softmax logits :
                # Create/get weight matrix and bias vector for softmax layer
                W_softmax = tf.get_variable('W_softmax', [d_prime, num_classes], initializer=xavi)
                b_softmax = tf.get_variable('b_softmax', [num_classes], initializer=tf.zeros_initializer)
                
                # Calculate logits
                self.logits = tf.nn.xw_plus_b(out_1, W_softmax, b_softmax)
                
                # Apply softmax on logits to get predicted probabilities
                self.predictions = tf.nn.softmax(self.logits, name='predictions')
                
                # Find most probable prediction along axis=1
                self.best_prediction = tf.argmax(self.predictions, 1, name='best_prediction')
                
                # Compute the mean loss using tf.nn.sparse_softmax_cross_entropy_with_logits
                self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.input_y))

            # Calculate accuracy
            with tf.name_scope("accuracy"):
                # TODO compute the average accuracy over the batch (remember tf.argmax and tf.equal)
                self.correct_predictions = tf.cast(tf.equal(self.best_prediction,
                                                            self.input_y),
                                                   'float')
                self.accuracy = tf.reduce_mean(self.correct_predictions, name='accuracy')


## Training

In [None]:
class ConfigTrain(object):
    """
    Replacement class for training flags abomination originally given
    """
    def __init__(self,
                 dev_sample_percentage=.1,
                 data_file_path='/data/corpus.small',
                 embedding_dim=128,
                 vocab_size=50000,
                 past_words=3,
                 batch_size=64,
                 num_epochs=200,
                 evaluate_every=100,
                 checkpoint_every=100,
                 num_checkpoints=5,
                 allow_soft_placement=True,
                 log_device_placement=False):
        
        # Percentage of the training data used for validation (default: 10%)
        self.dev_sample_percentage=dev_sample_percentage
        
        # Path to the training data 
        self.data_file_path=data_file_path
        
        # Dimensionality of word embeddings (default: 128)
        self.embedding_dim=embedding_dim
        
        # Size of the vocabulary (default: 50k)
        self.vocab_size=vocab_size
        
        # How many previous words are used for prediction (default: 3)
        self.past_words=past_words
        
        # Batch Size (default: 64)
        self.batch_size=batch_size
        
        # Number of training epochs (default: 200)
        self.num_epochs=num_epochs
        
        # Evaluate model on dev set after this many steps (default: 100)
        self.evaluate_every=evaluate_every
        
        # Save model after this many steps (default: 100)
        self.checkpoint_every=checkpoint_every
        
        # Number of checkpoints to store (default: 5)
        self.num_checkpoints=num_checkpoints
        
        # Allow device soft device placement
        self.allow_soft_placement=allow_soft_placement
        
        # Log placement of ops on devices
        self.log_device_placement=log_device_placement
        

In [41]:
## DATA PREPARATION ##

CONFIG = ConfigTrain()

# Load data
print("Loading and preprocessing training and dev datasets \n")
x, y, num_outputTags = load_data_and_labels(CONFIG.data_file_path, CONFIG.vocab_size, CONFIG.past_words)

# Randomly shuffle data
np.random.seed(10)
shuffled_indices = np.random.permutation(len(y))
x_shuffled = x[shuffled_indices]
y_shuffled = y[shuffled_indices]

# Split train/dev sets
dev_sample_index = -1 * int(CONFIG.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

# Generate training batches
batches = batch_iter(list(zip(x_train, y_train)), CONFIG.batch_size, CONFIG.num_epochs)
print("Done \n")

## MODEL AND TRAINING PROCEDURE DEFINITION ##

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=CONFIG.allow_soft_placement,
        log_device_placement=CONFIG.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Initialize model
        pos_tagger = PoSTagger(
            num_classes=num_outputTags, 
            vocab_size=CONFIG.vocab_size, 
            embedding_size=CONFIG.embedding_dim, 
            past_words=CONFIG.past_words
        )

        # Define training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        
        # Define an optimizer, e.g. AdamOptimizer
        optimizer = tf.train.AdamOptimizer()
        
        # Define an optimizer step
        train_op = optimizer.minimize(pos_tagger.loss, global_step=global_step)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", pos_tagger.loss)
        acc_summary = tf.summary.scalar("accuracy", pos_tagger.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory (Tensorflow assumes this directory already exists so we need to create it)
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=CONFIG.num_checkpoints)

        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        sess.graph.finalize()

        # Define training and dev steps (batch) 
        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            start = time.time()
            feed_dict = {
                pos_tagger.input_x: x_batch,
                pos_tagger.input_y: y_batch
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, pos_tagger.loss, pos_tagger.accuracy],
                feed_dict)
            stop = time.time()
            time_str = datetime.datetime.now().isoformat()
            print("Training step: {:4d} \t loss: {:3f} \t accuracy: {:3f} \t {:3f} sec/batch".format(step, loss, accuracy, (stop-start)))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
                pos_tagger.input_x: x_batch,
                pos_tagger.input_y: y_batch
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, pos_tagger.loss, pos_tagger.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("Evaluation step: {:4d} \t loss: {:3f} \t accuracy: {:3f}".format(step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        ## TRAINING LOOP ##
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % CONFIG.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % CONFIG.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

Loading and preprocessing training and dev datasets 

Done 

Writing to /Users/z7717/Dropbox/Git/OwnRepos/18F-NLU/assignment_4/runs/1521563490

Training step:    1 	 loss: 3.918804 	 accuracy: 0.000000 	 0.089248 sec/batch
Training step:    2 	 loss: 3.664735 	 accuracy: 0.000000 	 0.042374 sec/batch
Training step:    3 	 loss: 3.529639 	 accuracy: 0.140625 	 0.043224 sec/batch
Training step:    4 	 loss: 3.602353 	 accuracy: 0.062500 	 0.042690 sec/batch
Training step:    5 	 loss: 3.440485 	 accuracy: 0.140625 	 0.043103 sec/batch
Training step:    6 	 loss: 3.375297 	 accuracy: 0.093750 	 0.043877 sec/batch
Training step:    7 	 loss: 3.486463 	 accuracy: 0.093750 	 0.064447 sec/batch
Training step:    8 	 loss: 3.252808 	 accuracy: 0.140625 	 0.042779 sec/batch
Training step:    9 	 loss: 3.359285 	 accuracy: 0.203125 	 0.052125 sec/batch
Training step:   10 	 loss: 3.064720 	 accuracy: 0.250000 	 0.046288 sec/batch
Training step:   11 	 loss: 3.037189 	 accuracy: 0.296875 	 0.0678

KeyboardInterrupt: 

## Testing

In [42]:
class ConfigTest(object):
    """
    Replacement class for testing flags abomination originally given
    """
    def __init__(self,
                 data_file_path='/data/corpus.small',
                 past_words=3,
                 batch_size=64,
                 checkpoint_dir='./runs/xxxxx/checkpoints/',
                 allow_soft_placement=True,
                 log_device_placement=False):
        
        # Path to the test data 
        self.data_file_path=data_file_path
        
        # How many previous words are used for prediction (default: 3)
        self.past_words=past_words
        
        # Batch Size (default: 64)
        self.batch_size=batch_size
        
        # Checkpoint directory from training run
        self.checkpoint_dir=checkpoint_dir
        
        # Allow device soft device placement
        self.allow_soft_placement=allow_soft_placement
        
        # Log placement of ops on devices
        self.log_device_placement=log_device_placement

In [43]:
## DATA PREPARATION ##

CONFIG = ConfigTest()

# Load data
print("Loading and preprocessing test dataset \n")
x_test, y_test = load_data_and_labels_test(CONFIG.data_file_path, CONFIG.past_words)

## EVALUATION ##

checkpoint_file = tf.train.latest_checkpoint(CONFIG.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=CONFIG.allow_soft_placement,
        log_device_placement=CONFIG.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("accuracy").outputs[0]

        # Generate batches for one epoch
        batches = data_utils.batch_iter(list(x_test), CONFIG.batch_size, 1, shuffle=False)

        # Collect the predictions here
        all_predictions = []

        for x_test_batch in batches:
            batch_predictions = sess.run(predictions, {input_x: x_test_batch})
            all_predictions = np.concatenate([all_predictions, batch_predictions])

# Print accuracy
correct_predictions = float(sum(all_predictions == y_test))
print("Total number of test examples: {}".format(len(y_test)))
print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))

Loading and preprocessing test dataset 



OSError: File None.meta does not exist.