# Introduction to Deep Learning: Homework 5 & 6

**Nathan Inkawhich**

**[Duke Community Standard](http://integrity.duke.edu/standard.html): By typing your name below, you are certifying that you have adhered to the Duke Community Standard in completing this assignment.**

Name: Nathan Inkawhich

## Problem 2:  Recurrent Neural Networks (30 points)

In [1]:
from urllib.request import urlretrieve
import os
import numpy as np
import h5py
import string
import random
import tensorflow as tf

### Download and format word embeddings

In [2]:
# Fetch the ConceptNet Numberbatch "mini" embeddings once; later runs reuse
# the local copy of the file.
if not os.path.isfile('mini.h5'):
    print("Downloading Conceptnet Numberbatch word embeddings...")
    conceptnet_url = 'http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5'
    urlretrieve(conceptnet_url, 'mini.h5')

# Read the vocabulary (stored as bytes, decoded to str) and the embedding
# matrix out of the HDF5 file.
with h5py.File('mini.h5', 'r') as f:
    all_embeddings = f['mat']['block0_values'][:]
    all_words = [raw_word.decode('utf-8') for raw_word in f['mat']['axis1'][:]]

# Keep only the entries whose key starts with the English prefix, and strip
# that prefix from the words themselves.
english_prefix = '/c/en/'
english_word_indices = [
    position for position, word in enumerate(all_words)
    if word.startswith(english_prefix)
]
english_words = [all_words[position][len(english_prefix):] for position in english_word_indices]
english_embedddings = all_embeddings[english_word_indices]

# Scale every embedding row to unit L2 norm, so that a dot product between
# two rows is their cosine similarity.
norms = np.linalg.norm(english_embedddings, axis=1)
normalized_embeddings = english_embedddings.astype('float32') / norms.astype('float32')[:, np.newaxis]

# Lookup table: word -> row number in the embedding matrices
index = dict(zip(english_words, range(len(english_words))))

In [3]:
def similarity_score(w1, w2):
    """Return the dot product of the normalized embeddings of `w1` and `w2`.

    Both words are looked up in `index`; a word that is not a key of `index`
    raises KeyError.
    """
    first_vector = normalized_embeddings[index[w1], :]
    second_vector = normalized_embeddings[index[w2], :]
    return np.dot(first_vector, second_vector)

def print_similarity(w1,w2):
    """Print `w1`, `w2` and their similarity score, separated by tabs.

    If either word is missing from the vocabulary, a short notice is printed
    instead of the score. Always returns None.
    """
    try:
        score = similarity_score('{}'.format(w1), '{}'.format(w2))
    except KeyError:
        # Only a failed vocabulary lookup is expected here. Anything else
        # (KeyboardInterrupt, a NameError from an un-run cell, ...) should
        # surface instead of being reported as a missing word.
        print('One of the words is not in the dictionary.')
    else:
        print('{0}\t{1}\t'.format(w1,w2), score)
    return None

In [4]:
# Word pairs ordered from identical, through closely related, to unrelated.
word_pairs = [
    ('cat', 'cat'),      # a word is as similar with itself as possible
    ('cat', 'feline'),   # closely related words still get high scores
    ('cat', 'dog'),
    ('cat', 'moo'),      # unrelated words, not so much
    ('cat', 'freeze'),
]
for left_word, right_word in word_pairs:
    print('{}\t{}\t'.format(left_word, right_word), similarity_score(left_word, right_word))

cat	cat	 1.0000001
cat	feline	 0.8199548
cat	dog	 0.590724
cat	moo	 0.0039538303
cat	freeze	 -0.030225191


### Prepare movie dataset

In [5]:
# Translation table that deletes every character in string.punctuation
remove_punct=str.maketrans('','',string.punctuation)

def convert_line_to_example(line):
    """Convert one line of the data file into an example dict.

    The first character of `line` is parsed as the integer label and the
    review text is taken from the third character onward. Punctuation is
    removed, the text is lower-cased and split on whitespace, and every word
    found in `index` is replaced by its row of `normalized_embeddings`.
    Words that are not in `index` are ignored.

    Returns:
        dict with keys
          'x': mean of the word embeddings of the review,
          'y': the integer label,
          'w': list of the individual word embeddings, in review order.

    Raises:
        ValueError: if the first character is not an integer, or if none of
            the review's words is in the vocabulary (the mean is undefined).
    """
    # Pull out the first character: that's our label (0 or 1)
    y = int(line[0])
    # Split the rest of the line into cleaned, lower-cased words
    words = line[2:].translate(remove_punct).lower().split()
    # Look up the embedding of each word, ignoring out-of-vocabulary words
    embeddings = [normalized_embeddings[index[w]] for w in words
                  if w in index]
    if not embeddings:
        # np.vstack([]) would fail with an opaque "need at least one array"
        # error; say which line is the problem instead.
        raise ValueError(
            'No in-vocabulary words found in line: {!r}'.format(line))
    # Take the mean of the embeddings
    x = np.mean(np.vstack(embeddings), axis=0)
    return {'x': x, 'y': y, 'w':embeddings}

# Text encoding passed to open() when the review files are read in later cells.
enc = 'utf-8' # This is necessary from within the singularity shell

### Problem 2.1: Train an MLP off of the average word embedding to predict sentiment (as done in class) but optimize the network settings to maximize performance

#### Format Dataset

In [6]:
### Choose Dataset
# Point the path at "Data/movie-pang02.txt" to use the alternative dataset.
with open("Data/movie-simple.txt", "r", encoding=enc) as f:
    dataset = [convert_line_to_example(review_line) for review_line in f]

print("Length of Dataset: ",len(dataset))

# Randomize the example order in place before splitting
random.shuffle(dataset)

# Three quarters of the whole batches go to training; everything after that
# point (including any leftover partial batch) becomes the test split.
batch_size = 100
total_batches = len(dataset) // batch_size
train_batches = (3 * total_batches) // 4
split_at = train_batches * batch_size
train = dataset[:split_at]
test = dataset[split_at:]

Length of Dataset:  1411


#### Build MLP

In [7]:
### Configs
num_hidden_L1 = 100   # units in the first hidden layer
num_hidden_L2 = 20    # units in the second hidden layer
learning_rate = .05
num_epochs = 250

# Start from an empty default graph so re-running this cell does not stack
# a second copy of the model on top of the first
tf.reset_default_graph()

# Inputs: one averaged word embedding (size 300) per review and its 0/1 label
X = tf.placeholder(dtype=tf.float32, shape=[None, 300])
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Two ReLU hidden layers followed by a single linear output unit
h1 = tf.layers.dense(inputs=X, units=num_hidden_L1, activation=tf.nn.relu)
h2 = tf.layers.dense(inputs=h1, units=num_hidden_L2, activation=tf.nn.relu)
logits = tf.layers.dense(inputs=h2, units=1)
probabilities = tf.sigmoid(logits)

# Loss: mean sigmoid cross-entropy over the batch
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y)
loss = tf.reduce_mean(per_example_loss)

# Accuracy: fraction of examples whose rounded sigmoid output equals the label
predicted_labels = tf.round(tf.sigmoid(logits))
is_correct = tf.equal(predicted_labels, y)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Plain gradient-descent update on the loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# Op that initializes every variable created above
initialize_all = tf.global_variables_initializer()

#### Train MLP

In [8]:
sess = tf.Session()
sess.run(initialize_all)

for epoch in range(num_epochs):
    # One pass over the training split in consecutive mini-batches
    for batch in range(train_batches):
        batch_start = batch * batch_size
        data = train[batch_start:batch_start + batch_size]
        reviews = [sample['x'] for sample in data]
        labels = np.array([sample['y'] for sample in data]).reshape([-1, 1])
        _, l, acc = sess.run([train_step, loss, accuracy], feed_dict={X: reviews, y: labels})
    # l and acc are the metrics of the last mini-batch of this epoch only
    if epoch % 50 == 0:
        print("Epoch", epoch, "Loss", l, "Acc", acc)
    # Re-order the training examples so the next epoch forms different batches
    random.shuffle(train)

# Evaluate on the whole test split in a single run
test_reviews = [sample['x'] for sample in test]
test_labels = np.array([sample['y'] for sample in test]).reshape([-1, 1])
acc = sess.run(accuracy, feed_dict={X: test_reviews, y: test_labels})
print("Final test accuracy:", acc)

Epoch 0 Loss 0.690119 Acc 0.55
Epoch 50 Loss 0.60457414 Acc 0.71
Epoch 100 Loss 0.23702054 Acc 0.93
Epoch 150 Loss 0.20047058 Acc 0.91
Epoch 200 Loss 0.11904577 Acc 0.97
Final test accuracy: 0.9537713


In [9]:
# Close the TensorFlow session that was opened for training/evaluating the MLP.
sess.close()

### Problem 2.2:  Train an RNN from the word embeddings to predict sentiment (as done in class) and optimize the network settings to maximize performance

#### Format Dataset

In [10]:
### Choose Dataset
# Point the path at "Data/movie-pang02.txt" to use the alternative dataset.
with open("Data/movie-simple.txt", "r", encoding=enc) as f:
    dataset = [convert_line_to_example(review_line) for review_line in f]

print("Length of Dataset: ",len(dataset))

# Randomize the example order in place before splitting
random.shuffle(dataset)

# One review per batch; the first three quarters of the batches are used for
# training and the remainder for testing.
batch_size = 1
total_batches = len(dataset) // batch_size
train_batches = (3 * total_batches) // 4
split_at = train_batches * batch_size
train = dataset[:split_at]
test = dataset[split_at:]

Length of Dataset:  1411


#### Build RNN Model

In [11]:
# Start from an empty default graph
tf.reset_default_graph()

# Configs
n_steps = None    # time dimension left unspecified in the input placeholder
n_inputs = 300    # size of each per-word input vector
n_neurons = 50    # number of units in the recurrent cell
num_epochs = 1

# Inputs: a batch of word-vector sequences and one 0/1 label per sequence
X = tf.placeholder(dtype=tf.float32, shape=[None, n_steps, n_inputs])
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Recurrent layer with tanh activation, unrolled dynamically over the sequence
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.tanh)
outputs, states = tf.nn.dynamic_rnn(cell=basic_cell, inputs=X, dtype=tf.float32)

# Classify from the output at the final time step via one linear unit
last_cell_output = outputs[:, -1, :]
y_ = tf.layers.dense(inputs=last_cell_output, units=1)

# Loss: mean sigmoid cross-entropy over the batch
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_, labels=y)
loss = tf.reduce_mean(per_example_loss)

# Accuracy: fraction of examples whose rounded sigmoid output equals the label
predicted_labels = tf.round(tf.sigmoid(y_))
is_correct = tf.equal(predicted_labels, y)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Adam update on the loss
train_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

#### Train RNN

In [12]:
initialize_all = tf.global_variables_initializer()
sess = tf.Session()
sess.run(initialize_all)

# Exponential moving averages (decay 0.99) of the per-batch loss and accuracy;
# the starting values are just initial guesses.
l_ma = .74
acc_ma = .5

for epoch in range(num_epochs):
    for batch in range(train_batches):
        batch_start = batch * batch_size
        data = train[batch_start:batch_start + batch_size]
        # Per-word embeddings of the review as a single (1, n_words, 300) sequence
        reviews = np.array([sample['w'] for sample in data]).reshape([1, -1, 300])
        labels = np.array([sample['y'] for sample in data]).reshape([1, 1])
        _, l, acc = sess.run([train_step, loss, accuracy], feed_dict={X: reviews, y: labels})
        l_ma = .99*l_ma + .01*l
        acc_ma = .99*acc_ma + .01*acc
        # Progress report every 100 batches
        if (batch + 1) % 100 == 0:
            print("batch", batch, "Loss", l_ma, "Acc", acc_ma)
    # Report after every epoch
    print("Epoch", epoch, "Loss", l_ma, "Acc", acc_ma)
    # Re-order the training examples for the next epoch
    random.shuffle(train)

batch 99 Loss 0.6973456715211871 Acc 0.5231646095566507
batch 199 Loss 0.6073786472962632 Acc 0.6580323704982015
batch 299 Loss 0.4412638618766594 Acc 0.7960813802179973
batch 399 Loss 0.37871248174390876 Acc 0.8394178290795372
batch 499 Loss 0.4126086860810042 Acc 0.8151977302386081
batch 599 Loss 0.42612175790679896 Acc 0.7904779924002064
batch 699 Loss 0.3922986187270466 Acc 0.8279460272620374
batch 799 Loss 0.38161134662699775 Acc 0.8336617848638171
batch 899 Loss 0.3739993986912681 Acc 0.8160768972219207
batch 999 Loss 0.3222696428076551 Acc 0.8648031316807501
Epoch 0 Loss 0.3448121127542254 Acc 0.8502255011363107


In [13]:
# Evaluate on the test split one review at a time and average the
# per-review accuracies.
test_acc = 0
for sample in test:
    test_reviews = np.array([sample['w']]).reshape([1, -1, 300])
    test_labels = np.array([sample['y']]).reshape([1, 1])
    test_acc += sess.run(accuracy, feed_dict={X: test_reviews, y: test_labels})
n = len(test)
acc = test_acc / n
print("Final accuracy:", acc)


Final accuracy: 0.8951841359773371


In [14]:
# Close the TensorFlow session that was opened for training/evaluating the RNN.
sess.close()

### Problem 2.3:  Encode each vocabulary word as a one-hot vector. Train an MLP on the average of the one-hot vectors.

#### Build one hot embedding functionality

In [15]:
vocab_size = len(english_words)
print("len(english_words): ", vocab_size)
print("english_embedddings.shape: ", english_embedddings.shape)

# One-hot encoding scheme: row i of a vocab_size x vocab_size identity matrix
# is the one-hot vector of word i.
# NOTE(review): this is a dense square matrix over the whole vocabulary, so
# its nominal size grows quadratically with the vocabulary.
onehot_embeddings = np.eye(vocab_size, dtype=np.float32)
print("onehot_embeddings.shape: ", onehot_embeddings.shape)

# Lookup table: word -> row number
index = dict(zip(english_words, range(vocab_size)))
print("Size of index dict: ", len(index))

# Translation table that deletes every character in string.punctuation
remove_punct = str.maketrans('', '', string.punctuation)

def convert_line_to_example_onehot(line):
    """Convert one line of the data file into a one-hot example dict.

    The first character of `line` is parsed as the integer label and the
    review text is taken from the third character onward. Punctuation is
    removed, the text is lower-cased and split on whitespace, and every word
    found in `index` is replaced by its row of `onehot_embeddings`.
    Words that are not in `index` are ignored.

    Returns:
        dict with keys
          'x': mean of the one-hot vectors of the review,
          'y': the integer label,
          'w': list of the individual one-hot vectors, in review order.

    Raises:
        ValueError: if the first character is not an integer, or if none of
            the review's words is in the vocabulary (the mean is undefined).
    """
    # Pull out the first character: that's our label (0 or 1)
    y = int(line[0])
    # Split the rest of the line into cleaned, lower-cased words
    words = line[2:].translate(remove_punct).lower().split()
    # Look up the one-hot vector of each word, ignoring out-of-vocabulary words
    embeddings = [onehot_embeddings[index[w]] for w in words if w in index]
    if not embeddings:
        # np.vstack([]) would fail with an opaque "need at least one array"
        # error; say which line is the problem instead.
        raise ValueError(
            'No in-vocabulary words found in line: {!r}'.format(line))
    # Take the mean of the one-hot vectors
    x = np.mean(np.vstack(embeddings), axis=0)
    return {'x': x, 'y': y, 'w':embeddings}

# Text encoding passed to open() when the review files are read in later cells.
enc = 'utf-8' # This is necessary from within the singularity shell

len(english_words):  150875
english_embedddings.shape:  (150875, 300)
onehot_embeddings.shape:  (150875, 150875)
Size of index dict:  150875


#### Create train/test datasets for MLP

In [16]:
### Choose Dataset
# Point the path at "Data/movie-pang02.txt" to use the alternative dataset.
with open("Data/movie-simple.txt", "r", encoding=enc) as f:
    dataset = [convert_line_to_example_onehot(review_line) for review_line in f]

print("Length of Dataset: ",len(dataset))

# Randomize the example order in place before splitting
random.shuffle(dataset)

# Three quarters of the whole batches go to training; everything after that
# point (including any leftover partial batch) becomes the test split.
batch_size = 100
total_batches = len(dataset) // batch_size
train_batches = (3 * total_batches) // 4
split_at = train_batches * batch_size
train = dataset[:split_at]
test = dataset[split_at:]

print("# train: ", len(train))
print("# test: ", len(test))

Length of Dataset:  1411
# train:  1000
# test:  411


#### Build MLP

In [17]:
### Configs
num_hidden_L1 = 100   # units in the first hidden layer
num_hidden_L2 = 20    # units in the second hidden layer
learning_rate = .05
num_epochs = 50

# Start from an empty default graph so re-running this cell does not stack
# a second copy of the model on top of the first
tf.reset_default_graph()

# Inputs: one averaged one-hot vector (size 150875) per review and its 0/1 label
X = tf.placeholder(dtype=tf.float32, shape=[None, 150875])
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Two ReLU hidden layers followed by a single linear output unit
h1 = tf.layers.dense(inputs=X, units=num_hidden_L1, activation=tf.nn.relu)
h2 = tf.layers.dense(inputs=h1, units=num_hidden_L2, activation=tf.nn.relu)
logits = tf.layers.dense(inputs=h2, units=1)
probabilities = tf.sigmoid(logits)

# Loss: mean sigmoid cross-entropy over the batch
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y)
loss = tf.reduce_mean(per_example_loss)

# Accuracy: fraction of examples whose rounded sigmoid output equals the label
predicted_labels = tf.round(tf.sigmoid(logits))
is_correct = tf.equal(predicted_labels, y)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Plain gradient-descent update on the loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# Op that initializes every variable created above
initialize_all = tf.global_variables_initializer()

#### Train MLP

In [18]:
sess = tf.Session()
sess.run(initialize_all)

for epoch in range(num_epochs):
    # One pass over the training split in consecutive mini-batches
    for batch in range(train_batches):
        batch_start = batch * batch_size
        data = train[batch_start:batch_start + batch_size]
        reviews = [sample['x'] for sample in data]
        labels = np.array([sample['y'] for sample in data]).reshape([-1, 1])
        _, l, acc = sess.run([train_step, loss, accuracy], feed_dict={X: reviews, y: labels})
    # l and acc are the metrics of the last mini-batch of this epoch only
    if epoch % 10 == 0:
        print("Epoch", epoch, "Loss", l, "Acc", acc)
    # Re-order the training examples so the next epoch forms different batches
    random.shuffle(train)

# Evaluate on the whole test split in a single run
test_reviews = [sample['x'] for sample in test]
test_labels = np.array([sample['y'] for sample in test]).reshape([-1, 1])
print("Test Review Data Shape: ",np.array(test_reviews).shape)
acc = sess.run(accuracy, feed_dict={X: test_reviews, y: test_labels})
print("Final test accuracy:", acc)

Epoch 0 Loss 0.69034976 Acc 0.62
Epoch 10 Loss 0.6914494 Acc 0.52
Epoch 20 Loss 0.6822378 Acc 0.56
Epoch 30 Loss 0.6873419 Acc 0.51
Epoch 40 Loss 0.6727189 Acc 0.55
Test Review Data Shape:  (411, 150875)
Final test accuracy: 0.54257905


In [19]:
# Close the TensorFlow session that was opened for the one-hot MLP.
sess.close()

### Problem 2.4:  Encode each vocabulary word as a one-hot vector. Train RNN on the one-hot encodings.

#### Create train/test datasets for RNN

In [20]:
### Choose Dataset
# Point the path at "Data/movie-pang02.txt" to use the alternative dataset.
with open("Data/movie-simple.txt", "r", encoding=enc) as f:
    dataset = [convert_line_to_example_onehot(review_line) for review_line in f]

print("Length of Dataset: ",len(dataset))

# Randomize the example order in place before splitting
random.shuffle(dataset)

# One review per batch; the first three quarters of the batches are used for
# training and the remainder for testing.
batch_size = 1
total_batches = len(dataset) // batch_size
train_batches = (3 * total_batches) // 4
split_at = train_batches * batch_size
train = dataset[:split_at]
test = dataset[split_at:]

print("# train: ", len(train))
print("# test: ", len(test))

Length of Dataset:  1411
# train:  1058
# test:  353


#### Build RNN Model

In [21]:
# Start from an empty default graph
tf.reset_default_graph()

# Configs
n_steps = None       # time dimension left unspecified in the input placeholder
n_inputs = 150875    # size of each per-word one-hot input vector
n_neurons = 50       # number of units in the recurrent cell
num_epochs = 1

# Inputs: a batch of one-hot sequences and one 0/1 label per sequence
X = tf.placeholder(dtype=tf.float32, shape=[None, n_steps, n_inputs])
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Recurrent layer with tanh activation, unrolled dynamically over the sequence
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.tanh)
outputs, states = tf.nn.dynamic_rnn(cell=basic_cell, inputs=X, dtype=tf.float32)

# Classify from the output at the final time step via one linear unit
last_cell_output = outputs[:, -1, :]
y_ = tf.layers.dense(inputs=last_cell_output, units=1)

# Loss: mean sigmoid cross-entropy over the batch
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_, labels=y)
loss = tf.reduce_mean(per_example_loss)

# Accuracy: fraction of examples whose rounded sigmoid output equals the label
predicted_labels = tf.round(tf.sigmoid(y_))
is_correct = tf.equal(predicted_labels, y)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Adam update on the loss
train_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

#### Train RNN on one-hot encoded movie data

In [22]:
initialize_all = tf.global_variables_initializer()
sess = tf.Session()
sess.run(initialize_all)

# Exponential moving averages (decay 0.99) of the per-batch loss and accuracy;
# the starting values are just initial guesses.
l_ma = .74
acc_ma = .5

for epoch in range(num_epochs):
    for batch in range(train_batches):
        batch_start = batch * batch_size
        data = train[batch_start:batch_start + batch_size]
        # One-hot vectors of the review as a single (1, n_words, 150875) sequence
        reviews = np.array([sample['w'] for sample in data]).reshape([1, -1, 150875])
        labels = np.array([sample['y'] for sample in data]).reshape([1, 1])
        _, l, acc = sess.run([train_step, loss, accuracy], feed_dict={X: reviews, y: labels})
        l_ma = .99*l_ma + .01*l
        acc_ma = .99*acc_ma + .01*acc
        # Progress report every 100 batches
        if (batch + 1) % 100 == 0:
            print("batch", batch, "Loss", l_ma, "Acc", acc_ma)
    # Report after every epoch
    print("Epoch", epoch, "Loss", l_ma, "Acc", acc_ma)
    # Re-order the training examples for the next epoch
    random.shuffle(train)

batch 99 Loss 0.7078109203179662 Acc 0.5312151174752224
batch 199 Loss 0.6752613271450638 Acc 0.5977265225273367
batch 299 Loss 0.6445762745594524 Acc 0.6285593617972456
batch 399 Loss 0.5944456793475554 Acc 0.6732697116427385
batch 499 Loss 0.5965433553075045 Acc 0.6741323217162326
batch 599 Loss 0.5690051064577933 Acc 0.6893543298962496
batch 699 Loss 0.5372331624982251 Acc 0.751097568804388
batch 799 Loss 0.5039331582726344 Acc 0.8341286004031883
batch 899 Loss 0.34372292073487765 Acc 0.8899887649158932
batch 999 Loss 0.3288086386782088 Acc 0.8999821138804149
Epoch 0 Loss 0.34232794468014033 Acc 0.8771517269459184


#### Test trained RNN on one-hot encoded test data

In [23]:
# Evaluate on the test split one review at a time and average the
# per-review accuracies.
test_acc = 0
for sample in test:
    test_reviews = np.array([sample['w']]).reshape([1, -1, 150875])
    test_labels = np.array([sample['y']]).reshape([1, 1])
    test_acc += sess.run(accuracy, feed_dict={X: test_reviews, y: test_labels})
n = len(test)
acc = test_acc / n
print("Final test accuracy:", acc)

Final test accuracy: 0.8781869688385269


In [24]:
# Close the TensorFlow session that was opened for the one-hot RNN.
sess.close()

### Problem 2.5: Why did the word embeddings work better (hint: the word embeddings will work better…)

### Problem 2.6: How does cross-validation change when considering a time-series instead of multiple instances (as in our movie reviews)? Only a description is needed.

### Problem 2.7: In our previous homework assignment we considered the conditional GAN. In that case, the conditional label was known and given. Instead, consider generating images to match text. One approach could be to use an RNN to encode text to a vector that is fed to a conditional GAN (e.g. http://proceedings.mlr.press/v48/reed16.pdf). Draw a graph (but do not implement) how such a system could work. Any implementation here is completely optional, we are only looking for a description of how this could work.