# Convolutional Neural Networks for Sentence Classification
### Yoon Kim
http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import string
import nltk
import itertools
from collections import Counter
from math import ceil

> Let $x_i ∈ R^k$ be the $k-dimensional$ word vector corresponding to the $i^{th} word$ in the sentence. A sentence of length $n$ (padded where necessary) is represented as $$x_{1:n} = x_1 ⊕ x_2 ⊕ . . . ⊕ x_n$$ where $⊕$ is the concatenation operator.



In [3]:
sentences = open("data/rt-polarity.pos", 'r').readlines() + open("data/rt-polarity.neg", 'r').readlines()

In [4]:
def get_chars(sentences, func=lambda x: x):
    letters = Counter()
    for sentence in sentences:
        for ch in func(sentence):
            if not ch.isspace():
                letters[ch] += 1
    return sorted(letters.keys())

In [5]:
print(get_chars(sentences))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '[', ']', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x85', '\x91', '\x96', '\x97', '\xbd', '\xc9', '\xda', '\xe0', '\xe1', '\xe2', '\xe3', '\xe6', '\xe7', '\xe8', '\xe9', '\xea', '\xed', '\xef', '\xf1', '\xf3', '\xf4', '\xf5', '\xf6', '\xfa', '\xfb', '\xfc']


Each word should have alphanumeric and punctuation chars. Figure 1 tells us that we should treat contraction words separately

In [6]:
def remove_unwanted_chars(sentence):
    sentence = re.sub("[^A-Za-z0-9,!?\'()]", " ", sentence)
    sentence = ' '.join(nltk.word_tokenize(sentence))
    return sentence

In [7]:
print(get_chars(sentences, func=remove_unwanted_chars))

['!', "'", '(', ')', ',', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


get the max length of all the sentences and add padding for sentences with length less than max length

In [8]:
positive_sentences = map(lambda x: x.strip(), open("data/rt-polarity.pos", "rb").readlines())
negative_sentences = map(lambda x: x.strip(), open("data/rt-polarity.neg", "rb").readlines())
positive_labels = [[0, 1]] * len(positive_sentences)
negative_labels = [[1, 0]] * len(negative_sentences)

In [9]:
X = map(lambda x: remove_unwanted_chars(x).split(), positive_sentences + negative_sentences)
y = np.array(positive_labels + negative_labels)

In [10]:
max_length = max(len(x) for x in X)
X_padded = []
for x in X:
    length = len(x)
    padding = max_length - length
    x = x + ["<NULL>"] * padding
    X_padded.append(x)
    
X = X_padded

> We use the publicly available word2vec vectors that were trained on 100 billion words from Google
News. The vectors have dimensionality of 300 and were trained using the continuous bag-of-words architecture (Mikolov et al., 2013). Words not present in the set of pre-trained words are initialized randomly.

> **CNN-rand**: Our baseline model where all words are randomly initialized and then modified during training.

> **CNN-staitc**: A model with pre-trained vectors from word2vec. All words—including the unknown ones that are randomly initialized—are kept static and only the other parameters of the model are learned.

> When randomly initializing words not in word2vec, we obtained slight improvements by sampling each dimension from
$U [−a, a]$ where $a$ was chosen such that the randomly initialized vectors have the same variance as the pre-trained ones.

Things to do:
1. Create a vocabulary from all the sentences
2. Create word vectors using word2vec (use pre-trained googlenews-vectors-negative300.bin)
3. Find variance of word in vocab present in word2vec and create randomly initialized vectors for words not present in word2vec.

In [11]:
vocabulary = Counter(itertools.chain(*X)).keys()
vocabulary_map = {word: i for i, word in enumerate(vocabulary)}

In [12]:
def get_word_vectors(vocab):
    word_vecs = {}
    from gensim.models import Word2Vec
    model = Word2Vec.load_word2vec_format("data/GoogleNews-vectors-negative300.bin")
    for word in vocab:
        if word in model:
            word_vecs[word] = model[word]
    return word_vecs

In [13]:
wordvectors = get_word_vectors(vocabulary_map)

In [14]:
std = np.std(np.array(wordvectors.values()))
print("N: " + str(len(X)))
print("V: " + str(len(vocabulary)))
print("V_pre: " + str(len(wordvectors)))
print("std: " + str(std))

N: 10662
V: 18588
V_pre: 16461
std: 0.177474


In [15]:
def build_matrix(sentences, vocab, wordvector, std):
    dataset_size = len(sentences)
    sentence_length = len(sentences[0])
    k = 300
    matrix = np.zeros(shape=(dataset_size, sentence_length, k), dtype=np.float32)
    for i, sentence in enumerate(sentences):
        for j, word in enumerate(sentence):
            if word == '<NULL>':
                matrix[i][j] = np.zeros(k, dtype=np.float32)
            if word in wordvector:
                matrix[i][j] = wordvector[word]
            else:
                matrix[i][j] = np.random.uniform(std, -std, k)
    return matrix

### Hyperparameters

> For all datasets we use: rectified linear units, filter
windows $(h)$ of $3, 4, 5$ with $100$ feature maps each,
dropout rate $(p)$ of $0.5$, $l_2$ constraint $(s)$ of $3$, and
mini-batch size of $50$. 



> Training is done through stochastic gradient descent over shuffled mini-batches with the
Adadelta update rule (Zeiler, 2012).


In [16]:
relu = tf.nn.relu
h = [3, 4, 5]
p = 0.5
s = 3
batch_size = 50
Adadelta = tf.train.AdadeltaOptimizer
feature_map = 100

## Model: CNN-static

In [17]:
n = len(X[0])
k = 300
n_classes = 2

In [18]:
input_x = tf.placeholder(tf.float32, shape=(None, n, k))
input_y = tf.placeholder(tf.float32, shape=(None, n_classes))

In [19]:
expand_input_x = tf.expand_dims(input_x, -1)

Convolutional layer with filter window size 3, 4, 5 and then max-over-time pooling

In [20]:
def weight_variable(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    return tf.Variable(tf.constant(0.1, shape=shape))

In [21]:
layers = []
for filter_size in h:
    filter_shape = [filter_size, k, 1, feature_map]
    W = weight_variable(filter_shape)
    b = bias_variable([feature_map])
    conv = relu(tf.nn.bias_add(tf.nn.conv2d(expand_input_x, W, strides=[1, 1, 1, 1], padding="VALID"),
                          b))
    max_pool = tf.nn.max_pool(conv, ksize=[1, n-filter_size+1, 1, 1], strides=[1, 1, 1, 1], padding="VALID")
    layers.append(max_pool)

In [22]:
total_feature_maps = len(h) * feature_map
conv_concat = tf.reshape(tf.concat(3, layers), (-1, total_feature_maps))

In [23]:
dropout_layer = tf.nn.dropout(conv_concat, p)

In [24]:
W = weight_variable((total_feature_maps, n_classes))
b = bias_variable([n_classes])
fc_layer = tf.nn.xw_plus_b(dropout_layer, W, b)

In [25]:
predictions = tf.argmax(fc_layer, 1)

In [31]:
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(fc_layer, input_y))
train_step = Adadelta(learning_rate=1e-3, rho=0.95, epsilon=1e-6).minimize(cross_entropy)
correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

> *Test:* Test set size (CV means there was no standard train/test split and thus 10-fold CV was used).

In [28]:
def make_train_test_split(dataset, i, k_fold):
    train_x, train_y, test_x, test_y = [], [], [], []
    fold_size = len(dataset) / k_fold
    for j in range(k_fold):
        if j == i:
            test_x.extend(build_matrix(dataset[j*fold_size:(j+1)*fold_size], vocabulary_map, wordvectors, std))
            test_y.extend(y[j*fold_size:(j+1)*fold_size])
        else:
            train_x.extend(build_matrix(dataset[j*fold_size:(j+1)*fold_size], vocabulary_map, wordvectors, std))
            train_y.extend(y[j*fold_size:(j+1)*fold_size])
    train_x.extend(build_matrix(dataset[(j+1)*fold_size:], vocabulary_map, wordvectors, std))
    train_y.extend(y[(j+1)*fold_size:])
    return {
        "train": {
            "data": np.array(train_x),
            "label": np.array(train_y)
        },
        "test": {
            "data": np.array(test_x),
            "label": np.array(test_y)
        }
    }

In [29]:
def get_batch(dataset, batch_size):
    iterations = int(ceil(len(dataset["data"])/batch_size))
    for i in range(iterations):
        batch = {
            "data": dataset["data"][i*batch_size:(i+1)*batch_size],
            "label": dataset["label"][i*batch_size:(i+1)*batch_size]
        }
        yield i, batch

In [None]:
results = []
k_fold = 10
for i in range(k_fold):
    train_loss, train_acc, test_acc = [], [], []
    n_epochs = 20
    temp = make_train_test_split(X, i, k_fold)
    print("Dataset loaded!")
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    for epoch in range(1, n_epochs+1):
        print("Training epoch " + str(epoch))
        train_size = ceil(len(temp["train"]["data"])/batch_size)
        total_loss, total_acc = 0, 0
        for n, sample in get_batch(temp["train"], batch_size):
            train_step.run(feed_dict={input_x:sample["data"], input_y:sample["label"]})
            loss = cross_entropy.eval(feed_dict={input_x:sample["data"], input_y:sample["label"]})
            acc = accuracy.eval(feed_dict={input_x:sample["data"], input_y:sample["label"]})
            total_loss += loss
            total_acc += acc
            print("  Epoch: [%d][%d/%d] Loss %.2f Accuracy %.2f" % (epoch, n, train_size, loss, acc))
        print("  Epoch %d: Loss %.2f Accuracy %.2f" % (epoch, total_loss / train_size, total_acc / train_size ))
        train_loss.append(total_loss / train_size)
        train_acc.append(total_acc / train_size)
        total_loss, total_acc = 0, 0
        test_size = ceil(len(temp["test"]["data"])/batch_size)
        for n, sample in get_batch(temp["test"], batch_size):
            acc = accuracy.eval(feed_dict={input_x:sample["data"], input_y:sample["label"]})
            total_acc += acc
            print("  Epoch: [%d][%d/%d] Accuracy %.2f" % (epoch, n, test_size, acc))
        test_acc.append(total_acc / test_size)
        print("Finished Epoch %d Accuracy %.2f" % (epoch, total_acc / test_size))
    print("=============================")
    results.append(max(test_acc) / n_epochs)

Dataset loaded!
Training epoch 1
  Epoch: [1][0/191] Loss 0.43 Accuracy 0.72
  Epoch: [1][1/191] Loss 0.38 Accuracy 0.76
  Epoch: [1][2/191] Loss 0.40 Accuracy 0.76
  Epoch: [1][3/191] Loss 0.23 Accuracy 0.84
  Epoch: [1][4/191] Loss 0.48 Accuracy 0.70
  Epoch: [1][5/191] Loss 0.38 Accuracy 0.82
  Epoch: [1][6/191] Loss 0.44 Accuracy 0.74
  Epoch: [1][7/191] Loss 0.41 Accuracy 0.86
  Epoch: [1][8/191] Loss 0.52 Accuracy 0.78
  Epoch: [1][9/191] Loss 0.35 Accuracy 0.90
  Epoch: [1][10/191] Loss 0.41 Accuracy 0.66
  Epoch: [1][11/191] Loss 0.47 Accuracy 0.80
  Epoch: [1][12/191] Loss 0.60 Accuracy 0.86
  Epoch: [1][13/191] Loss 0.35 Accuracy 0.76
  Epoch: [1][14/191] Loss 0.32 Accuracy 0.88
  Epoch: [1][15/191] Loss 0.43 Accuracy 0.86
  Epoch: [1][16/191] Loss 0.49 Accuracy 0.76
  Epoch: [1][17/191] Loss 0.52 Accuracy 0.78
  Epoch: [1][18/191] Loss 0.46 Accuracy 0.80
  Epoch: [1][19/191] Loss 0.32 Accuracy 0.82
  Epoch: [1][20/191] Loss 0.48 Accuracy 0.82
  Epoch: [1][21/191] Loss 0.36 A