In [112]:
# references: http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
# reference: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py
import numpy as np
import pickle
import os
from collections import defaultdict
import sys, re
import pandas as pd
import gensim 
from gensim.models import Word2Vec 
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings 
warnings.filterwarnings(action = 'ignore') 

In [100]:
uselessWords = {',','.','a','the','at','in','on'}
def build_data_cv(root, cv=10, clean_string=True):
    posScore = 1
    negScore = 0
    data = {}
    data[posScore] = []
    data[negScore] = []
    for folder in ['train']:
        for subf in ['pos','neg']:
            score = 1 if subf == 'pos' else 0
            path = os.path.join(root, folder, subf)
            files = os.listdir(path)
            print('loading %s' % path)
            for file in files:
                with open(os.path.join(path, file), "r",encoding = 'ISO-8859-1') as f:
                    content = f.read()
                    for j in word_tokenize(content): 
                        if (j.lower() not in uselessWords):
                            data[score].append(j.lower())
#     np.random.shuffle(self.data['train'])
#     self.data["train"] = pd.DataFrame(self.data["train"],
#                                       columns=['text', 'sentiment'])

    # tokenize the sentence into words 
    return data
    
def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size+1, k), dtype='float32')            
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map

def load_bin_vec(vocab):
    model = gensim.models.Word2Vec(vocab[1], min_count = 1,  size = 10000, window = 100) 
    return model

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.    
    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25,0.25,k)  

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip() if TREC else string.strip().lower()

def clean_str_sst(string):
    """
    Tokenization/string cleaning for the SST dataset
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)   
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()

if __name__=="__main__":      
    data_folder = "./../"  
    print ("loading data..."),        
    vocab = build_data_cv(data_folder, cv=10, clean_string=True)
    max_l = np.max(pd.DataFrame(revs)["num_words"])
    print ("data loaded!")
    print ("number of total sentences in word space: " + str(len(revs)))
    print ("vocab size: " + str(len(vocab)))
    print ("max sentence length: " + str(max_l))
    print ("loading word2vec vectors...")
    
#     print "word2vec loaded!"
#     print "num words already in word2vec: " + str(len(w2v))
#     add_unknown_words(w2v, vocab)
#     W, word_idx_map = get_W(w2v)
#     rand_vecs = {}
#     add_unknown_words(rand_vecs, vocab)
#     W2, _ = get_W(rand_vecs)
#     cPickle.dump([revs, W, W2, word_idx_map, vocab], open("mr.p", "wb"))
#     print "dataset created!"
    


loading data...
loading ./../train/pos
loading ./../train/neg
data loaded!
number of total sentences in word space: 2
vocab size: 2
max sentence length: 80
loading word2vec vectors...


In [101]:
model = load_bin_vec(vocab)

In [104]:
print (model1.similarity("love", "movies"))

0.07837918


In [98]:
print (vocab)



In [113]:
import tensorflow as tf
import numpy as np

class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.

    · sequence_length – The length of our sentences. Remember that we padded all our sentences to have the same length.
    · num_classes – Number of classes in the output layer, two in our case (positive and negative).
    · vocab_size – The size of our vocabulary. This is needed to define the size of our embedding layer, which will have shape [vocabulary_size, embedding_size].
    · embedding_size – The dimensionality of our embeddings.
    · filter_sizes – The number of words we want our convolutional filters to cover. 
    We will have num_filters for each size specified here. 
    For example, [3, 4, 5] means that we will have filters that slide over 3, 4 and 5 words respectively, 
    for a total of 3 * num_filters filters.
    · num_filters – The number of filters per filter size (see above).
    """
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters):
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        
        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        # tf.device("/cpu:0") forces an operation to be executed on the CPU. 
        # By default TensorFlow will try to put the operation on the GPU if one is available, 
        # but the embedding implementation doesn’t currently have GPU support and throws an error if placed on the GPU.
        
        # tf.name_scope creates a new Name Scope with the name “embedding”. 
        # The scope adds all operations into a top-level node called “embedding” so that 
        # you get a nice hierarchy when visualizing your network in TensorBoard.
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
            
        """
        Now we’re ready to build our convolutional layers followed by max-pooling. 
        Remember that we use filters of different sizes. 
        Because each convolution produces tensors of different shapes we need to iterate through them, 
        create a layer for each of them, and then merge the results into one big feature vector.
        """
        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        
        
         # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")