In [70]:
import tensorflow as tf

In [71]:
from gensim.models import Word2Vec
import numpy as np
# Imports
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
model = Word2Vec.load("../w2v_sentiment/models/300features_40minwords_10context").wv

In [73]:
train = pd.read_csv("../w2v_sentiment/data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("../w2v_sentiment/data/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("../w2v_sentiment/data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [74]:
def sentence_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.decode('utf-8').strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(sentence_to_wordlist(raw_sentence, remove_stopwords))
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [75]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [76]:
def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        if counter%1000 == 0:
            print ("Review %d of %d" % (counter, len(reviews)))
        reviewFeatureVecs[int(counter)] = makeFeatureVec(review, model, num_features)
        counter = counter + 1
    return reviewFeatureVecs

In [25]:
num_features = 300
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(sentence_to_wordlist(review, remove_stopwords=True))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [27]:
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print ("Creating average feature vecs for test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(sentence_to_wordlist(review, remove_stopwords=True))

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Creating average feature vecs for test reviews




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [77]:
from sklearn.preprocessing import Imputer
testDataVecs = Imputer().fit_transform(testDataVecs)

In [118]:
# Parameters
learning_rate = 0.001
training_epochs = 30
batch_size = 100
display_step = 1

# Network Parameters
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_input = 300
n_classes = 2

# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

In [119]:
# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Hidden layer with RELU activation
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer

In [120]:
# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

In [121]:
train_sentiments_map = []
for val in train["sentiment"]:
    if val == 0:
        train_sentiments_map.append([1,0])
    if val == 1:
        train_sentiments_map.append([0,1])

In [122]:
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(len(trainDataVecs)/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = trainDataVecs[(i)*100 : (i+1)*100], train_sentiments_map[(i)*100 : (i+1)*100]
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print ("Epoch:", '%04d' % (epoch+1), "cost=", \
                "{:.9f}".format(avg_cost))
    print ("Optimization Finished!")
    correct_prediction = sess.run(tf.argmax(pred, 1), feed_dict={x: testDataVecs})

Epoch: 0001 cost= 3.571472003
Epoch: 0002 cost= 1.667232335
Epoch: 0003 cost= 1.211381133
Epoch: 0004 cost= 0.995420994
Epoch: 0005 cost= 0.845631971
Epoch: 0006 cost= 0.730666003
Epoch: 0007 cost= 0.661279000
Epoch: 0008 cost= 0.607787565
Epoch: 0009 cost= 0.560145144
Epoch: 0010 cost= 0.518454472
Epoch: 0011 cost= 0.490293534
Epoch: 0012 cost= 0.464295433
Epoch: 0013 cost= 0.445859225
Epoch: 0014 cost= 0.432970222
Epoch: 0015 cost= 0.411908697
Epoch: 0016 cost= 0.396544895
Epoch: 0017 cost= 0.382996098
Epoch: 0018 cost= 0.371860764
Epoch: 0019 cost= 0.363027715
Epoch: 0020 cost= 0.354362277
Epoch: 0021 cost= 0.346157914
Epoch: 0022 cost= 0.340652742
Epoch: 0023 cost= 0.332018624
Epoch: 0024 cost= 0.325159088
Epoch: 0025 cost= 0.319370486
Epoch: 0026 cost= 0.312960944
Epoch: 0027 cost= 0.308667784
Epoch: 0028 cost= 0.305597676
Epoch: 0029 cost= 0.314861824
Epoch: 0030 cost= 0.325246774
Optimization Finished!


In [123]:
len(correct_prediction)

25000

In [124]:
output = pd.DataFrame(data={"id":test["id"], "sentiment":correct_prediction})
output.to_csv("Word2Vec_AverageVectors_2layered_neuralnet.csv", index=False, quoting=3)