In [2]:
import nltk  #for nlp
from nltk.tokenize import word_tokenize # tokenizer : convert sentence to bag of words
from nltk.stem import WordNetLemmatizer # stemmer : consider similar looking words as same

import numpy as np
import random
import pickle
from collections import Counter

In [3]:
lemmatizer = WordNetLemmatizer()

In [4]:
def create_lexicon(pos,neg):
    lexicon = []
    for file in [pos,neg]:
        with open(file,'r') as fp:
            contents = fp.readlines()
            for line in contents:
                all_words = word_tokenize(line)
                lexicon += list(all_words)
    lexicon = [ lemmatizer.lemmatize(i.lower()) for i in lexicon ]
    w_counts = Counter(lexicon)
    
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2

def sample_handling(sample,lexicon,classification):
    featureset = []
    with open(sample,'r') as fp:
        contents = fp.readlines()
        for k in contents:
            current_words = word_tokenize(k.lower())
            current_words = [ lemmatizer.lemmatize(i) for i in current_words ]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value =  lexicon.index(word.lower())
                    features[index_value] = 1
            features = list(features)
            featureset.append([features,classification])
    return featureset

def create_feature_sets_and_labels(pos,neg,test_size=0.1):
    lexicon = create_lexicon(pos,neg)
    features = []
    features += sample_handling(pos,lexicon,[1,0])
    features += sample_handling(neg,lexicon,[0,1])
    random.shuffle(features)
    
    features = np.array(features)
    testing_size = int(test_size*len(features))
    
    train_x = list(features[:,0][:-testing_size]) #all rows and their 0th column
    train_y = list(features[:,1][:-testing_size]) #all rows and their 1st column
    
    test_x = list(features[:,0][-testing_size:]) #all rows and their 0th column
    test_y = list(features[:,1][-testing_size:])
    
    return (train_x,train_y,test_x,test_y)
    

In [5]:
train_x,train_y,test_x,test_y  = create_feature_sets_and_labels('Datasets/pos.txt','Datasets/neg.txt')
with open("sentiment_analysis_data.pickle",'wb') as fp:
    pickle.dump([train_x,train_y,test_x,test_y],fp)

In [12]:
import tensorflow as tf
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500 
n_classes = 2
batch_size = 100
x = tf.placeholder('float',[None,len(train_x[0])]) #if shape is not defined, we can feed in any shape
y = tf.placeholder('float',[None,2]) 

In [13]:
def neural_network_model(data):
    hidden_layer_1 = {'weights':tf.Variable(tf.random_normal([len(train_x[0]),n_nodes_hl1])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}
    hidden_layer_2 = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1,n_nodes_hl2])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}
    hidden_layer_3 = {'weights':tf.Variable(tf.random_normal([n_nodes_hl2,n_nodes_hl3])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl3]))}
    
    output_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl3,n_classes])),
                    'biases':tf.Variable(tf.random_normal([n_classes]))}
    
    l1 = tf.add(tf.matmul(data,hidden_layer_1['weights']),hidden_layer_1['biases'])
    l1 = tf.nn.relu(l1)
    l2 = tf.add(tf.matmul(l1,hidden_layer_2['weights']),hidden_layer_2['biases'])
    l2 = tf.nn.relu(l2)
    l3 = tf.add(tf.matmul(l2,hidden_layer_3['weights']),hidden_layer_3['biases'])
    l3 = tf.nn.relu(l3)
    
    output  = tf.add(tf.matmul(l3,output_layer['weights']),output_layer['biases'])
    return output #tf.nn.softmax(output)

In [14]:
def train_neural_network(x):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    hm_epochs = 10
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer()) 
        for epoch in range(hm_epochs):
            epoch_loss = 0
            i=0
            while i < len(train_x):
                start = i
                end = i + batch_size
                batch_x  = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                _, c = sess.run([optimizer,cost],feed_dict={x:batch_x,y:batch_y})
                epoch_loss+=c
                i+=batch_size
            print('Epoch {}/{} completed, loss : {}'.format(epoch+1,hm_epochs,epoch_loss))
            
        correct = tf.equal(tf.argmax(prediction,1),tf.argmax(y,1))
        accuracy = tf.reduce_mean(tf.cast(correct,'float'))
        print('Accuracy : {}'.format(accuracy.eval({x:test_x, y:test_y})))

In [15]:
train_neural_network(x)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

Epoch 1/10 completed, loss : 232740.87231445312
Epoch 2/10 completed, loss : 107774.2587890625
Epoch 3/10 completed, loss : 67445.84518432617
Epoch 4/10 completed, loss : 42985.80920410156
Epoch 5/10 completed, loss : 27652.678970336914
Epoch 6/10 completed, loss : 18309.538276672363
Epoch 7/10 completed, loss : 15639.921699523926
Epoch 8/10 completed, loss : 14780.87549495697
Epoch 9/10 completed, loss : 12673.709559440613
Epoch 10/10 completed, loss : 8830.900351762772
Accuracy : 0.5675421953201294
