## Sentiment analysis with RNN

In [1]:
'''
load modules
'''
import numpy as np
import tensorflow as tf

In [2]:
'''
load dataset
'''
# Load the raw corpus: each file is read in one go into a single string.
with open('reviews.txt') as used_file:
    # read() returns the entire file as one str (not a list of lines), so
    # slicing `reviews` indexes characters and can cut a word in half.
    reviews = used_file.read()
    
# Same for the labels; the string is split into individual labels later on.
with open('labels.txt') as used_file:
    labels = used_file.read()

In [3]:
reviews[ : 105] #as you can see the words is incomplete since it read() not readlines()
#we will fix this later

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  suc'

In [4]:
labels[ : 5] #same as this one

'posit'

In [5]:
'''
- preprocess data
- remove punctuation
- split the cleaned text into a flat list of words (duplicates are kept; no set() is involved)
'''
from string import punctuation

# Delete every ASCII punctuation character in a single pass over the text;
# str.translate does this natively instead of testing characters one by one.
all_text = reviews.translate(str.maketrans('', '', punctuation))
# Split on newlines so `reviews` becomes a list with one string per line.
reviews = all_text.split('\n')

# Re-join the lines with spaces so the last word of one line and the first
# word of the next stay separate, then tokenise on whitespace.
all_text = ' '.join(reviews)
# Flat list of every word occurrence, duplicates included (this is not a set).
words = all_text.split()

In [6]:
#this is our list of words
words[ : 100]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such',
 'as',
 'teachers',
 'my',
 'years',
 'in',
 'the',
 'teaching',
 'profession',
 'lead',
 'me',
 'to',
 'believe',
 'that',
 'bromwell',
 'high',
 's',
 'satire',
 'is',
 'much',
 'closer',
 'to',
 'reality',
 'than',
 'is',
 'teachers',
 'the',
 'scramble',
 'to',
 'survive',
 'financially',
 'the',
 'insightful',
 'students',
 'who',
 'can',
 'see',
 'right',
 'through',
 'their',
 'pathetic',
 'teachers',
 'pomp',
 'the',
 'pettiness',
 'of',
 'the',
 'whole',
 'situation',
 'all',
 'remind',
 'me',
 'of',
 'the',
 'schools',
 'i',
 'knew',
 'and',
 'their',
 'students',
 'when',
 'i',
 'saw',
 'the',
 'episode',
 'in',
 'which',
 'a',
 'student',
 'repeatedly',
 'tried',
 'to',
 'burn',
 'down',
 'the',
 'school',
 'i',
 'immediately',
 'recalled',
 'at',
 'high']

In [7]:
'''
encoding the words
'''
from collections import Counter

# Word frequencies over the whole corpus.
counts = Counter(words)
# Vocabulary ordered from the most frequent word to the least frequent one.
vocab = [word for word, _ in counts.most_common()]

# Word -> id lookup. Ids start at 1, which leaves 0 unused by any word.
vocab2int = {word: idx for idx, word in enumerate(vocab, start = 1)}

# Reverse lookup (id -> word), derived from the forward mapping.
int2vocab = {idx: word for word, idx in vocab2int.items()}

# Encode every review as the list of ids of its words. The result is a ragged
# list of lists (one inner list per review, each as long as that review); it
# is turned into fixed-width rows later on.
reviews_int = [[vocab2int[word] for word in review.split()]
               for review in reviews]
    


In [8]:
print(vocab2int)



In [9]:
print(int2vocab)



In [10]:
print(vocab2int['great'])
print(int2vocab[85])
print(len(vocab2int))

85
great
74072


In [11]:
print(words[ : 10])
print(reviews_int[0][ : 10]) #first review, 10 words only

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']
[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1]


In [12]:
'''
encoding labels
'''
# One label per line: exactly 'positive' becomes 1, anything else (including
# an empty line) becomes 0.
labels = np.array([int(label == 'positive') for label in labels.split('\n')])


In [13]:
labels[100: 120] #looks, it is 1,0,1,0,1,0

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

In [14]:
labels[121: 140]

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

In [15]:
'''
- preprocessing
- we only use non zero length review
'''
# Positions of the encoded reviews that contain at least one word id.
non_zero_idx = [idx for idx, encoded in enumerate(reviews_int) if encoded]
len(non_zero_idx)

25000

In [16]:
'''
we only use reviews and labels from non zero index
'''
# Apply the same index selection to both containers so that review k and
# label k still belong together afterwards.
reviews_int, labels = (
    [reviews_int[idx] for idx in non_zero_idx],
    np.array([labels[idx] for idx in non_zero_idx]),
)

In [17]:
'''
- making batches
- we use seq_len = 200
- reviews shorter than 200 words are left-padded with 0's
- reviews longer than 200 words are truncated to their first 200 words
'''
seq_len = 200
# One row per review and seq_len columns, pre-filled with 0 (the padding value).
features = np.zeros((len(reviews_int), seq_len), dtype = int)
for i, row in enumerate(reviews_int):
    # Keep at most the first seq_len word ids of the review.
    trimmed = row[ : seq_len]
    if len(trimmed) == 0:
        # Nothing to copy: an empty review stays an all-zero row. Without this
        # guard the slice below would select the whole row and the assignment
        # would fail to broadcast.
        continue
    # Right-align the ids so that any padding zeros end up on the left and the
    # last kept word sits in the last column.
    features[i, seq_len - len(trimmed) : ] = trimmed

In [18]:
features[ : 2, : 200]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 21025,   308,     6,
            3,  1050,   207,     8,  2138,    32,     1,   171,    57,
           15,    49,    81,  5785,    44,   382,   110,   140,    15,
         5194,    60,   154,     9,     1,  4975,  5852,   475,    71,
            5,   260,    12, 21025,   308,    13,  1978,     6,    74,
         2395,     5,   613,    73,     6,  5194,     1, 24103,     5,
         1983, 10166,     1,  5786,  1499,    36,    51,    66,   204,
          145,    67,  1199,  5194, 19869,     1, 37442,     4,     1,
      

In [19]:
print(labels[ : 10])

[1 0 1 0 1 0 1 0 1 0]


In [20]:
'''
train, validation, testing dataset
'''
# The first 80% of the rows are used for training; the rest is held out.
split_frac = 0.8
split_idx = int(len(features) * split_frac)
train_x, holdout_x = features[ : split_idx], features[split_idx : ]
train_y, holdout_y = labels[ : split_idx], labels[split_idx : ]

# The held-out part is cut in half: first half validation, second half test.
test_idx = int(len(holdout_x) * 0.5)
val_x, test_x = holdout_x[ : test_idx], holdout_x[test_idx : ]
val_y, test_y = holdout_y[ : test_idx], holdout_y[test_idx : ]


In [21]:
'''
hyperparameters
'''
# Size handed to each BasicLSTMCell.
lstm_size = 256
# Number of LSTM cells stacked in the MultiRNNCell.
lstm_layers = 2
# Rows per batch; also fixes the batch dimension of the LSTM zero state.
batch_size = 500
# Learning rate passed to the Adam optimizer.
learning_rate = 0.001

In [22]:
'''
-build graph
- placeholders for inputs, targets, keep_prob
'''
# Word ids run from 1 to len(vocab2int); one extra slot covers id 0, which the
# feature matrix uses for padding.
n_words = len(vocab2int) + 1
graph = tf.Graph()

with graph.as_default():
    # Batch of word-id sequences; both dimensions are left unspecified.
    inputs_ = tf.placeholder(dtype = tf.int32, shape = [None, None], name = 'inputs')
    # Integer targets, also rank 2 (the training loop feeds them as a column).
    labels_ = tf.placeholder(dtype = tf.int32, shape = [None, None], name = 'labels')
    # Dropout keep probability, fed per run so evaluation can switch dropout off.
    keep_prob = tf.placeholder(dtype = tf.float32, name = 'keep_prob')
    

In [23]:
'''
- embedding layer: the vocabulary has about 74,000 words, too many for one-hot inputs
- so each word id is mapped to a dense trainable vector (word2vec-style), learned together with the classifier
'''
embed_size = 150
with graph.as_default():
    # One embed_size-dimensional row per word id, initialised uniformly in [-1, 1).
    initial_embeddings = tf.random_uniform(shape = (n_words, embed_size), minval = -1, maxval = 1)
    embed_weights = tf.Variable(initial_embeddings)
    # Replace every word id in the input batch by its embedding row.
    embed_model = tf.nn.embedding_lookup(params = embed_weights, ids = inputs_)

In [24]:
'''
- lstm cell, we modified it to 2 layers
- we need a function to create LSTM cell for each layer
- first build a cell, then stack all cells
- simpler than in character-wise
'''
with graph.as_default():
    def get_a_cell(lstm_size, keep_prob):
        """Return one LSTM cell of size `lstm_size` wrapped in output dropout.

        `keep_prob` is the keep probability applied to the cell's outputs.
        """
        return tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.BasicLSTMCell(lstm_size),
            output_keep_prob = keep_prob)

    with tf.name_scope('lstm'):
        # A fresh cell object per layer, then stacked into one multi-layer cell.
        layers = [get_a_cell(lstm_size, keep_prob) for _ in range(lstm_layers)]
        cell = tf.nn.rnn_cell.MultiRNNCell(layers)

    # All-zero starting state for a batch of exactly batch_size sequences.
    initial_state = cell.zero_state(batch_size, tf.float32)

In [25]:
'''
- RNN forward pass
- used BEFORE the final layers for output, for sentiment classifier
'''
with graph.as_default():
    # Run the stacked LSTM over the embedded sequences, starting from the
    # supplied initial state; returns the per-step outputs and the final state.
    outputs, final_state = tf.nn.dynamic_rnn(
        cell = cell,
        inputs = embed_model,
        initial_state = initial_state)
    

In [26]:
'''
- RNN output
- We connect RNN forward pass output to fully connected NN
- as classifier function
'''
with graph.as_default():
    # Sentiment is a single value per review, so only the output at the last
    # time step of each sequence is passed on to the classifier.
    last_output = outputs[ :, -1]
    # One sigmoid unit: a value between 0 and 1 per review.
    predictions = tf.contrib.layers.fully_connected(last_output, num_outputs = 1,
                                                    activation_fn = tf.sigmoid)
    # Mean squared error between the targets and the sigmoid outputs.
    cost = tf.losses.mean_squared_error(labels = labels_, predictions = predictions)
    # Adam update step that minimises the cost.
    adam = tf.train.AdamOptimizer(learning_rate)
    optimizer = adam.minimize(cost)

In [27]:
'''
- RNN validation
'''
with graph.as_default():
    # Round each sigmoid output to 0 or 1 and compare it with the target.
    predicted_labels = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(predicted_labels, labels_)
    # Fraction of correct predictions in the batch.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    

In [28]:
'''
- get batching
- we already batched the data, this is for iteration
'''
def get_batch(x, y, batch_size = 100):
    """Yield consecutive (x_batch, y_batch) slices of exactly batch_size items.

    Trailing items that do not fill a whole batch are dropped, so every
    yielded pair has the same length. `x` and `y` are sliced with the same
    bounds, so item k of a batch of `x` lines up with item k of `y`.
    """
    # Largest multiple of batch_size that fits into x.
    usable = (len(x) // batch_size) * batch_size
    xs, ys = x[ : usable], y[ : usable]
    for start in range(0, len(xs), batch_size):
        stop = start + batch_size
        yield xs[start : stop], ys[start : stop]
              

In [31]:
'''
training
'''
epochs = 2
with graph.as_default():
    # Only needed if the checkpoint line further down is re-enabled.
    saver = tf.train.Saver()

# Train, validate every 4th iteration, and finally test, all in one session.
# The `with` block closes the session on exit, so no explicit close() is needed.
with tf.Session(graph = graph) as sess:
    tf.global_variables_initializer().run()
    iteration = 1

    for epoch in range(epochs):
        # Start every epoch from an all-zero LSTM state.
        state = sess.run(initial_state)

        for x, y in get_batch(train_x, train_y, batch_size):
            # labels_ is a rank-2 placeholder, so the 1-D label vector is fed
            # as a column: y[:, None].
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state : state}

            # One optimisation step; the returned state is fed into the next batch.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict = feed)

            if iteration % 2 == 0:
                print('epoch: {}/{} \t iteration: {} \t loss: {}'.
                      format(epoch, epochs, iteration, loss))

            if iteration % 4 == 0:
                val_acc = []
                # Evaluate the existing zero-state tensors instead of calling
                # cell.zero_state() again: that call creates new ops, so doing
                # it here would grow the graph on every validation pass.
                val_state = sess.run(initial_state)

                for x, y in get_batch(val_x, val_y, batch_size):
                    # keep_prob of 1 disables dropout during evaluation.
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state : val_state}

                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict = feed)
                    val_acc.append(batch_acc)

                print('val_acc: {}'.format(np.mean(val_acc)))

            iteration += 1

    #saver.save(sess, '/checkpoints/sentiment.ckpt')

    # Final evaluation on the held-out test split, again without dropout and
    # starting from the pre-built zero state.
    test_acc = []
    test_state = sess.run(initial_state)
    for x, y in get_batch(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state : test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict = feed)
        test_acc.append(batch_acc)
    print('test accuracy: {}'.format(np.mean(test_acc)))

epoch: 0/2 	 iteration: 2 	 loss: 0.24957358837127686
epoch: 0/2 	 iteration: 4 	 loss: 0.24817050993442535
val_acc: 0.5715999603271484
epoch: 0/2 	 iteration: 6 	 loss: 0.2388259321451187
epoch: 0/2 	 iteration: 8 	 loss: 0.24801388382911682
val_acc: 0.6067999601364136
epoch: 0/2 	 iteration: 10 	 loss: 0.24111437797546387
epoch: 0/2 	 iteration: 12 	 loss: 0.24691176414489746
val_acc: 0.6551999449729919
epoch: 0/2 	 iteration: 14 	 loss: 0.22365133464336395
epoch: 0/2 	 iteration: 16 	 loss: 0.22713665664196014
val_acc: 0.5956000089645386
epoch: 0/2 	 iteration: 18 	 loss: 0.23996587097644806
epoch: 0/2 	 iteration: 20 	 loss: 0.22761456668376923
val_acc: 0.6416000127792358
epoch: 0/2 	 iteration: 22 	 loss: 0.23932114243507385
epoch: 0/2 	 iteration: 24 	 loss: 0.22048872709274292
val_acc: 0.6959999799728394
epoch: 0/2 	 iteration: 26 	 loss: 0.2353629767894745
epoch: 0/2 	 iteration: 28 	 loss: 0.26422378420829773
val_acc: 0.6564000248908997
epoch: 0/2 	 iteration: 30 	 loss: 0.220