# Sentiment Analysis with an RNN

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
#https://www.kaggle.com/ngyptr/python-nltk-sentiment-analysis/data
data = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

In [3]:
data[:8]

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
5,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive
6,RT @warriorwoman91: I liked her and was happy ...,Negative
7,Going on #MSNBC Live with @ThomasARoberts arou...,Neutral


## Data preprocessing

In [5]:
from string import punctuation

def pd_series_to_text(series, split=True):
    review_list = []
    for line in series:
        
        line_text = [e.lower() for e in line.split() if len(e) >= 3]
        line_text = [word for word in line_text
            if 'http' not in word
            and not word.startswith('@')
            and not word.startswith('#')
            and word != 'RT']
        line_text = ''.join([c for c in str(line_text) if c not in punctuation])
        review_list.append(line_text)
    if not split:
        reviews = ' '.join([c for c in review_list if len(c) > 2])
    else:
        reviews = review_list
    return reviews

def pd_series_to_sentiment(series):
    senti_list = []
    for line in series:
        if line=='Positive':
            senti = 2
        #elif line=='Neutral':
        #    senti = 1
        else:
            senti = 0
        senti_list.append(senti)
    return senti_list

In [6]:
reviews = pd_series_to_text(data['text'],True)
words = pd_series_to_text(data['text'],False).split()
sentiment = pd_series_to_sentiment(data['sentiment'])

In [7]:
sentiment[:5]

[0, 2, 0, 2, 2]

In [8]:
reviews[:5]

['how did everyone feel about the climate change question last night exactly',
 'didnt catch the full last night here are some scotts best lines seconds',
 'mention tamir rice and the was held cleveland wow',
 'that carly fiorina trending hours after her debate above any the men justcompleted says shes',
 'delivered the highest ratings the history presidential debates']

In [9]:
print(len(reviews),len(sentiment))

13871 13871


## Encoding the words and removing empty lines

In [10]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [11]:
reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split()])

In [12]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 23


In [13]:
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

In [14]:
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
sentiment = np.asarray([sentiment[ii] for ii in non_zero_idx])

### create featurevectors

In [15]:
seq_len = 23
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

## Training, Validation, Test

In [16]:
split_frac = 0.8
split_idx = int(len(features) * split_frac)
train_x = features[:split_idx]
train_y = sentiment[:split_idx]
val_x = features[split_idx:]
val_y = sentiment[split_idx:]

In [17]:
test_idx = int(len(val_x) * 0.5)
test_x =  val_x[test_idx:]
val_x = val_x[:test_idx]
test_y =  val_y[test_idx:]
val_y = val_y[:test_idx]

In [18]:
print("\t\t\tFeature Shapes:")
print("Features: \t\t{}".format(features.shape), 
      "\nTrain set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Features: 		(13871, 23) 
Train set: 		(11096, 23) 
Validation set: 	(1387, 23) 
Test set: 		(1388, 23)


## Build the graph

* lstm_size: Number of units in the hidden layers in the LSTM cells. Usually larger is better performance wise. Common values are 128, 256, 512, etc.
* lstm_layers: Number of LSTM layers in the network. I'd start with 1, then add more if I'm underfitting.
* batch_size: The number of reviews to feed the network in one training pass. Typically this should be set as high as you can go without running out of memory.
* learning_rate: Learning rate


In [19]:
lstm_size = 256
lstm_layers = 1
batch_size = 500
learning_rate = 0.005

In [20]:
n_words = len(vocab_to_int) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

### Embeddings

In [21]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

### LSTM

In [22]:
with graph.as_default():
    # Your basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

### RNN

In [23]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)

### Output

In [24]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [25]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [26]:
# batches
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

# Training

In [27]:
epochs = 20

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/20 Iteration: 5 Train loss: 0.570
Epoch: 0/20 Iteration: 10 Train loss: 0.498
Epoch: 0/20 Iteration: 15 Train loss: 0.536
Epoch: 0/20 Iteration: 20 Train loss: 0.445
Epoch: 1/20 Iteration: 25 Train loss: 0.404
Val acc: 0.719
Epoch: 1/20 Iteration: 30 Train loss: 0.431
Epoch: 1/20 Iteration: 35 Train loss: 0.445
Epoch: 1/20 Iteration: 40 Train loss: 0.256
Epoch: 2/20 Iteration: 45 Train loss: 0.424
Epoch: 2/20 Iteration: 50 Train loss: 0.316
Val acc: 0.738
Epoch: 2/20 Iteration: 55 Train loss: 0.306
Epoch: 2/20 Iteration: 60 Train loss: 0.316
Epoch: 2/20 Iteration: 65 Train loss: 0.251
Epoch: 3/20 Iteration: 70 Train loss: 0.275
Epoch: 3/20 Iteration: 75 Train loss: 0.293
Val acc: 0.715
Epoch: 3/20 Iteration: 80 Train loss: 0.242
Epoch: 3/20 Iteration: 85 Train loss: 0.321
Epoch: 4/20 Iteration: 90 Train loss: 0.274
Epoch: 4/20 Iteration: 95 Train loss: 0.221
Epoch: 4/20 Iteration: 100 Train loss: 0.251
Val acc: 0.692
Epoch: 4/20 Iteration: 105 Train loss: 0.398
Epoch: 4/20 Ite

In [28]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints\sentiment.ckpt
Test accuracy: 0.746
