In [0]:
import tensorflow as tf
import tensorflow.keras
import numpy as np
import os
import pandas as pd

In [0]:
# Read the complete contents of both input files into memory as strings.
with open('reviews.txt','r') as reviews_file:
  reviews=reviews_file.read()
with open('labels.txt','r') as labels_file:
  labels=labels_file.read()

In [0]:
# Show the first 2000 characters of the raw review text as a sanity check.
reviews[:2000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

# Data Preprocessing


In [0]:
from string import punctuation
# Delete every punctuation character in a single pass. str.translate with a
# deletion table yields exactly the same text as filtering character by
# character in a Python loop, without the per-character interpreter overhead.
all_text=reviews.translate(str.maketrans('','',punctuation))
# The raw text separates reviews with newlines; split into one string per review.
reviews = all_text.split('\n')

# Re-join with spaces so the whole corpus can be tokenised on whitespace at once.
all_text = ' '.join(reviews)
words = all_text.split()

In [0]:
# Show the first 2000 characters of the punctuation-free corpus.
all_text[:2000]

In [0]:
# Show the first 100 whitespace-separated tokens.
words[:100]

## Encoding the words

In [0]:
from collections import Counter
# Rank the vocabulary by descending frequency (ties keep first-seen order).
counts = Counter(words)
vocab = [word for word, _ in counts.most_common()]
# Word ids start at 1, so id 0 is never assigned to a real word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Translate every review into the list of ids of its tokens.
reviews_ints = [[vocab_to_int[word] for word in each.split()] for each in reviews]

## Encoding the labels

In [0]:
# One label per line: 'positive' becomes 1, every other value becomes 0.
labels = labels.split('\n')
labels = np.array([int(each == 'positive') for each in labels])

## Removing zero-length reviews and their labels

In [0]:
# Histogram mapping a review length (token count) to how many reviews have it.
review_lens = Counter(map(len, reviews_ints))
zero_length_count = review_lens[0]
# The Counter's keys are the lengths, so max() over it is the longest review.
longest_review = max(review_lens)
print("Zero-length reviews: {}".format(zero_length_count))
print("Maximum review length: {}".format(longest_review))

Zero-length reviews: 0
Maximum review length: 2514


In [0]:
# Positions of the reviews that contain at least one token.
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if review]
len(non_zero_idx)

25000

In [0]:
# Keep only the reviews, and the labels at the same positions, listed in non_zero_idx.
reviews_ints = list(map(reviews_ints.__getitem__, non_zero_idx))
labels = np.array(list(map(labels.__getitem__, non_zero_idx)))

## Padding reviews

In [0]:
seq_len = 200
# Fixed-width matrix of word ids, pre-filled with 0 (no real word has id 0).
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    # Long reviews keep only their first seq_len tokens.
    kept = row[:seq_len]
    # Short reviews are right-aligned, leaving zeros on the left. An empty
    # review must be skipped: `-0:` would select the whole row and the
    # assignment of an empty sequence would raise a broadcast error.
    if len(kept):
        features[i, -len(kept):] = kept

In [0]:
# Show the first 100 columns of the first 10 padded reviews.
features[:10,:100]

## Training, Validation, Test Split


In [0]:
import numpy as np

In [0]:
# Fraction of the examples that goes to the training set.
split_frac=0.8
split_idx=int(len(features)*split_frac)
# Use a dedicated, seeded generator so the split (and therefore every reported
# metric) is reproducible across kernel restarts.
split_rng=np.random.RandomState(42)
# Draw split_idx distinct row indices; replace=False guarantees no duplicates.
train_idx=split_rng.choice(len(features),split_idx,replace=False)

In [0]:
# Mark the rows drawn for training, then take every unmarked row (in ascending
# order) as the held-out pool. A boolean mask replaces the per-index
# `x not in train_idx` scan, which was quadratic in the dataset size, and it
# adapts to len(features) instead of assuming exactly 25000 rows.
is_train=np.zeros(len(features),dtype=bool)
is_train[train_idx]=True
val_idx=np.flatnonzero(~is_train)
# Training rows keep the order of train_idx.
x_train,x_val=features[train_idx],features[val_idx]
y_train,y_val=labels[train_idx],labels[val_idx]

In [0]:
# Number of held-out examples before they are divided into validation and test.
len(x_val)

5000

In [0]:
# Halve the held-out pool: the first half becomes the test set and the
# remainder stays as the validation set.
test_idx=len(x_val)//2
x_test,x_val=x_val[:test_idx],x_val[test_idx:]
y_test,y_val=y_val[:test_idx],y_val[test_idx:]

In [0]:
# Make sure every split is an ndarray before reporting its shape.
x_train,y_train=np.array(x_train),np.array(y_train)
x_val,y_val=np.array(x_val),np.array(y_val)
x_test,y_test=np.array(x_test),np.array(y_test)
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(x_train.shape),
      "\nValidation set: \t{}".format(x_val.shape),
      "\nTest set: \t\t{}".format(x_test.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


# Building Graph

## Hyperparameters

In [0]:
# Number of units in each LSTM cell.
lstm_size = 256
# Number of stacked LSTM layers.
lstm_layers = 1
# Examples per batch; also the batch dimension of the LSTM's zero state.
batch_size = 500
# Step size handed to the Adam optimizer.
learning_rate = 0.001

## Defining inputs, labels and placeholders

In [0]:
# Word ids start at 1, so the embedding table needs one extra row for id 0.
n_words = len(vocab_to_int) + 1
graph = tf.Graph()
with graph.as_default():
  # Two-dimensional integer placeholders whose sizes are left unspecified.
  inputs_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name='inputs')
  labels_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')
  # Dropout keep probability, fed at run time.
  keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

## Embedding

In [0]:
# Width of each word vector.
embed_size=300
with graph.as_default():
  # Trainable lookup table with one row per word id, initialised uniformly.
  # The bounds are passed as (minval, maxval) = (-1, 1); the previous call had
  # them swapped (+1, -1), contradicting the documented [minval, maxval) contract.
  embedding=tf.Variable(tf.random_uniform((n_words,embed_size),-1,1))
  # Replace every word id in inputs_ with its embedding row.
  embed=tf.nn.embedding_lookup(embedding,inputs_)

## LSTM cell

In [0]:
with graph.as_default():
  def _build_cell():
    """Return a fresh LSTM cell wrapped with output dropout."""
    lstm=tf.nn.rnn_cell.LSTMCell(lstm_size)
    return tf.nn.rnn_cell.DropoutWrapper(lstm,output_keep_prob=keep_prob)

  # Build a separate cell object for every layer. `[dropout]*lstm_layers`
  # repeated one and the same object, so any lstm_layers > 1 would have tried
  # to reuse a single cell across layers instead of stacking distinct ones.
  lstm_layer=tf.nn.rnn_cell.MultiRNNCell([_build_cell() for _ in range(lstm_layers)])
  # All-zero starting state for a batch of batch_size sequences.
  initial_state=lstm_layer.zero_state(batch_size,tf.float32)

## RNN forward pass

In [0]:
with graph.as_default():
  # Unroll the stacked cell over the embedded inputs, starting from
  # initial_state; keep both the per-step outputs and the final state.
  outputs, final_state = tf.nn.dynamic_rnn(
      cell=lstm_layer,
      inputs=embed,
      initial_state=initial_state)

## Output

In [0]:
with graph.as_default():
  # Only the output at the last time step feeds the classifier.
  last_output = outputs[:, -1]
  # A single sigmoid unit gives one score in (0, 1) per example.
  predictions = tf.contrib.layers.fully_connected(
      inputs=last_output,
      num_outputs=1,
      activation_fn=tf.sigmoid)
  # Mean squared error between the 0/1 labels and the sigmoid scores.
  cost = tf.losses.mean_squared_error(labels=labels_, predictions=predictions)
  optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

W0706 12:44:13.757879 140686651438976 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0706 12:44:14.058392 140686651438976 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/losses/losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


## Validation Accuracy

In [0]:
with graph.as_default():
  # Round each sigmoid score to 0 or 1 and compare it with the integer label.
  rounded_pred = tf.cast(tf.round(predictions), tf.int32)
  correct_pred = tf.equal(rounded_pred, labels_)
  # Accuracy is the fraction of matching predictions.
  accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

## Batching

In [0]:
def get_batches(x,y,batch_size=100):
  """Yield consecutive (x, y) slices of exactly batch_size items each.

  Trailing items that do not fill a complete batch are dropped, so every
  yielded pair has the same length.
  """
  full_batches=len(x)//batch_size
  usable=full_batches*batch_size
  x,y=x[:usable],y[:usable]
  for batch_no in range(full_batches):
    lo=batch_no*batch_size
    hi=lo+batch_size
    yield x[lo:hi],y[lo:hi]

# Training

In [0]:
epochs=10
# Saver.save() fails if the parent directory of the checkpoint path is missing,
# so make sure it exists before spending time on training.
os.makedirs("checkpoints",exist_ok=True)

with graph.as_default():
  saver=tf.train.Saver()

with tf.Session(graph=graph) as sess:
  sess.run(tf.global_variables_initializer())

  # Global step counter across all epochs; drives the logging cadence below.
  iteration=1
  for e in range(epochs):
    # Every epoch starts from the all-zero LSTM state.
    state=sess.run(initial_state)
    for i,(x,y) in enumerate(get_batches(x_train,y_train,batch_size),1):
      feed={inputs_:x,
           labels_:y[:,None],   # labels_ is 2-D, so add a trailing axis
           keep_prob:0.5,
           initial_state:state}
      # The final state of this batch is fed back in as the next batch's initial state.
      loss,state,_ = sess.run([cost,final_state,optimizer],feed_dict=feed)
      if iteration%5==0:
        # Report the epoch 1-based so it reads "1/10" ... "10/10" against the total.
        print("Epoch: {}/{}".format(e+1, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))
      if iteration%25==0:
        val_acc=[]
        # initial_state already is the zero state for batch_size; evaluating it
        # avoids adding a new set of zero_state ops to the graph on every
        # validation pass.
        val_state=sess.run(initial_state)
        for x,y in get_batches(x_val,y_val,batch_size):
          feed={inputs_:x,
               labels_:y[:,None],
               keep_prob:1,     # no dropout while evaluating
               initial_state:val_state}
          batch_acc,val_state=sess.run([accuracy,final_state],feed_dict=feed)
          val_acc.append(batch_acc)
        print("Val acc: {:.3f}".format(np.mean(val_acc)))
      iteration +=1
  saver.save(sess, "checkpoints/sentiment.ckpt")

# Testing

In [0]:
test_acc = []
with tf.Session(graph=graph) as sess:
    # Load the most recent checkpoint found in the 'checkpoints' directory.
    latest_ckpt = tf.train.latest_checkpoint('checkpoints')
    saver.restore(sess, latest_ckpt)
    # Start from an all-zero LSTM state and carry it from batch to batch.
    test_state = sess.run(lstm_layer.zero_state(batch_size, tf.float32))
    for x, y in get_batches(x_test, y_test, batch_size):
        # keep_prob of 1 disables dropout during evaluation.
        feed = {inputs_: x, labels_: y[:, None], keep_prob: 1, initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

Test accuracy: 0.838
