# Sentiment Analysis with RNN

In [1]:
# Import and data read

import numpy as np
import tensorflow as tf

with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()

In [2]:
reviews[:200]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  '

In [3]:
# Removing punctuation

from string import punctuation

all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n')

all_text = ' '.join(reviews)
words = all_text.split()

all_text[:200]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s '

In [4]:
# Word examples

words[:5]

['bromwell', 'high', 'is', 'a', 'cartoon']

In [5]:
# Word2int dictionary
vocab_to_int = {k:v for v, k in enumerate(set(words), start = 1)}

# Reviews to integers
reviews_ints = [[vocab_to_int[word] for word in review.split(' ') if word in vocab_to_int.keys()] for review in reviews]

# Labels to integers
labels = [1 if label == 'positive' else 0 for label in labels.split('\n')]

In [6]:
from collections import Counter

review_lens = Counter([len(x) for x in reviews_ints])
print('Zero-length reviews: {}'.format(review_lens[0]))
print('Maximum review length: {}'.format(max(review_lens)))
print('Total reviews {}'.format(len(reviews_ints)))

Zero-length reviews: 1
Maximum review length: 2514
Total reviews 25001


In [7]:
# Remove a review with the length of 0
zero_length_review = [i for i, j in enumerate(reviews_ints) if len(j) == 0]
for i in zero_length_review:
  reviews_ints.pop(i)
  labels.pop(i)

review_lens = Counter([len(x) for x in reviews_ints])
print('Zero-length reviews: {}'.format(review_lens[0]))
print('Maximum review length: {}'.format(max(review_lens)))
print('Total reviews {}'.format(len(reviews_ints)))

Zero-length reviews: 0
Maximum review length: 2514
Total reviews 25000


In [8]:
# Sequence padding (pre)

seq_len = 200

features = []
for review in reviews_ints:
  if len(review) > 200:
    features.append(review[:seq_len])
  else:
    while len(review) < 200:
      review.insert(0, 0)
    features.append(review)
    
print(features[0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61516, 32733, 63787, 18959, 32011, 22814, 58432, 44263, 27680, 62052, 53090, 20527, 22312, 62128, 10565, 69134, 15875, 32429, 52584, 11734, 22312, 22800, 23742, 22868, 32614, 62052, 42717, 60036, 22828, 23730, 52130, 49763, 55941, 61516, 32733, 43525, 6525, 63787, 4961, 51392, 52130, 13151, 35390, 63787, 22800, 62052, 4549, 52130, 27652, 65618, 62052, 31236, 25057, 73279, 57556, 36276, 46225, 46439, 29076, 60381, 22800, 22162, 62052, 53378, 5164, 62052, 10059, 67397, 31153, 4327, 23730, 5164, 62052, 46324, 8054, 39339, 14255, 29076, 25057, 18745, 8054, 25641, 62052, 27146, 32614, 6883, 18959, 37370, 73563, 44386, 52130, 11130, 63175, 62052, 32429, 8054, 31580, 62946, 27680, 32733, 18959, 40814, 27909, 47514, 8054, 47205, 49654, 52130, 11814, 4181, 5164, 29702, 22800, 37370, 1035, 52130, 61516, 32733, 8054, 5

In [9]:
import random

# Train-test split
random.shuffle(reviews_ints)
split_frac = 0.8


train_x, val_x = np.array(features[:int(len(reviews_ints)*split_frac)]), np.array(features[int(len(reviews_ints)*(1 - split_frac)):])
train_y, val_y = np.array(labels[:int(len(labels)*split_frac)]), np.array(labels[int(len(labels)*(1 - split_frac)):])

# val_x, test_x = np.array(val_x[:int(len(val_x)*0.5)]), np.array(val_x[int(len(val_x)*0.5):])
# val_y, test_y = np.array(val_y[:int(len(val_y)*0.5)]), np.array(val_y[int(len(val_x)*0.5):])

print('\t\t\tFeature Shapes:')
print(f'Train set: \t\t{train_x.shape}',
       f'\nValidation set: \t{val_x.shape}')

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(10000, 200) 
Test set: 		(10001, 200)
