# Sentiment Analysis with Vanilla RNN

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)




In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

**IMDB Dataset**

In [3]:
df = pd.read_csv("./imdb.csv")

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df = df.sample(10000,random_state = 1)

In [6]:
df.sentiment.value_counts()

sentiment
negative    5044
positive    4956
Name: count, dtype: int64

**Preprocessing and Text to Sequence**

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
num_words = 10000
tokenizer = Tokenizer(num_words=10000,oov_token="<OOV>")

In [10]:
X = df.review.values
y = df.sentiment.values

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=1)

In [13]:
tokenizer.fit_on_texts(X_train)

In [14]:
train_sequences= tokenizer.texts_to_sequences(X_train)
test_sequences= tokenizer.texts_to_sequences(X_test)

In [15]:
vocabulary_size = len(tokenizer.word_index)+1

In [16]:
maxlen = len(max(train_sequences,key=len))
maxlen

1853

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_train_sequences = pad_sequences(train_sequences,maxlen=maxlen)
padded_test_sequences = pad_sequences(test_sequences,maxlen=maxlen)

print(f"Train sequences shape: {padded_train_sequences.shape}")
print(f"Test sequences shape: {padded_test_sequences.shape}")

Train sequences shape: (8000, 1853)
Test sequences shape: (2000, 1853)


**Simple Vanilla RNN Model**

In [24]:
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, InputLayer
from keras.backend import clear_session

clear_session()
model_RNN = Sequential()
model_RNN.add(InputLayer(shape=(maxlen,)))
#maxlen = 1853
model_RNN.add(Embedding(input_dim=vocabulary_size, output_dim=128))
model_RNN.add(SimpleRNN(64, activation='tanh'))
model_RNN.add(Dense(1,activation='sigmoid'))
model_RNN.summary()

In [None]:
model_RNN.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = 'accuracy')

In [None]:
model_RNN.fit(padded_train_sequences, y_train,validation_split = .2, epochs=5, batch_size=256)

In [None]:
model_RNN.evaluate(padded_test_sequences, y_test)