In [4]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sklearn
from sklearn.model_selection import train_test_split

In [5]:
# removing some words and adding some to increase accuracy
stopwords = stopwords.words('english')
newStopWords = ['', ' ', '  ', '   ', '    ', ' s']
stopwords.extend(newStopWords)
stopwords.remove('no')
stopwords.remove('not')
stopwords.remove('very')
stop_words = set(stopwords)
def clean_doc(doc, vocab=None):
    tokens = word_tokenize(doc)
    # keeping only alphabets    
    tokens = [re.sub('[^a-zA-Z]', ' ', word) for word in tokens] 
    # converting to lowercase
    tokens = [word.lower() for word in tokens]
    # removing stopwords
    tokens = [w for w in tokens if not w in stop_words]
    # removing single characters if any
    tokens = [word for word in tokens if len(word) > 1]
    if vocab:
        tokens = [w for w in tokens if w in vocab]
        tokens = ' '.join(tokens)        
    return tokens
def add_doc_to_vocab(text, vocab):
    tokens = clean_doc(text)
    vocab.update(tokens)
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
    
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [6]:
reviews=pd.read_csv("ReviewsDataset.csv")
reviews.head()

Unnamed: 0.1,Unnamed: 0,label,sentences
0,0,8,this movie came three now im twenty seven godd...
1,1,7,throw dice take turn jumanji made critics gurn...
2,2,8,you dont live live among remains dead people h...
3,3,5,good enough it doesnt come close disneys film ...
4,4,7,really solid entry series brosnan personally f...


In [7]:
X=reviews["sentences"]
y=reviews["label"]

In [8]:
y = np_utils.to_categorical(y)

In [10]:
# splitting into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)
# removing unnecessary data
del reviews, X, y
# creating a vocabulary of words
vocab = Counter()
len_train = len(X_train)
for i in range(len_train):
    text = X_train.iloc[i]
    add_doc_to_vocab(text , vocab)
print(len(vocab))
# print the 20 most common words
print(vocab.most_common(20))
# removing tokens which occur less than 3 times.
min_occurance = 2
tokens = [k for k,c in vocab.items() if (c >= min_occurance & len(k) > 1)]
# saving the vocabulary for futute use
save_list(tokens, 'vocab.txt')
# loading the saved vocabulary
vocab = load_doc('vocab.txt')
vocab = vocab.split()
vocab = set(vocab)
train_doc = []
for i in range(len_train):
    text = X_train.iloc[i]
    doc = clean_doc(text, vocab)
    train_doc.append(doc)
test_doc = []
len_test = len(X_test)
for i in range(len_test):
    text = X_test.iloc[i]
    doc = clean_doc(text, vocab)
    test_doc.append(doc)

22798
[('film', 1994), ('one', 1276), ('movie', 1266), ('like', 799), ('time', 689), ('story', 683), ('great', 593), ('good', 563), ('well', 546), ('even', 533), ('also', 513), ('first', 507), ('still', 496), ('would', 487), ('much', 482), ('really', 450), ('films', 444), ('cast', 391), ('best', 385), ('see', 362)]


In [11]:
# storing indexes where no tokens are present
index_train = []
for i in range(len(train_doc)):
    if len(train_doc[i]) == 0 :
        index_train.append(i)
    
index_test = []
for i in range(len(test_doc)):
    if len(test_doc[i]) == 0 :
        index_test.append(i)
# dropping the unnecessary data
train_doc = np.delete(train_doc, index_train, 0)
test_doc = np.delete(test_doc, index_test, 0)
y_train = np.delete(y_train, index_train, 0)
y_test = np.delete(y_test, index_test, 0)


In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_doc)
X_train = tokenizer.texts_to_matrix(train_doc, mode='binary')
X_test = tokenizer.texts_to_matrix(test_doc, mode='binary')
n_words = X_test.shape[1]

In [16]:
print(X_train.shape)

(835, 6106)


In [14]:
# LSTM Model
model = Sequential()
model.add(Bidirectional(LSTM(100, activation='relu'), input_shape=(None,n_words)))
model.add(Dropout(0.2))
model.add(Dense(units=50, input_dim=100, activation='relu'))
model.add(Dense(9, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fitting the LSTM model
model.fit(X_train.reshape((-1, 1, n_words)), y_train, epochs=20, batch_size=100)
# finding test loss and test accuracy
loss_rnn, acc_rnn = model.evaluate(X_test.reshape((-1, 1, n_words)), y_test, verbose=0)
# saving model weights
model.model.save('rnn.h5')
# loading saved weights
model_rnn = load_model('rnn.h5')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


