In [1]:
from keras.preprocessing import sequence
from keras import models, layers, optimizers, datasets, utils, losses
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
import plotly.figure_factory as ff
from keras import backend

In [3]:
vocabulary_size = 10000
maxlen = 40
batch_size = 25

(x1_train, y_train), (x1_test, y_test) = datasets.imdb.load_data(num_words=vocabulary_size)
x_train = sequence.pad_sequences(x1_train, maxlen)
x_test = sequence.pad_sequences(x1_test, maxlen)

# LSTM

In [4]:
inputs = layers.Input(shape=(maxlen,))
e=layers.Embedding(vocabulary_size, 128)(inputs)
h=layers.LSTM(128, dropout=0.8, recurrent_dropout=0.8)(e)
outputs=layers.Dense(1, activation='sigmoid')(h)
lst_model = models.Model(inputs, outputs)

lst_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

lst_model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=(x_test, y_test))

score, acc = lst_model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test accuracy:', acc)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: 0.8064000010490417


# GRU

In [22]:
inputs = layers.Input(shape=(maxlen,))
e=layers.Embedding(vocabulary_size, 128)(inputs)
h=layers.GRU(128, dropout=0.8, recurrent_dropout=0.8)(e)
outputs=layers.Dense(1, activation='sigmoid')(h)
model = models.Model(inputs, outputs)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test accuracy:', acc)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: 0.8141999840736389


# Dense Layer

In [30]:
from keras.models import Sequential
model=Sequential()

inputs = layers.Input(shape=(maxlen,))
model.add(inputs)
model.add(layers.Embedding(vocabulary_size, 128))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='tanh'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test accuracy:', acc)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 40, 128)           1280000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 5120)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 5121      
Total params: 1,285,121
Trainable params: 1,285,121
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: 0.7815600037574768


# Evaluation Using Naive Bayes approach

## Loading Downloaded Imdb dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
movie_df=pd.read_csv('/home/arjun/Downloads/IMDB Dataset.csv',delimiter=',')


## Vectorize words for feeding into model

In [3]:
tf_vector = TfidfVectorizer()
review_vector = tf_vector.fit_transform(movie_df['review'])
X_train,X_test,Y_train,Y_test = train_test_split(review_vector,movie_df['sentiment'],test_size=0.40)

## Accuracy for different types of Naive Bayes Model

In [134]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,ComplementNB
naive_bayes=MultinomialNB()
naive_bayes.fit(X_train, Y_train)
accuracy = metrics.accuracy_score(naive_bayes.predict(X_test), Y_test)
print(f'Accuracy for Multinominal Naive Bayes Approach = {accuracy}')

naive_bayes=BernoulliNB()
naive_bayes.fit(X_train, Y_train)
accuracy = metrics.accuracy_score(naive_bayes.predict(X_test), Y_test)
print(f'Accuracy for Bernoulli Naive Bayes Approach = {accuracy}')


naive_bayes=ComplementNB()
naive_bayes.fit(X_train, Y_train)
accuracy = metrics.accuracy_score(naive_bayes.predict(X_test), Y_test)
print(f'Accuracy for Complement Naive Bayes Approach = {accuracy}')

Accuracy for Multinominal Naive Bayes Approach = 0.8595
Accuracy for Bernoulli Naive Bayes Approach = 0.84845
Accuracy for Complement Naive Bayes Approach = 0.85945


### 1. Accuracies for GRU,LSTM,Dense remained same around 80 percent. However, Naive Bayes model improves the accuracy percent to 84 percent,by transforming texts to feature vectors. Different types of Naive bayes model is evaluated and produces 84 percent.

# Get Index values of Imdb dataset

In [46]:
data_id= datasets.imdb.get_word_index()
words={value:word for word,value in data_id.items()}

#  PairWise Distance in Embedding space

In [136]:
## Choosing LST model Embedding layer 
embedded_layer_output = backend.function([lst_model.layers[0].input],[lst_model.layers[1].output])
layer_output = embedded_layer_output([x_train])[0]

words_list= [words[value] for value in x_train[22]]

store_indexs=[2,11,12]


print(f'Words list from row 22 = {words_list}')
print(f'{"="*110}')
print(f'Choose words from vocabulary {[words_list[i] for i in store_indexs ] }')
print(f'{"="*110}')

print('Pair wise Euclidean Score for selected words')
score_matrix=[]
for i in store_indexs:
    each_row=[]
    for j in store_indexs:
        score = euclidean_distances([layer_output[12][i]], [layer_output[12][j]])[0][0]
        each_row.append(score)
    score_matrix.append(each_row)


fig = ff.create_annotated_heatmap(score_matrix, x=[words_list[i] for i in store_indexs],
                y=[words_list[i] for i in store_indexs],colorscale='Viridis')
fig.show()
    
    

Words list from row 22 = ['have', 'out', 'best', 'comic', 'friends', 'of', 'here', 'i', 'i', 'not', 'these', 'jokes', 'wonderful', 'as', 'by', 'but', 'they', "there's", 'is', 'four', 'leads', 'always', 'plays', 'that', 'they', "there's", 'zero', 'except', 'to', "can't", 'is', 'birds', 'company', 'br', 'glimpse', 'some', 'br', 'that', 'give', 'gets']
Choose words from vocabulary ['best', 'jokes', 'wonderful']
Pair wise Euclidean Score for selected words
