<a href="https://colab.research.google.com/github/ChandrashekarCYoga/NLP/blob/master/Natural_Language_Processing_Project_Chandrashekar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMDB Movie Reviews Sentiment Classification

First, let's select TensorFlow version 2.x in Colab.

In [None]:
# Colab magic: request the TensorFlow 2.x runtime before TensorFlow is imported.
%tensorflow_version 2.x
import tensorflow
# Display the version that was actually loaded.
tensorflow.__version__

'2.2.0'

## Loading the dataset from keras

In [None]:
from keras.datasets import imdb

Using TensorFlow backend.


### Prepare input and output data

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 # max vocab size as per the problem statement
# Common sequence length handed to pad_sequences for both splits.
maxlen = 300 

## **Train and Test Split**

In [None]:
# Fetch the IMDB reviews; every review arrives as a list of integer word ids.
# num_words keeps only the vocab_size most frequent words of the dataset.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# Bring every review in both splits to the same length (maxlen) so the
# splits become rectangular arrays.
x_train, x_test = (
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)

In [None]:
x_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    1,  194, 1153,  194, 8255,   78,  228,    5,    6, 1463,
       4369, 5012,  134,   26,    4,  715,    8,  118, 1634,   14,  394,
         20,   13,  119,  954,  189,  102,    5,  207,  110, 3103,   21,
         14,   69,  188,    8,   30,   23,    7,   

In [None]:
x_train.shape

(25000, 300)

In [None]:
x_train[1].shape

(300,)

In [None]:
y_train[1]

0

In [None]:
import numpy as np

# Class-balance check for the training labels: the first printed row lists the
# distinct label values, the second row how many reviews carry each value.
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print(np.asarray([unique_elements, counts_elements]))

[[    0     1]
 [12500 12500]]


In [None]:
import numpy as np

# Class-balance check for the test labels: the first printed row lists the
# distinct label values, the second row how many reviews carry each value.
unique_elements, counts_elements = np.unique(y_test, return_counts=True)
print(np.asarray([unique_elements, counts_elements]))

[[    0     1]
 [12500 12500]]


## **Word Index Building**
### Get the word index, then create a key-value mapping from word_id to word
This mapping is used to convert an encoded sequence back into words in Keras.

In [None]:
# Fetch the word -> integer id mapping that ships with the IMDB dataset.
word_index = imdb.get_word_index()

In [None]:
# Reverse lookup (word_id -> word) used to decode encoded reviews.
#
# imdb.load_data() does not store the raw ids from get_word_index(): with its
# default arguments it shifts every id by index_from=3 and reserves
#   0 -> padding, 1 -> start-of-review marker, 2 -> out-of-vocabulary word.
# The lookup therefore has to apply the same shift, otherwise every decoded
# word is off by three ranks and the text is gibberish.
index_from = 3
word_map = {word_id + index_from: word for word, word_id in word_index.items()}
word_map.update({0: "<PAD>", 1: "<START>", 2: "<UNK>"})

In [None]:
from itertools import islice

def take(n, iterable):
    """Collect the leading ``n`` elements of ``iterable`` into a list.

    Fewer than ``n`` elements are returned when the iterable runs out first.
    """
    head = islice(iterable, n)
    return [*head]

# Preview the first 20 (word_id, word) pairs of the reverse lookup.
n_items = take(20, word_map.items())
n_items

[(34701, 'fawn'),
 (52006, 'tsukino'),
 (52007, 'nunnery'),
 (16816, 'sonja'),
 (63951, 'vani'),
 (1408, 'woods'),
 (16115, 'spiders'),
 (2345, 'hanging'),
 (2289, 'woody'),
 (52008, 'trawling'),
 (52009, "hold's"),
 (11307, 'comically'),
 (40830, 'localized'),
 (30568, 'disobeying'),
 (52010, "'royale"),
 (40831, "harpo's"),
 (52011, 'canet'),
 (19313, 'aileen'),
 (52012, 'acurately'),
 (52013, "diplomat's")]

In [None]:
# Decode an encoded review back into its words.
def sequence_to_text(list_of_indices):
    """Translate a sequence of word ids into the matching list of words.

    Each id is looked up in the module-level ``word_map``; ids that have no
    entry there come back as ``None``.
    """
    lookup = word_map.get
    return list(map(lookup, list_of_indices))

print(sequence_to_text(x_train[0]))

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their', 'becomes', 'reaching', 'had', 'journalist', 'of', 'lot', 'from', 'anyone', 'to', 'have', 'after', 'out', 'atmosphere', 'never', 'more', 'room', 'and', 'it', 'so', 'heart', 'shows', 'to', 'years', 'of', 'every', 'never', 'going', 'and', 'help', 'moments', 'or', 'of', 'every', 'chest', 'visual', 'movie', 'except', 'her', 'was', 'several', 'of', 'enough', 'more', 'with', 'is', 'now', 'current', 'film', 'as', 'you', 'of'

## **Build a Sequential Model using Keras for the Sentiment Classification**

We can think of the Embedding layer as a dictionary that maps the index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* We could also load pre-trained word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The Keras embedding layer doesn't require us to one-hot encode our words; instead, we have to give each word a unique integer id. For the IMDB dataset we've loaded, this has already been done, but if this wasn't the case we could use sklearn's [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [None]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import LSTM

In [None]:
# Mini-batch size passed to model.fit.
batch_size = 32
# Length of each word vector produced by the Embedding layer.
embedding_dims = 128
# NOTE(review): nb_filter, filter_length and hidden_dims are not referenced by
# the LSTM model in the visible cells — presumably left over from a
# convolutional variant; confirm before removing.
nb_filter = 250
filter_length = 3
hidden_dims = 250
# Number of training epochs passed to model.fit.
nb_epoch = 10

In [None]:
# Create the model: embedding -> LSTM -> dense hidden layer -> sigmoid output.
model = Sequential([
    # Turns each of the maxlen word ids into an embedding_dims-long vector.
    Embedding(vocab_size, embedding_dims, trainable=True, input_length=maxlen),
    # Summarises the whole review into a single 64-dimensional state.
    LSTM(units=64, dropout=0.2),
    Dense(32, activation='relu'),
    # Single sigmoid unit: a score between 0 and 1 for the 0/1 label.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 128)          1280000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,331,521
Trainable params: 1,331,521
Non-trainable params: 0
_________________________________________________________________


In [None]:
### Fit the model
# NOTE(review): the test split is used as validation data here and is scored
# again by model.evaluate afterwards, so the reported accuracy is not measured
# on data that was held out from training-time monitoring. Consider carving a
# validation set out of the training split instead.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=nb_epoch, batch_size=batch_size, verbose=1)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fdf9d9b8828>

## **Model Accuracy**

In [None]:
# Final evaluation of the model
# evaluate() returns the loss followed by the compiled metrics; the model was
# compiled with metrics=['accuracy'], so scores[1] is the accuracy.
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.12%


In [None]:
# Model prediction
y_pred = model.predict(x_test)

In [None]:
print(y_pred)

[[4.2238235e-03]
 [9.9999350e-01]
 [3.1647015e-01]
 ...
 [6.8342686e-04]
 [6.7372918e-02]
 [9.9977994e-01]]


In [None]:
# Turn the sigmoid scores into hard 0/1 predictions by rounding to the nearest integer.
y_pred = np.round(y_pred, 0)

In [None]:
# Flatten the (n_samples, 1) prediction column into a 1-D vector.
y_pred = y_pred.ravel()
y_pred.shape

(25000,)

In [None]:
# Cast the rounded float predictions to integers.
y_pred = y_pred.astype('int64')

In [None]:
# y_test is already a flat 1-D array of 0/1 labels, so it can be compared with
# y_pred directly; no reshaping is needed.
y_test

array([0, 1, 1, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import classification_report
# classification_report pairs target_names with the labels in sorted order
# (0 first, then 1). In the Keras IMDB dataset 0 marks a negative review and
# 1 a positive one, so the negative name has to come first; otherwise the
# per-class precision/recall rows are attributed to the wrong sentiment.
target_names = ['Sentiment_Negative', 'Sentiment_Positive']
print(classification_report(y_test, y_pred, target_names=target_names))

                    precision    recall  f1-score   support

Sentiment_Positive       0.85      0.88      0.86     12500
Sentiment_Negative       0.87      0.85      0.86     12500

          accuracy                           0.86     25000
         macro avg       0.86      0.86      0.86     25000
      weighted avg       0.86      0.86      0.86     25000



In [None]:
sequence_to_text(x_test[0])

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
sequence_to_text(x_test[1])

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'the',
 'as',
 'you',
 "world's",
 'is',
 'quite',
 'br',
 'mankind',
 'most',
 'that',
 'quest',
 'are',
 'chase',
 'to',
 'being',
 'quickly',
 'of',
 'little',
 'it',
 'time',
 'hell',
 'to',
 'plot',
 'br',
 'of',
 'something',
 'long',
 'put',
 'are',
 'of',
 'every',
 'place',
 'this',
 'consequence',
 'and',
 'of',
 'interplay',
 'storytelling',
 'being',
 'nasty',
 'not',
 'of',
 'you',
 'warren',
 'in',
 'is',
 'failed',
 'club',
 'i',
 'i',
 'of',
 'films',
 'pay',
 'so',
 'sequences',
 'and',
 'film',
 'okay',
 'uses',
 'to',
 'received',
 'and',
 'if',
 'time',
 'done',
 'for',
 'room',
 'sugar',
 'viewer',
 'as',
 'cartoon',
 'of',
 'gives',
 'to',
 'forgettable',
 'br',
 'be',
 'because',
 'many'

## **Retrieve the output of each layer in Keras for a single test sample from the trained model you built**

In [None]:
from keras import backend as K

# input placeholder
ip = model.input

# symbolic output tensor of every layer, in model order
outputs = [layer.output for layer in model.layers]

print(outputs)
# One backend function per layer: feeds (input batch, learning phase) and
# returns that layer's activations.
functors = [K.function([ip, K.learning_phase()], [out]) for out in outputs]    # evaluation functions


# Testing
# Add a leading batch axis so the single review has shape (1, maxlen).
test = x_test[0][np.newaxis, ...]
# Learning phase 0 = inference. Passing 1 would run the layers in training
# mode, which enables the LSTM dropout and makes the activations random and
# inconsistent with model.predict().
layer_outs = [func([test, 0]) for func in functors]

print(layer_outs)




[<tf.Tensor 'embedding_1/embedding_lookup/Identity_1:0' shape=(None, 300, 128) dtype=float32>, <tf.Tensor 'lstm_1/strided_slice_18:0' shape=(None, 64) dtype=float32>, <tf.Tensor 'dense_1/Relu:0' shape=(None, 32) dtype=float32>, <tf.Tensor 'dense_2/Sigmoid:0' shape=(None, 1) dtype=float32>]
[[array([[[-0.08002027,  0.10763775, -0.08623564, ...,  0.04880287,
          0.00209026,  0.02427271],
        [-0.08002027,  0.10763775, -0.08623564, ...,  0.04880287,
          0.00209026,  0.02427271],
        [-0.08002027,  0.10763775, -0.08623564, ...,  0.04880287,
          0.00209026,  0.02427271],
        ...,
        [ 0.01053442, -0.0711675 ,  0.11472849, ...,  0.03883199,
         -0.09521551,  0.01024593],
        [-0.04420754, -0.00559954,  0.08075345, ...,  0.01223716,
          0.08970833, -0.01524084],
        [-0.02831219,  0.05909333, -0.16075805, ...,  0.10717237,
         -0.0083857 , -0.13487203]]], dtype=float32)], [array([[ 2.8007114e-01,  4.9990335e-01,  1.6693561e-01, -4.198