<a href="https://colab.research.google.com/github/Bhanukoya/Natural-Language-Processing/blob/master/NLP_IMBD_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sentiment analysis on IMDB movie reviews

In [111]:
import tensorflow as tf
import pandas as pd
import numpy as np


In [112]:
# Show which TensorFlow version this notebook is running against.
tf.__version__

'2.2.0'

In [113]:
import tensorflow_datasets as tfds

In [114]:
# Load the `imdb_reviews` dataset from TensorFlow Datasets.
# as_supervised=True yields (text, label) pairs; with_info=True additionally
# returns the dataset metadata, bound here to `info`.
imbd, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)

In [115]:
# Pull the two splits used below out of the loaded dataset mapping.
train_data, test_data = imbd['train'], imbd['test']

In [116]:
# Inspect the training split object (element shapes and dtypes).
train_data

<DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>

In [117]:
def _to_text_and_labels(dataset):
    """Materialise a dataset of (text, label) tensor pairs as two Python lists.

    Each text tensor is converted to bytes with `.numpy()` and decoded as
    UTF-8; each label tensor is converted with `.numpy()`.

    Returns:
        (texts, labels): a list of decoded strings and a list of label values,
        in the dataset's iteration order.
    """
    texts, labels = [], []
    for text_tensor, label_tensor in dataset:
        texts.append(text_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())
    return texts, labels


train_sentences, train_labels = _to_text_and_labels(train_data)
test_sentences, test_labels = _to_text_and_labels(test_data)
  


In [118]:
# Peek at the first three training labels.
train_labels[0:3]

[0, 0, 0]

In [119]:
# Peek at the first three test labels.
test_labels[0:3]

[1, 1, 0]

In [120]:
# Convert both Python label lists to NumPy arrays for use as training targets.
train_labels_final, test_labels_final = map(np.array, (train_labels, test_labels))

In [121]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [122]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [123]:
# Tokenizer capped at num_words=10000; words outside that vocabulary are
# replaced by the out-of-vocabulary placeholder token 'drr'.
token = Tokenizer(num_words=10000, oov_token='drr')

In [124]:
# Build the vocabulary from the training reviews only (the test reviews are
# never passed to fit_on_texts).
token.fit_on_texts(train_sentences)

In [125]:
# word -> integer index mapping produced by the fitted tokenizer.
word_index = token.word_index

In [126]:
# Encode every training review as a list of integer word indices.
sequences = token.texts_to_sequences(train_sentences)

In [127]:
# Bring every training sequence to exactly 120 tokens. truncating='post' drops
# tokens from the END of reviews longer than 120; the padding side is left at
# the pad_sequences default.
padded = pad_sequences(sequences, maxlen=120, truncating='post' )

In [128]:
# Show the integer encoding of training review 3 (before padding).
print(sequences[3])

[12, 7, 2, 241, 5, 20, 16, 4, 8776, 2707, 2653, 52, 2, 358, 5, 2, 180, 68, 138, 1401, 17, 92, 203, 968, 15, 23, 1, 81, 4, 192, 3109, 3037, 3, 1, 16, 4, 376, 5, 632, 387, 352, 37, 6355, 3, 5559, 1928, 15, 208, 8596, 3433, 2, 112, 365, 48, 24, 55, 1, 6, 1657, 55, 1818, 4312, 41, 4, 2263, 3, 1908, 1, 141, 160, 780, 111, 31, 92, 116, 4, 221, 20, 9, 173, 279, 3, 29, 13, 1012, 2, 2814, 10, 1985]


In [152]:
# Show the raw text of the same review for comparison with its encoding.
print(train_sentences[3])

This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.


In [130]:
# Invert the tokenizer vocabulary so integer indices can be mapped back to words.
reverse_word_index = {index: word for word, index in word_index.items()}

In [155]:
# Display the index -> word mapping.
# NOTE(review): this renders the entire vocabulary; consider showing only a
# small slice to keep the notebook output readable.
reverse_word_index

{1: 'drr',
 2: 'the',
 3: 'and',
 4: 'a',
 5: 'of',
 6: 'to',
 7: 'is',
 8: 'br',
 9: 'in',
 10: 'it',
 11: 'i',
 12: 'this',
 13: 'that',
 14: 'was',
 15: 'as',
 16: 'for',
 17: 'with',
 18: 'movie',
 19: 'but',
 20: 'film',
 21: 'on',
 22: 'not',
 23: 'you',
 24: 'are',
 25: 'his',
 26: 'have',
 27: 'he',
 28: 'be',
 29: 'one',
 30: 'all',
 31: 'at',
 32: 'by',
 33: 'an',
 34: 'they',
 35: 'who',
 36: 'so',
 37: 'from',
 38: 'like',
 39: 'her',
 40: 'or',
 41: 'just',
 42: 'about',
 43: "it's",
 44: 'out',
 45: 'if',
 46: 'has',
 47: 'some',
 48: 'there',
 49: 'what',
 50: 'good',
 51: 'more',
 52: 'when',
 53: 'very',
 54: 'up',
 55: 'no',
 56: 'time',
 57: 'she',
 58: 'even',
 59: 'my',
 60: 'would',
 61: 'which',
 62: 'only',
 63: 'story',
 64: 'really',
 65: 'see',
 66: 'their',
 67: 'had',
 68: 'can',
 69: 'were',
 70: 'me',
 71: 'well',
 72: 'than',
 73: 'we',
 74: 'much',
 75: 'been',
 76: 'bad',
 77: 'get',
 78: 'will',
 79: 'do',
 80: 'also',
 81: 'into',
 82: 'people',
 83:

In [132]:
# Translate the integer encoding of training review 3 back into words.
# Kept as a list of tokens; the following cell joins it into one string.
decoded_sentence = [reverse_word_index[word_id] for word_id in sequences[3]]

In [133]:
# Join the decoded tokens into one readable string.
# `decoded_sentence` is rebound from a list to a str here, so the join is
# guarded: re-running this cell on the already-joined string would otherwise
# insert a space between every character.
if not isinstance(decoded_sentence, str):
    decoded_sentence = ' '.join(decoded_sentence)

In [134]:
# Show the decoded review; out-of-vocabulary words appear as 'drr'.
decoded_sentence

'this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you drr into a big arm chair and drr for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no drr to cross no dangerous waters just a warm and witty drr through new york life at its best a family film in every sense and one that deserves the praise it received'

In [135]:
# Encode the test reviews with the tokenizer that was fitted on the training set.
test_sequences = token.texts_to_sequences(test_sentences)

In [136]:
# Pad/truncate the test reviews exactly like the training reviews
# (maxlen=120, truncating='post'). Without truncating='post', pad_sequences
# truncates from the front, so long test reviews would keep their LAST 120
# tokens while the model was trained on the FIRST 120 tokens.
test_padded = pad_sequences(test_sequences, maxlen=120, truncating='post')

In [137]:
from keras.models import Sequential

In [138]:
from keras.layers import Dense,Flatten, AveragePooling1D, Embedding

In [139]:
# Start with an empty Sequential model; layers are appended in the cells below.
model = Sequential()

In [140]:
# Embedding layer: 10000-entry vocabulary (matching the tokenizer's num_words),
# 16-dimensional vectors, applied to fixed-length inputs of 120 tokens.
model.add(Embedding(input_dim=10000, output_dim=16, input_length=120))

In [141]:
# Flatten the (120, 16) embedding output into one 1920-value vector per review.
model.add(Flatten())

In [142]:
# Small hidden layer: 6 units with ReLU activation.
model.add(Dense(units=6, activation='relu'))

In [143]:
# Output layer: a single sigmoid unit producing a value between 0 and 1.
model.add(Dense(units=1, activation='sigmoid'))

In [144]:
# Configure training: binary cross-entropy loss (single sigmoid output),
# Adam optimizer, and accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [145]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [146]:
# Train for 10 epochs on the padded training reviews.
# NOTE(review): the test split is passed as validation_data, so the reported
# validation metrics come from the same data that would serve as the final
# held-out test set — consider carving a validation split out of the training data.
model.fit(padded, train_labels_final, epochs=10 , validation_data=(test_padded, test_labels_final))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f0f70a265c0>

In [147]:
# Grab the first layer of the model (the Embedding layer added above) and read
# its learned weight matrix; get_weights() returns a list, the matrix is item 0.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


### 1 = positive review, 0 = negative review

In [148]:
# Hand-written example review to try the trained model on.
sentence = "I really think this is worst. honest."
# texts_to_sequences expects a list of texts, hence the single-item list.
sequence = token.texts_to_sequences([sentence])
print(sequence)

[[11, 64, 102, 12, 7, 247, 1200]]


In [149]:
# Pad the example with the same settings used for the training data
# (maxlen=120, truncating='post') so it matches the model's input length.
example = pad_sequences(sequence, maxlen = 120, truncating='post')

In [150]:
# Inspect the padded example that will be fed to the model.
example

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   11,   64,  102,   12,    7,  247, 1200]],
      dtype=int32)

In [151]:
# Score the example; the model's final sigmoid layer yields a value between 0 and 1.
model.predict(example)

array([[0.02672744]], dtype=float32)