Importing Packages

In [5]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Setting the current working directory
import os; os.chdir('drive/My Drive/NLP')

In [7]:
# Import packages
import pandas as pd, numpy as np
import tensorflow as tf
assert tf.__version__ >= '2.0'

from itertools import islice

# Keras
from keras.layers import Dense, Embedding, LSTM, Dropout, MaxPooling1D, Conv1D
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.preprocessing import sequence
from keras.datasets import imdb

from keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Suppress warnings
import warnings; warnings.filterwarnings('ignore')

random_state = 42
np.random.seed(random_state)
tf.random.set_seed(random_state)

Loading Dataset - Train & Test Split

In [8]:
vocab_size = 10000
maxlen = 300
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = vocab_size)

x_train = pad_sequences(x_train, maxlen = maxlen, padding = 'pre')
x_test =  pad_sequences(x_test, maxlen = maxlen, padding = 'pre')

X = np.concatenate((x_train, x_test), axis = 0)
y = np.concatenate((y_train, y_test), axis = 0)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state, shuffle = True)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.2, random_state = random_state, shuffle = True)

print('---'*20, f'\nNumber of rows in training dataset: {x_train.shape[0]}')
print(f'Number of columns in training dataset: {x_train.shape[1]}')
print(f'Number of unique words in training dataset: {len(np.unique(np.hstack(x_train)))}')


print('---'*20, f'\nNumber of rows in validation dataset: {x_valid.shape[0]}')
print(f'Number of columns in validation dataset: {x_valid.shape[1]}')
print(f'Number of unique words in validation dataset: {len(np.unique(np.hstack(x_valid)))}')


print('---'*20, f'\nNumber of rows in test dataset: {x_test.shape[0]}')
print(f'Number of columns in test dataset: {x_test.shape[1]}')
print(f'Number of unique words in test dataset: {len(np.unique(np.hstack(x_test)))}')


print('---'*20, f'\nUnique Categories: {np.unique(y_train), np.unique(y_valid), np.unique(y_test)}')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
------------------------------------------------------------ 
Number of rows in training dataset: 32000
Number of columns in training dataset: 300
Number of unique words in training dataset: 9999
------------------------------------------------------------ 
Number of rows in validation dataset: 8000
Number of columns in validation dataset: 300
Number of unique words in validation dataset: 9984
------------------------------------------------------------ 
Number of rows in test dataset: 10000
Number of columns in test dataset: 300
Number of unique words in test dataset: 9995
------------------------------------------------------------ 
Unique Categories: (array([0, 1]), array([0, 1]), array([0, 1]))


Get word index and create a key-value pair for word and word id

In [9]:
def decode_review(x, y):
  w2i = imdb.get_word_index()                                
  w2i = {k:(v + 3) for k, v in w2i.items()}
  w2i['<PAD>'] = 0
  w2i['<START>'] = 1
  w2i['<UNK>'] = 2
  i2w = {i: w for w, i in w2i.items()}

  ws = (' '.join(i2w[i] for i in x))
  print(f'Review: {ws}')
  print(f'Actual Sentiment: {y}')
  return w2i, i2w

w2i, i2w = decode_review(x_train[0], y_train[0])

# get first 50 key, value pairs from id to word dictionary
print('---'*30, '\n', list(islice(i2w.items(), 0, 50)))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
Review: <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <START> the only possible way to enjoy this flick is to bang your head against the wall allow some internal <UNK> of the brain let a bunch of your brain cells die and once you are officially mentally retarded perhaps then you might enjoy this film br br the only saving grace was the story between <UNK> and stephanie govinda was excellent in the role of the cab driver and so was the brit girl perhaps if they would have created the whole movie on their <UNK> in

Build Keras Embedding Layer Model


In [10]:
# Model
model = Sequential()
model.add(Embedding(vocab_size, 256, input_length = maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(256, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(Conv1D(128, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(MaxPooling1D(pool_size = 2))
model.add(Conv1D(64, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(MaxPooling1D(pool_size = 2))
model.add(LSTM(75))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

# Adding callbacks
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 0)  
mc = ModelCheckpoint('imdb_model.h5', monitor = 'val_loss', mode = 'min', save_best_only = True, verbose = 1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 256)          2560000   
                                                                 
 dropout (Dropout)           (None, 300, 256)          0         
                                                                 
 conv1d (Conv1D)             (None, 300, 256)          327936    
                                                                 
 conv1d_1 (Conv1D)           (None, 300, 128)          163968    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 150, 128)         0         
 )                                                               
                                                                 
 conv1d_2 (Conv1D)           (None, 150, 64)           41024     
                                                        

In [11]:
# Fit the model
model.fit(x_train, y_train, validation_data = (x_valid, y_valid), epochs = 3, batch_size = 64, verbose = True, callbacks = [es, mc])

# Evaluate the model
scores = model.evaluate(x_test, y_test, batch_size = 64)
print('Test accuracy: %.2f%%' % (scores[1]*100))

Epoch 1/3
Epoch 1: val_loss improved from inf to 0.24686, saving model to imdb_model.h5
Epoch 2/3
Epoch 2: val_loss did not improve from 0.24686
Epoch 2: early stopping
Test accuracy: 89.49%


In [14]:
#y_pred = model.predict_classes(x_test)
predict_x=model.predict(x_test) 
y_pred=np.argmax(predict_x,axis=1)
print(f'Classification Report:\n{classification_report(y_pred, y_test)}')

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.49      0.66     10000
           1       0.00      0.00      0.00         0

    accuracy                           0.49     10000
   macro avg       0.50      0.25      0.33     10000
weighted avg       1.00      0.49      0.66     10000



Retrive output of each layer in keras for a given single test sample from the trained model

In [15]:
sample_x_test = x_test[np.random.randint(10000)]
for layer in model.layers:

    model_layer = Model(inputs = model.input, outputs = model.get_layer(layer.name).output)
    output = model_layer.predict(sample_x_test.reshape(1,-1))
    print('\n','--'*20, layer.name, 'layer', '--'*20, '\n')
    print(output)


 ---------------------------------------- embedding layer ---------------------------------------- 

[[[ 0.01240082  0.00111831 -0.00751773 ...  0.00435726  0.02900069
    0.01388618]
  [ 0.01240082  0.00111831 -0.00751773 ...  0.00435726  0.02900069
    0.01388618]
  [ 0.01240082  0.00111831 -0.00751773 ...  0.00435726  0.02900069
    0.01388618]
  ...
  [-0.02351479  0.01813366 -0.03133444 ... -0.07914362  0.00348496
   -0.0343854 ]
  [-0.01062981  0.01777026 -0.02406921 ...  0.04433554 -0.01352686
    0.04584379]
  [ 0.01886249 -0.06760372  0.00992214 ... -0.02097903 -0.02103261
   -0.00969492]]]

 ---------------------------------------- dropout layer ---------------------------------------- 

[[[ 0.01240082  0.00111831 -0.00751773 ...  0.00435726  0.02900069
    0.01388618]
  [ 0.01240082  0.00111831 -0.00751773 ...  0.00435726  0.02900069
    0.01388618]
  [ 0.01240082  0.00111831 -0.00751773 ...  0.00435726  0.02900069
    0.01388618]
  ...
  [-0.02351479  0.01813366 -0.0313344

In [22]:
decode_review(x_test[10], y_test[10])
#print(y_pred)
print(f'Predicted sentiment: {y_pred[10]}')

Review: <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

Conclusion
* Sentiment classification task on the IMDB dataset, on test dataset,
  * Accuracy: > 90%
  * F1-score: > 50%
  * Loss of 0.4