In [1]:
import numpy as np
import os
import time
import string
import sys
import json
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
import random as rn
import keras
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True

Using TensorFlow backend.


In [2]:
#All this for reproducibility
np.random.seed(1)
rn.seed(1)
tf.set_random_seed(1)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
session_conf.gpu_options.allow_growth = True
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
keras.backend.set_session(sess)

In [3]:
# Build the corpus and sequences
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk_stopw = stopwords.words('english')
sequenceLength = 200



def tokenize (text):        #   no punctuation & starts with a letter & between 2-15 characters in length
    tokens = [word.strip(string.punctuation) for word in RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b').tokenize(text)]
    return  [f.lower() for f in tokens if f and f.lower() not in nltk_stopw]

def getMovies():
    X, labels, labelToName  = [], [], { 0 : 'neg', 1: 'pos' }
    for dataset in ['train', 'test']:
        for classIndex, directory in enumerate(['neg', 'pos']):
            dirName = 'E:\\IMDB_Dataset\\aclImdb\\' + dataset + "\\" + directory
            for reviewFile in os.listdir(dirName):
                with open (dirName + '\\' + reviewFile, 'r', encoding="utf8") as f:
                    tokens = tokenize (f.read())
                    if (len(tokens) == 0):
                        continue
                X.append(tokens)
                labels.append(classIndex)
    nTokens = [len(x) for x in X]
    return X, np.array(labels), labelToName, nTokens

X, labels, labelToName, nTokens = getMovies()
print ('Token Summary:min/avg/median/std 85/86/87/88/89/90/95/99/max:',)
print (np.amin(nTokens), np.mean(nTokens),np.median(nTokens),np.std(nTokens),np.percentile(nTokens,85),np.percentile(nTokens,86),np.percentile(nTokens,87),np.percentile(nTokens,88),np.percentile(nTokens,89),np.percentile(nTokens,90),np.percentile(nTokens,95),np.percentile(nTokens,99),np.amax(nTokens))
labelToNameSortedByLabel = sorted(labelToName.items(), key=lambda kv: kv[0]) # List of tuples sorted by the label number [ (0, ''), (1, ''), .. ]
namesInLabelOrder = [item[1] for item in labelToNameSortedByLabel]
numClasses = len(namesInLabelOrder)
print ('X, labels #classes classes {} {} {} {}'.format(len(X), str(labels.shape), numClasses, namesInLabelOrder))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bishn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Token Summary:min/avg/median/std 85/86/87/88/89/90/95/99/max:
3 116.47778 86.0 88.1847205941687 189.0 195.0 203.0 211.0 220.0 230.0 302.0 457.0 1388
X, labels #classes classes 50000 (50000,) 2 ['neg', 'pos']


In [4]:
kTokenizer = keras.preprocessing.text.Tokenizer()
kTokenizer.fit_on_texts(X)
encoded_docs = kTokenizer.texts_to_sequences(X)
Xencoded = keras.preprocessing.sequence.pad_sequences(encoded_docs, maxlen=sequenceLength, padding='post')
print ('Vocab padded_docs {} {}'.format(len(kTokenizer.word_index), str(Xencoded.shape)))


Vocab padded_docs 98089 (50000, 200)


In [5]:
# Test & Train Split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1).split(Xencoded, labels)
train_indices, test_indices = next(sss)
train_x, test_x = Xencoded[train_indices], Xencoded[test_indices]
train_labels = keras.utils.to_categorical(labels[train_indices], len(labelToName))
test_labels = keras.utils.to_categorical(labels[test_indices], len(labelToName))

In [11]:
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=2, mode='auto', restore_best_weights=False)
model = keras.models.Sequential()
embedding = keras.layers.embeddings.Embedding(input_dim=len(kTokenizer.word_index)+1, output_dim=300, input_length=sequenceLength, trainable=True)
model.add(embedding)
model.add(keras.layers.Conv1D(150, 4, activation='relu', padding='valid'))
model.add(keras.layers.MaxPooling1D(4, padding = 'valid'))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(numClasses, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 200, 300)          29427000  
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 197, 150)          180150    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 49, 150)           0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 7350)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 14702     
Total params: 29,621,852
Trainable params: 29,621,852
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
start_time = time.time()
result = {}
history = model.fit(x=train_x, y=train_labels, epochs=50, batch_size=32, shuffle=True, validation_data = (test_x, test_labels), verbose=2, callbacks=[early_stop])
result['history'] = history.history
result['test_loss'], result['test_accuracy'] = model.evaluate(test_x, test_labels, verbose=1)
predicted = model.predict(test_x, verbose=2)
predicted_labels = predicted.argmax(axis=1)
result['confusion_matrix'] = confusion_matrix(labels[test_indices], predicted_labels).tolist()
result['classification_report'] = classification_report(labels[test_indices], predicted_labels, digits=4, target_names=namesInLabelOrder, output_dict=True)
print (confusion_matrix(labels[test_indices], predicted_labels))
print (classification_report(labels[test_indices], predicted_labels, digits=4, target_names=namesInLabelOrder))
elapsed_time = time.time() - start_time
print ('Time Taken:', elapsed_time)
result['elapsed_time'] = elapsed_time

f = open ('cnn.json','w')
out = json.dumps(result, ensure_ascii=True)
f.write(out)
f.close()


Train on 40000 samples, validate on 10000 samples
Epoch 1/50
 - 229s - loss: 0.3277 - acc: 0.8557 - val_loss: 0.2617 - val_acc: 0.8969
Epoch 2/50
 - 224s - loss: 0.0913 - acc: 0.9688 - val_loss: 0.4430 - val_acc: 0.8619
Epoch 3/50
 - 223s - loss: 0.0130 - acc: 0.9967 - val_loss: 0.6000 - val_acc: 0.8703
Epoch 4/50
 - 224s - loss: 0.0085 - acc: 0.9976 - val_loss: 0.9850 - val_acc: 0.8517
Epoch 5/50
 - 224s - loss: 0.0214 - acc: 0.9931 - val_loss: 0.8479 - val_acc: 0.8713
Epoch 6/50
 - 224s - loss: 0.0145 - acc: 0.9959 - val_loss: 0.9176 - val_acc: 0.8708
Epoch 00006: early stopping
[[4275  725]
 [ 567 4433]]
              precision    recall  f1-score   support

         neg     0.8829    0.8550    0.8687      5000
         pos     0.8594    0.8866    0.8728      5000

   micro avg     0.8708    0.8708    0.8708     10000
   macro avg     0.8712    0.8708    0.8708     10000
weighted avg     0.8712    0.8708    0.8708     10000

Time Taken: 1352.1559059619904
