In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras import layers

import nltk
nltk.download('punkt')

# Import the data set
from keras.datasets import imdb
# split the data set into training and testing target-data
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data()

# get all the words already tokenized(remember that the words are unique, each having an ID) -> returns a dictionary
index = imdb.get_word_index()
# reverse key-value pair in dictionary
reverse_index = dict([(value, key) for (key, value) in index.items()])



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

decoded_sentences = []
# extract from training data
for x in range(len(data)):
    decoded_sentences.append(" ".join([reverse_index.get(i - 3, "") for i in data[x]]))
    
    
#take less inputs
decoded_sentences = decoded_sentences[:1000]
targets = targets[:1000]    

In [9]:
# define data
from sklearn.model_selection import train_test_split

reviews_train, reviews_test, y_train, y_test = train_test_split(decoded_sentences, targets, test_size=0.20)

In [10]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=74702)

#Use tokenisation only on the training data!
tokenizer.fit_on_texts(reviews_train)

X_train = tokenizer.texts_to_sequences(reviews_train)
X_test = tokenizer.texts_to_sequences(reviews_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

#print(reviews_train[0])
#print(X_train[0])

In [11]:
# find review with max len
review_max_len = 0
for x in X_train:
    if len(x) > review_max_len:
        review_max_len = len(x)

        
print(review_max_len)

1850


In [12]:
from keras.preprocessing.sequence import pad_sequences

maxlen = review_max_len

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [13]:
def cnn_text_classifier():
    embedding_dim = 50

    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)) #https://keras.io/layers/embeddings/ 
    model.add(layers.Conv1D(128, 5, activation='relu')) #https://keras.io/layers/convolutional/
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    return model
    

In [14]:
import tensorflow as tf
model = cnn_text_classifier()
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
training = model.fit(X_train, y_train, epochs=20, verbose=2, validation_split=.1, batch_size=5,  callbacks=[callback])
#details about the model: https://keras.io/models/model/ 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1850, 50)          1977600   
_________________________________________________________________
conv1d (Conv1D)              (None, 1846, 128)         32128     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 2,011,029
Trainable params: 2,011,029
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
720/720 - 26s - loss: 0.6017 - accuracy: 0.6639 - val_loss: 0.4211 - val_accuracy: 0.8300
Epoch 2/20


In [15]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

scores = model.evaluate(X_test, y_test, verbose=0)
print("CNN Accuracy: %.2f%%" % (scores[1]*100))

Training Accuracy: 0.9843
Testing Accuracy:  0.8480
CNN Accuracy: 84.80%
