In [1]:
import numpy as np
import os
import tensorflow as tf
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())

Version:  2.0.0
Eager mode:  True


In [2]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(tf.test.is_gpu_available())

True


In [3]:
train_data, test_data = tfds.load(name="imdb_reviews", split=["train", "test"], 
                                  batch_size=-1, as_supervised=True)

train_examples, train_labels = tfds.as_numpy(train_data)
test_examples, test_labels = tfds.as_numpy(test_data)

In [4]:
print("Training entries: {}, test entries: {}".format(len(train_examples), len(test_examples)))
train_examples[:2]

Training entries: 25000, test entries: 25000


array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot 

In [5]:
train_labels[:2]

array([0, 0], dtype=int64)

In [6]:
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import TensorBoard


def load_data(examples, targets, num_words, sequence_length, test_size=0.20, oov_token=None):

    reviews, labels = [], []

    for example, label in zip(examples, targets):
      reviews.append(str(example).strip())
      labels.append(str(label).strip())

    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(reviews)
    X = tokenizer.texts_to_sequences(reviews)
    X, y = np.array(X), np.array(labels)
    X = pad_sequences(X, maxlen=sequence_length)

    # convert labels to one-hot encoded
    y = to_categorical(y)
    
    print ("begin loding data...")
    data = {}
    data["X_train"] = X
    data["y_train"] = y
    data["tokenizer"] = tokenizer
    data["int2label"] =  {0: "negative", 1: "positive"}
    data["label2int"] = {"negative": 0, "positive": 1}

    return data

In [7]:
class TextCNNAttention(tf.keras.Model):
    def __init__(self,
                 word_index,
                 embedding_dims,
                 maxlen,
                 class_num=2,
                 weights=None,
                 weights_trainable=False,
                 kernel_sizes=[3, 4, 5],
                 filter_size=128,
                 name=None,
                 **kwargs):
      
        super(TextCNNAttention, self).__init__(name=name, **kwargs)

        self.vocab_size = len(word_index) + 1
        self.max_len = max_len
        self.kernel_sizes = kernel_sizes

        if weights != None:
            weights = np.array(weights)
            self.embedding = tf.keras.layers.Embedding(self.vocab_size, 
                                  embedding_dims,input_length=self.max_len, 
                                  weights=[weights],
                                  trainable=weights_trainable)
        else:
            self.embedding = tf.keras.layers.Embedding(self.vocab_size, 
                                  embedding_dims,
                                  input_length=self.max_len)

        self.convs = []
        self.max_poolings = []
        for i, k in enumerate(kernel_sizes):
            self.convs.append(tf.keras.layers.Conv1D(filter_size, k, activation="relu"))
            self.max_poolings.append(tf.keras.layers.GlobalAvgPool1D())
        self.dense = tf.keras.layers.Dense(class_num, activation='softmax')
        self.bn = tf.keras.layers.BatchNormalization()
        self.attention = tf.keras.layers.Attention()

    def call(self, inputs, training=True):
        q_embed = self.embedding(inputs)
        v_embed = self.embedding(inputs)
        convs = []

        for i, k in enumerate(self.kernel_sizes):
            q = self.convs[i](q_embed)
            v = self.convs[i](v_embed)

            q = self.max_poolings[i](q)
            v = self.max_poolings[i](v)
            q_v = self.attention([q, v])

            convs.append(q_v)

        out = tf.keras.layers.concatenate(convs)

        out = self.bn(out, training=training)

        out = self.dense(out)

        return out


In [8]:
embedding_dims = 300
max_len= 100
filter_size = 2


data = load_data(train_examples[:], train_labels, 10000, 100)

model = TextCNNAttention(data["tokenizer"].word_index, embedding_dims, max_len, filter_size)



if not os.path.isdir("logs"):
    os.mkdir("logs")

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

begin loding data...


In [9]:
if not os.path.isdir("logs"):
    os.mkdir("logs")
tensorboard = TensorBoard(log_dir=os.path.join("logs", "IMDB"))

history = model.fit(data["X_train"], data["y_train"],
                    batch_size=256,
                    epochs=10,
                    validation_split = 0.1,
                    callbacks=[tensorboard])

model.save_weights("IMDB.h5", overwrite=True)
model.summary()

Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "text_cnn_attention"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  25961700  
_________________________________________________________________
conv1d (Conv1D)              multiple                  115328    
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  153728    
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  192128    
_________________________________________________________________
global_average_pooling1d (Gl multiple                  0         
_________________________________________________________________
global_average_pooli

In [11]:
def get_predictions(text):
    sequence = data["tokenizer"].texts_to_sequences([text])
    # pad the sequences
    sequence = pad_sequences(sequence, maxlen=100)
    # get the prediction
    prediction = model.predict(sequence)[0]
    return prediction, data["int2label"][np.argmax(prediction)]

text = "The movie is awesome!"
output_vector, prediction = get_predictions(text)
print("Output vector:", output_vector)
print("Prediction:", prediction)

Output vector: [0. 1.]
Prediction: positive


In [14]:
test_data = load_data(train_examples, train_labels, 10000, 100)

new_model = TextCNNAttention(test_data["tokenizer"].word_index, embedding_dims, max_len, filter_size)
new_model.load_weights("IMDB.h5", by_name=True)

begin loding data...


In [15]:
text = "The movie is awesome!"
sequence = test_data["tokenizer"].texts_to_sequences([text])
sequence = pad_sequences(sequence, maxlen=100)
prediction = new_model.predict(sequence)[0]

print(prediction)
print(test_data["int2label"][np.argmax(prediction)])

[0.48567432 0.5143257 ]
positive
