In [1]:
import numpy as np
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(tf.test.is_gpu_available())

Version:  2.2.0
Eager mode:  True
Hub version:  0.8.0
GPU is available
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [2]:
train_data, test_data = tfds.load(name="imdb_reviews", split=["train", "test"], 
                                  batch_size=-1, as_supervised=True)

train_examples, train_labels = tfds.as_numpy(train_data)
test_examples, test_labels = tfds.as_numpy(test_data)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteODW4J9/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteODW4J9/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteODW4J9/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
print("Training entries: {}, test entries: {}".format(len(train_examples), len(test_examples)))
train_examples[:2]

Training entries: 25000, test entries: 25000


array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot 

In [4]:
train_labels[:2]

array([0, 0])

In [5]:
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
import numpy as n

def load_data(examples, targets, num_words, sequence_length, test_size=0.25, oov_token=None):

    reviews, labels = [], []

    for example, label in zip(examples, targets):
      reviews.append(str(example).strip())
      labels.append(str(label).strip())

    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(reviews)
    X = tokenizer.texts_to_sequences(reviews)
    X, y = np.array(X), np.array(labels)
    X = pad_sequences(X, maxlen=sequence_length)

    # convert labels to one-hot encoded
    y = to_categorical(y)


    data = {}
    data["X_train"] = X
    data["y_train"] = y
    data["tokenizer"] = tokenizer
    data["int2label"] =  {0: "negative", 1: "positive"}
    data["label2int"] = {"negative": 0, "positive": 1}

    return data

In [6]:
data = load_data(train_examples, train_labels, 10000, 100)

In [7]:
class TextBiRNN(tf.keras.Model):
    """构建TextBiRNN模型"""
    def __init__(self,
                 word_index,
                 maxlen,
                 vocab_size,
                 embedding_dims,
                 num_class=2,

                 ):
        super(TextBiRNN, self).__init__()
        
        #embedding_matrix = get_embedding_vectors(word_index, embedding_dims)
        self.embed = tf.keras.layers.Embedding(len(word_index) + 1, 
                                               embedding_dims, 
                                               input_length=maxlen)
        self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))
        self.dense = tf.keras.layers.Dense(64, activation='relu')
        self.logits = tf.keras.layers.Dense(num_class)

    def call(self, inputs):
        x = self.embed(inputs)
        x = self.bilstm(x)
        x = self.dense(x)
        x = self.logits(x)
        return x

In [8]:
model = TextBiRNN(data["tokenizer"].word_index,  300, 10000, 300)

model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [9]:
import os
# create these folders if they does not exist
if not os.path.isdir("results"):
    os.mkdir("results")
if not os.path.isdir("logs"):
    os.mkdir("logs")
if not os.path.isdir("data"):
    os.mkdir("data")
# load the data


model_name ="IMDB"

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# 在文件名中包含 epoch (使用 `str.format`)
checkpoint_path = "training_2/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# 创建一个回调，每 5 个 epochs 保存模型的权重
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    period=5)

# using tensorboard on 'logs' folder
tensorboard = TensorBoard(log_dir=os.path.join("logs", model_name))
# start training
history = model.fit(data["X_train"], data["y_train"],
                    batch_size=256,
                    epochs=20,
                    callbacks=[cp_callback],
                    verbose=1)

#tf.saved_model.save(model,'my_saved_model')
#model.save_weights('./checkpoints/my_checkpoint')





Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: saving model to training_2/cp-0005.ckpt
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: saving model to training_2/cp-0010.ckpt
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 00015: saving model to training_2/cp-0015.ckpt
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 00020: saving model to training_2/cp-0020.ckpt


In [10]:
new_model = TextBiRNN(data["tokenizer"].word_index,  300, 10000, 300)

In [11]:
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

'training_2/cp-0020.ckpt'

In [12]:
new_model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd908dc0780>

In [13]:
test_data = load_data(test_examples, test_labels, 10000, 100)

# 重新评估模型
new_model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
loss, acc = new_model.evaluate(data["X_train"],  data["y_train"], verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

782/782 - 14s - loss: 0.0046 - accuracy: 0.9989
Restored model, accuracy: 99.89%


In [14]:
def get_predictions(text):
    sequence = data["tokenizer"].texts_to_sequences([text])
    # pad the sequences
    sequence = pad_sequences(sequence, maxlen=300)
    # get the prediction
    prediction = new_model.predict(sequence)[0]
    return prediction, data["int2label"][np.argmax(prediction)]

In [15]:
text = "The movie is awesome!"
output_vector, prediction = get_predictions(text)
print("Output vector:", tf.nn.softmax(output_vector, axis=-1))
print("Prediction:", prediction)

Output vector: tf.Tensor([8.984826e-08 9.999999e-01], shape=(2,), dtype=float32)
Prediction: positive


In [20]:
res = new_model.predict(test_data["X_train"])
tf.nn.softmax(res[0], axis=-1)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.0000000e+00, 5.7249266e-10], dtype=float32)>