In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, Flatten, Bidirectional, LSTM, GRU, Dropout, GlobalAveragePooling1D, Conv1D
import tensorflow_datasets as tfds
# Turn off the tfds progress bars so they do not clutter the cell output.
tfds.disable_progress_bar()

# Load IMDB reviews as (text, label) pairs. The "test" split is bound to
# ds_val and serves as the validation set in the rest of the notebook.
(ds_train, ds_val), ds_info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)

# Show one raw training example: the review text followed by its label.
sample = next(iter(ds_train))
sample_review, sample_label = sample
print("Review:", sample_review.numpy())
print("Label:", sample_label.numpy())

Review: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
Label: 0


In [None]:
# Materialise both splits into plain Python lists so the Keras Tokenizer can
# consume them.
#
# Each review tensor holds a `bytes` value. It must be *decoded* to text:
# calling str() on bytes yields the repr (e.g. 'b"This was ..."'), which
# injects a leading "b", backslash escapes and "\xNN" sequences into the text
# that the tokenizer is later fitted on.
#
# NOTE(review): ds_train / ds_val are rebound to the tokenized, batched
# datasets further down in this cell, so re-running this cell without first
# re-running the dataset-loading cell will iterate over batches rather than
# raw reviews.
train_data, train_labels = [], []
for review, label in ds_train:
  # errors="replace" keeps a single malformed byte from aborting the loop.
  train_data.append(review.numpy().decode("utf-8", errors="replace"))
  train_labels.append(label.numpy())

val_data, val_labels = [], []
for review, label in ds_val:
  val_data.append(review.numpy().decode("utf-8", errors="replace"))
  val_labels.append(label.numpy())

print("Number of training samples:", len(train_data))
print("Number of validation samples:", len(val_data))

# Hyperparameters shared by the input pipeline and the model definition.
BATCH_SIZE = 64       # passed to Dataset.batch() in create_tfds
BUFFER_SIZE = 1_024   # passed to Dataset.shuffle() in create_tfds
VOCAB_SIZE = 1_000    # Tokenizer num_words and Embedding input_dim
EMBED_DIM = 64        # Embedding output_dim
MAX_SEQ_LEN = 256     # pad_sequences maxlen and Embedding input_length

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer: num_words caps the indices emitted when texts are
# converted to sequences, and words outside that range map to "<OOV>".
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="<OOV>",
)
# The word index is built from the training reviews only.
tokenizer.fit_on_texts(train_data)

def create_tfds(tokenizer, X, y, padding=False, shuffle=True):
  """Build a batched tf.data.Dataset of (features, label) pairs.

  Args:
    tokenizer: fitted Keras Tokenizer; only used when `padding` is True.
    X: sequence of review texts.
    y: sequence of labels aligned with `X`.
    padding: when True, `X` is converted to integer sequences and
      padded at the end (padding="post") to MAX_SEQ_LEN. When False, `X` is
      passed through unchanged.
    shuffle: when True (default, the previous behaviour), examples are
      shuffled with a buffer of BUFFER_SIZE before batching. Pass False for
      evaluation data, where shuffling only adds work.

  Returns:
    A tf.data.Dataset yielding batches of BATCH_SIZE examples.
  """
  if padding:
    X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=MAX_SEQ_LEN, padding="post")
  dataset = tf.data.Dataset.from_tensor_slices((X, y))
  if shuffle:
    # Shuffle before batching so individual examples, not whole batches, are mixed.
    dataset = dataset.shuffle(BUFFER_SIZE)
  return dataset.batch(BATCH_SIZE)

ds_train = create_tfds(tokenizer, train_data, train_labels, padding=True)
# Validation data is only evaluated, so it is left unshuffled.
ds_val = create_tfds(tokenizer, val_data, val_labels, padding=True, shuffle=False)

Number of training samples: 25000
Number of validation samples: 25000


In [None]:
# Stacked-LSTM sentiment classifier: Embedding -> LSTM(128) -> LSTM(256)
# -> Dropout -> LSTM(128) -> Dropout -> sigmoid output.

# Keyword arguments shared by all three LSTM layers, kept at the values the
# notebook set explicitly.
# NOTE(review): per the Keras LSTM docs these look like the settings needed
# for the fast cuDNN kernel, which with masking also requires right-padded
# inputs (the pipeline pads with padding="post") — confirm for the TF version
# in use.
lstm_kwargs = dict(activation='tanh',
                   recurrent_activation='sigmoid',
                   recurrent_dropout=0,
                   unroll=False,
                   use_bias=True)

model3 = Sequential()
# mask_zero=True marks index 0 (the value pad_sequences pads with) as padding
# so the LSTM layers skip those timesteps instead of processing them as real
# tokens. The Tokenizer never assigns index 0 to a word, so no vocabulary
# entry is lost.
model3.add(Embedding(input_dim=VOCAB_SIZE, input_length=MAX_SEQ_LEN, output_dim=EMBED_DIM, mask_zero=True))
# The first two LSTMs return the full sequence so the next LSTM has
# per-timestep input.
model3.add(LSTM(128, return_sequences=True, **lstm_kwargs))
model3.add(LSTM(256, return_sequences=True, **lstm_kwargs))
model3.add(Dropout(0.2))
# The last LSTM returns only its final output, which feeds the classifier head.
model3.add(LSTM(128, **lstm_kwargs))
model3.add(Dropout(0.1))
# Single sigmoid unit for the binary label.
model3.add(Dense(1, activation="sigmoid"))
model3.summary()


# Binary classification: sigmoid output paired with binary cross-entropy.
model3.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])

# Stop once val_loss has not improved for 5 epochs. restore_best_weights=True
# rolls the model back to the epoch with the lowest val_loss; without it the
# model keeps the weights from the last (already degraded) epoch, and those
# are what evaluate() and any later save would use.
callbacks = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# NOTE(review): ds_val comes from the "test" split and also drives early
# stopping, so the score below is not an untouched hold-out estimate.
model3.fit(ds_train, epochs=30, validation_data=ds_val, callbacks=[callbacks])

# Last expression of the cell: displays [loss, accuracy] on the validation set.
model3.evaluate(ds_val)

# VALIDATION LOSS SHOULD BE LESS THAN 0.3

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 256, 64)           64000     
                                                                 
 lstm_18 (LSTM)              (None, 256, 128)          98816     
                                                                 
 lstm_19 (LSTM)              (None, 256, 256)          394240    
                                                                 
 dropout_8 (Dropout)         (None, 256, 256)          0         
                                                                 
 lstm_20 (LSTM)              (None, 128)               197120    
                                                                 
 dropout_9 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 1)                

[0.37432342767715454, 0.8437600135803223]

In [None]:
# Save the trained model to a relative path, i.e. into the current working directory.
saved_model_path = 'IMDBreviews.h5'
model3.save(saved_model_path)