In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.datasets import imdb


In [None]:
# Fetch the IMDB review dataset as (features, labels) pairs for the train and
# test splits. num_words=10000 caps the vocabulary size (see the Keras IMDB
# dataset docs for how the cap is applied).
train_split, test_split = imdb.load_data(num_words=10000)
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Stack the train and test splits back into one pool of reviews and one pool of
# labels (train rows first, then test rows).
data = np.concatenate([X_train, X_test], axis=0)
label = np.concatenate([y_train, y_test], axis=0)


In [None]:
# Show both feature-array shapes. A notebook cell renders only its final
# expression, so the two shapes are combined into one tuple; written as two
# separate lines, the first shape would never be displayed.
X_train.shape, X_test.shape

In [None]:
# Show both label-array shapes. A notebook cell renders only its final
# expression, so the two shapes are combined into one tuple; written as two
# separate lines, the first shape would never be displayed.
y_train.shape, y_test.shape

In [None]:
# A review is stored as a sequence of integers: each word has been replaced by
# its index in the dataset vocabulary.
print("Review is ", X_train[0])
# The second value is the target for that review, so it is labelled as such
# (it was previously printed under the "Review is" caption as well).
print("Label is ", y_train[0])

In [None]:
vocab = imdb.get_word_index()  # Retrieve the word index file mapping words to indices
# Printing the whole mapping floods the cell output, so report its size and a
# ten-entry sample instead.
print("Vocabulary size:", len(vocab))
print(dict(list(vocab.items())[:10]))

In [None]:
# Display the training labels (the cell's last expression is rendered as output).
y_train

In [None]:
# Display the test labels (the cell's last expression is rendered as output).
y_test

In [None]:
def vectorize(sequences, dimension = 10000):
    """Multi-hot encode integer sequences into a fixed-width 2-D matrix.

    Every sequence becomes one row of length ``dimension``: the row is all
    zeros except for a 1 in each column whose index appears in the sequence.
    Repeated indices within a sequence still produce a single 1.

    Args:
        sequences: Sized iterable of integer index sequences (one per review).
        dimension: Number of columns in the output, i.e. the number of
            distinct indices that can be represented.

    Returns:
        A NumPy array of shape ``(len(sequences), dimension)`` holding 0s and
        1s. Empty input yields an array with zero rows.

    Raises:
        IndexError: Raised by NumPy when a sequence contains an index that
            does not fit inside ``dimension`` columns.
    """
    # Create an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # Indexing row i with the whole sequence sets all listed columns at once.
        results[i, sequence] = 1
    # Return only after the loop: returning from inside it would stop after
    # the first sequence and leave every other row as zeros.
    return results

In [None]:
# Positional split of the pooled data: the first 10,000 entries form the test
# portion and everything after them forms the training portion.
test_x, train_x = data[:10000], data[10000:]
test_y, train_y = label[:10000], label[10000:]

In [None]:
# Shape of the test features (the first 10,000 entries of `data`).
test_x.shape

In [None]:
# Shape of the training features (the entries of `data` from position 10,000 on).
train_x.shape

In [None]:
# Shape of the training labels (the entries of `label` from position 10,000 on).
train_y.shape

In [None]:
# Shape of the test labels (the first 10,000 entries of `label`).
test_y.shape

In [None]:
# Distinct label values present in the pooled dataset.
print("Categories:", np.unique(label))
# Flatten all reviews into one long array of word ids and count the distinct ones.
print("Number of unique words:", np.unique(np.hstack(data)).size)

In [None]:
# Number of word ids in each review, then the mean and (rounded) standard
# deviation of those lengths.
length = list(map(len, data))
print("Average Review length:", np.mean(length))
print("Standard Deviation:", round(np.std(length)))

In [None]:
# Labels of the first two pooled reviews, followed by the first review's word ids.
# Output recorded in the original notebook for these two prints:
#   Label: 1
#   Label: 0
# (Those output lines had been pasted into the cell as if they were code.)
print("Label:", label[0])
print("Label:", label[1])
print(data[0])

In [None]:
# word -> id mapping shipped with the dataset
index = imdb.get_word_index()
# Inverted mapping: id -> word
reverse_index = {word_id: word for word, word_id in index.items()}
# Translate the first review back to text. Ids are shifted down by 3 before the
# lookup (presumably because the loader reserves the lowest ids for special
# tokens; confirm against the Keras IMDB docs). Ids with no match become "#".
decoded_words = (reverse_index.get(token - 3, "#") for token in data[0])
decoded = " ".join(decoded_words)
print(decoded)

In [None]:
# Multi-hot encode every review so each one becomes a fixed-width row.
# NOTE: this rebinds `data`, so the cell must be run exactly once per kernel
# session; re-running it would feed the already-encoded matrix to vectorize.
data = vectorize(data)
label = np.array(label).astype("float32")
labelDF=pd.DataFrame({'label':label})
# Class-balance bar chart. `sns` is never imported in this notebook, which made
# the original seaborn call fail with NameError on a fresh kernel, so the
# per-class counts are plotted with pandas' built-in plotting instead.
class_counts = labelDF['label'].value_counts().sort_index()
ax = class_counts.plot(kind='bar')
ax.set_xlabel('label')
ax.set_ylabel('count')

In [None]:
# Split the encoded reviews and labels, holding out 20% for testing;
# random_state=1 pins the split so reruns produce the same partition.
# train_test_split is already imported in the notebook's first cell, so the
# duplicate import that used to sit here has been dropped.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the raw
# splits returned by imdb.load_data earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.20,
    random_state=1,
)

In [None]:
from keras import layers, models
from keras.utils import to_categorical

# Start from an empty sequential model; layers are appended in the cells below.
model = models.Sequential()

In [None]:
# First hidden layer: 50 ReLU units fed by 10,000-wide input rows.
# NOTE: running this cell again appends another copy of the layer to `model`.
model.add(
    layers.Dense(50, activation = "relu", input_shape=(10000, ))
)

In [None]:
# Rest of the network, appended in order:
# dropout -> 50-unit ReLU -> dropout -> 50-unit ReLU -> single sigmoid output.
remaining_layers = (
    layers.Dropout(0.3, noise_shape=None, seed=None),
    layers.Dense(50, activation = "relu"),
    layers.Dropout(0.2, noise_shape=None, seed=None),
    layers.Dense(50, activation = "relu"),
    layers.Dense(1, activation = "sigmoid"),
)
for layer in remaining_layers:
    model.add(layer)
# Print the layer-by-layer overview of the assembled model.
model.summary()

In [None]:
import tensorflow as tf

# Early-stopping callback that watches the 'loss' metric with a patience of 3.
# NOTE(review): the fit call in this notebook uses epochs=2, which is fewer than
# the patience value, so confirm whether this callback is expected to trigger.
callback = tf.keras.callbacks.EarlyStopping(
    patience=3,
    monitor='loss',
)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# Train for 2 epochs in batches of 500 and keep the per-epoch history in
# `results`. (An unused mid-cell re-import of train_test_split was removed.)
# NOTE(review): (X_test, y_test) is used as validation data here and is also
# the set passed to model.evaluate later; confirm that reuse is intended.
results = model.fit(
    X_train, y_train,
    epochs=2,
    batch_size=500,
    validation_data=(X_test, y_test),
    callbacks=[callback],
)


In [None]:
# Average of the per-epoch validation accuracy recorded during training.
val_accuracy_per_epoch = results.history["val_accuracy"]
print(np.mean(val_accuracy_per_epoch))

In [None]:
# Evaluate on the held-out split; with the loss and single metric configured at
# compile time, score[0] is the loss and score[1] the accuracy.
score = model.evaluate(X_test, y_test, batch_size=500)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)


In [None]:
# List the metric names recorded in the training history (the plotting cell
# below reads 'accuracy', 'val_accuracy', 'loss' and 'val_loss' from it).
print(results.history.keys())

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric: (training key, validation key, figure title).
# The training key doubles as the y-axis label.
curves = (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
)
for train_key, val_key, title in curves:
    plt.plot(results.history[train_key])
    plt.plot(results.history[val_key])
    plt.title(title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    # The second curve is the validation series passed to fit().
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()