In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

## Part 1: Download the Data

In [None]:
# Load the IMDB reviews as integer sequences plus 0/1 labels.
# num_words=10_000 caps the vocabulary (per the Keras imdb docs, only the most
# frequent words are kept); it must match the `dimension` used for the
# multi-hot encoding further down.
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(num_words=10_000)

In [None]:
# Sanity check: display the shapes of the four arrays returned by load_data
train_data.shape, train_labels.shape, test_data.shape, test_labels.shape

In [None]:
# Print the first training example as loaded; convert_to_english below decodes
# such an example by treating it as a list of integer word indices
print(train_data[0])

In [None]:
def convert_to_english(list_of_integers, word_index=None):
    """
    Decode an integer-encoded review into a space-separated string of words.

    list_of_integers: iterable of integer word indices, as stored in the IMDB dataset
    word_index: optional dict mapping word -> index. When omitted, the Keras IMDB
        word index is fetched on the first call and reused afterwards.

    Returns: the decoded review (str). Indices without a matching word become "?".
    """
    if word_index is None:
        # get_word_index() loads the full vocabulary each time it is called, so the
        # reversed mapping is kept on the function object and built only once.
        reverse_word_index = getattr(convert_to_english, "_reverse_word_index", None)
        if reverse_word_index is None:
            default_index = tf.keras.datasets.imdb.get_word_index()
            reverse_word_index = {idx: word for (word, idx) in default_index.items()}
            convert_to_english._reverse_word_index = reverse_word_index
    else:
        # reverse the caller-supplied word index
        reverse_word_index = {idx: word for (word, idx) in word_index.items()}

    # Index 0, 1 and 2 are reserved for 'padding', 'start of sequence', and 'unknown',
    # so every real word is shifted by 3; the reserved values decode to "?".
    return " ".join(reverse_word_index.get(idx - 3, "?") for idx in list_of_integers)

# Decode the first training review to check that the index-to-word mapping reads sensibly
convert_to_english(train_data[0])

## Part 2: Prepare the Data

In [None]:
def convert_to_multi_hot(sequences, dimension):
    """
    Multi-hot encode a collection of integer sequences.

    sequences: sequence of integer-index sequences. Accepts a 2-D NumPy array,
        an object array of lists, or a plain list of lists (rows may differ in length).
    dimension: width of every output row; each index must be a valid column index.

    Returns: float32 array of shape (len(sequences), dimension) where
        output[i, j] == 1 if j occurs in sequences[i] and 0 otherwise.
    Raises: IndexError if an index is out of bounds for `dimension`.
    """
    # len() instead of .shape[0] so plain Python lists work as well as arrays
    output = np.zeros(shape=(len(sequences), dimension), dtype=np.float32)

    for row, indices in enumerate(sequences):
        # One vectorised assignment per row; repeated indices simply set the same
        # cell again. The explicit integer dtype keeps empty rows valid as an index.
        output[row, np.asarray(indices, dtype=np.intp)] = 1

    return output

In [None]:
# Tiny worked example; expected result:
#   [[0, 1, 1, 1, 0, 0],
#    [1, 0, 0, 0, 1, 1]]
# (repeated indices such as the two 1s / two 4s still yield a single 1)
convert_to_multi_hot(np.array([[1,2,3,1],[0,4,5,4]]), dimension=6)

In [None]:
# Multi-hot encode both splits with the same 10_000-wide vocabulary
X_train, X_test = (
    convert_to_multi_hot(reviews, dimension=10_000)
    for reviews in (train_data, test_data)
)
# The labels are used unchanged; they only get the conventional y_* names
y_train, y_test = train_labels, test_labels

In [None]:
# Carve the first 10_000 rows of the encoded test split off as validation data;
# whatever follows them remains the test set.
# NOTE: X_test / y_test are overwritten from themselves, so running this cell a
# second time would slice the already-shortened arrays again.
X_val, X_test = X_test[:10_000], X_test[10_000:]
y_val, y_test = y_test[:10_000], y_test[10_000:]

## Part 3: Build a Simple Model

In [None]:
# Reset the global state Keras keeps between model builds (per the Keras docs for
# clear_session) before defining and creating the model
tf.keras.backend.clear_session()
def get_model(input_dim=10_000, hidden_units=16):
    """
    Build an (uncompiled) fully connected binary classifier.

    input_dim: number of input features; defaults to the 10_000-wide multi-hot encoding
    hidden_units: width of each of the two ReLU hidden layers

    Returns: a tf.keras Sequential model with a single sigmoid output unit
    """
    return tf.keras.models.Sequential([
        # Declare the input with an explicit Input object; passing `input_shape=`
        # to the first Dense layer is discouraged by newer Keras versions.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(units=hidden_units, activation='relu'),
        tf.keras.layers.Dense(units=hidden_units, activation='relu'),
        # One sigmoid unit, so the output is a single value in (0, 1)
        tf.keras.layers.Dense(units=1, activation='sigmoid'),
    ])

In [None]:
# Instantiate the network and print its layers and parameter counts
model = get_model()
model.summary()

## Part 4: Compile the Model

In [None]:
# The model ends in a single sigmoid unit, hence binary cross-entropy as the loss;
# accuracy is tracked alongside it.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

## Part 5: Fit the Model

In [None]:
# Train for 20 epochs in mini-batches of 512, evaluating on the validation split
# after each epoch; the returned History object feeds the learning-curve plot.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_val, y_val),
)

In [None]:
def plot_learning_curves(history):
    """
    Plot the series recorded in a Keras History object against the epoch number.

    history: object exposing `.history` (dict of metric name -> per-epoch values,
        must contain 'loss') and `.epoch` (0-based epoch indices).

    Epochs are shown 1-based and the y-axis is fixed to [0, 1]. At most four
    series are drawn, because zip() stops once the four line styles run out.
    """
    line_styles = ["r-o", "r-*", "b-o", "b-*"]

    plt.figure(figsize=(8, 5))
    # Shift the 0-based epoch indices so the x-axis starts at 1
    epoch_numbers = np.array(history.epoch) + 1
    for (metric_name, values), line_style in zip(history.history.items(), line_styles):
        plt.plot(epoch_numbers, values, line_style, label=metric_name)

    plt.xlabel("Epoch")
    last_epoch = len(history.history['loss'])
    plt.axis([1, last_epoch, 0., 1])
    plt.legend(loc="lower left")
    plt.grid()

In [None]:
# Learning curves of the 20-epoch run fitted above
plot_learning_curves(history)

In [None]:
# Only train for 3 epochs, starting again from a freshly initialised model
# (presumably because the longer run above starts to overfit - check the curves)
model = get_model()
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    # Pass metrics as a list, matching the first compile call; newer Keras
    # versions reject a bare string here.
    metrics=['accuracy']
)
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=512
)

In [None]:
# Learning curves of the 3-epoch run fitted above
plot_learning_curves(history)

## Part 6: Evaluate on the Test Set

In [None]:
# Evaluate the compiled loss and accuracy metric on the remaining test rows
# (the ones not set aside for validation)
model.evaluate(X_test, y_test, batch_size=512)

In [None]:
# Predict in mini-batches instead of pushing the whole test matrix through the
# model in a single forward pass; predict() already returns a NumPy array of
# shape (n, 1). verbose=0 keeps the cell output-free, as before.
y_test_pred = model.predict(X_test, batch_size=512, verbose=0)

In [None]:
def find_confidently_wrong(y_true, y_pred, top=10):
    """
    Find the misclassified examples with the highest binary cross-entropy loss.

    y_true: the true labels (0/1). Shape (n, 1)
    y_pred: the predictions (floats). Shape (n, 1)
    top: maximum number of indices to return; must be >= 0

    Returns: list of at most `top` indices of misclassified examples, ordered
        from highest to lowest loss (empty if nothing is misclassified or top == 0)
    Raises: ValueError if `top` is negative; AssertionError if the inputs are
        not both of shape (n, 1)
    """
    assert len(y_true.shape) == len(y_pred.shape) == 2, "Rank should be 2"
    assert y_true.shape[0] == y_pred.shape[0], "Not the same length"
    assert y_true.shape[1] == y_pred.shape[1] == 1, "Second dimension should be 1"
    if top < 0:
        raise ValueError(f"top must be non-negative, got {top}")
    if top == 0:
        # A slice such as [-0:] selects *every* element, so handle 0 explicitly
        return []

    # Reduction.NONE keeps one loss value per example instead of averaging them
    bce = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    losses = bce(y_true, y_pred).numpy()

    # Restrict to misclassified examples first, so the `top` budget is never
    # spent on correctly classified rows
    y_pred_labels = np.where(y_pred > 0.5, 1, 0)
    misclassified = np.flatnonzero(np.ravel(y_true) != np.ravel(y_pred_labels))

    # argsort is ascending; reverse it so that the highest loss comes first
    ranked = misclassified[np.argsort(losses[misclassified])[::-1]]
    return ranked[:top].tolist()
    

In [None]:
import pprint

# find_confidently_wrong expects column vectors of shape (n, 1)
y_test_true = y_test.reshape(-1,1)
for idx in find_confidently_wrong(y_test_true, y_test_pred, top=10):
    actual_sentiment = 'POSITIVE' if y_test_true[idx][0] == 1 else 'NEGATIVE'
    predicted_sentiment = 'POSITIVE' if y_test_pred[idx][0] > 0.5 else 'NEGATIVE'
    # Add 10_000 because we used the first 10_000 elements as validation data
    pprint.pprint(convert_to_english(test_data[10_000 + idx]), width=80)
    print(f"is a {actual_sentiment} review but it was classified as {predicted_sentiment}.")
    print("**************************************")