## Keras `IMDB` dataset.
* This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment (positive/negative). Reviews have been preprocessed, and each review is encoded as a list of word indexes (integers). 

In [61]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import datasets
from tensorflow import keras

In [2]:
dir(datasets)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_sys',
 'boston_housing',
 'cifar10',
 'cifar100',
 'fashion_mnist',
 'imdb',
 'mnist',
 'reuters']

> Loading the data.

In [4]:
imdb = datasets.imdb.load_data(num_words=10000)

In [9]:
(X_train, y_train),(X_test, y_test) = imdb

In [12]:
print(X_train[0])
y_train[:2]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


array([1, 0], dtype=int64)

> Let's join the train and test sets.

In [46]:
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])
X, y

(array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
        list([1, 194, 1153, 194, 8255, 78, 22

> So the `X_train[0]` is just a list of integers that doesn't make sense to a human for now, but we can say it
is a positive review about the movie according to the label we get.

In [7]:
word_indices = datasets.imdb.get_word_index()
word_indices

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

> Let's create a function that decords integers lists into sentences.

In [8]:
word_indices_reversed = dict([(value, key) for (key, value) in word_indices.items()])
word_indices_reversed

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [41]:
def decord(sent):
    INDEX_FROM=3
    return " ".join([word_indices_reversed.get(i - INDEX_FROM, '#') for i in sent[0]])+"..."

In [47]:
decord([X[1]])

"# big hair big boobs bad music and a giant safety pin these are the words to best describe this terrible movie i love cheesy horror movies and i've seen hundreds but this had got to be on of the worst ever made the plot is paper thin and ridiculous the acting is an abomination the script is completely laughable the best is the end showdown with the cop and how he worked out who the killer is it's just so damn terribly written the clothes are sickening and funny in equal # the hair is big lots of boobs # men wear those cut # shirts that show off their # sickening that men actually wore them and the music is just # trash that plays over and over again in almost every scene there is trashy music boobs and # taking away bodies and the gym still doesn't close for # all joking aside this is a truly bad film whose only charm is to look back on the disaster that was the 80's and have a good old laugh at how bad everything was back then..."

> Lets create a function that will encode a given sentence to `word_embedings_list`.

In [43]:
def encode(sent):
    pass

> "Data preparation".

> We want to preapare the sentences to have a same width. This is sometimes called `pad_sequencing` we are just make all sentences to have the same width by trancating long sentencs and appending 0 to shorter sentences.

In [51]:
def vectorize(sequences, dim=10000):
    res = np.zeros((len(sequences), dim))
    for i, seq in enumerate(sequences):
        res[i, seq] = 1
    return res

In [52]:
X_data = vectorize(X)

In [53]:
X_data[0], len(X_data[0]), len(X_data[1])

(array([0., 1., 1., ..., 0., 0., 0.]), 10000, 10000)

> Converting the `X_data` and `y` to tensorflow_tensors.

In [57]:
X_tensors = tf.convert_to_tensor(X_data)
y_tensors = tf.convert_to_tensor(y)

In [60]:
y_tensors, X_tensors, y_tensors.shape, X_tensors.shape

(<tf.Tensor: shape=(50000,), dtype=int64, numpy=array([1, 0, 0, ..., 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(50000, 10000), dtype=float64, numpy=
 array([[0., 1., 1., ..., 0., 0., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        ...,
        [0., 1., 1., ..., 0., 0., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        [0., 1., 1., ..., 0., 0., 0.]])>,
 TensorShape([50000]),
 TensorShape([50000, 10000]))

> Creating a `Functional NN`

### A `FF-Neural-Net`

In [75]:
X_tensors[0].shape

TensorShape([10000])

In [86]:
input_layer = keras.layers.Input(shape=(10000, ), name="input_shape")
hl_1 = keras.layers.Dense(32, activation="relu", name="hl_1")(input_layer)
hl_2 = keras.layers.Dense(64, activation="relu", name="hl_2")(hl_1)
hl_3 = keras.layers.Dense(128, activation="relu", name="hl_2")(hl_2)
dropout_layer = keras.layers.Dropout(0.3, noise_shape=None, seed=None, name="dropout_layer")(hl_3)
hl_4 = keras.layers.Dense(256, activation="relu", name="hl_4")(dropout_layer)
hl_5 = keras.layers.Dense(128, activation="relu", name="hl_5")(hl_4)
hl_6 = keras.layers.Dense(64, activation="relu", name="hl_6")(hl_5)
dropout_layer_1 = keras.layers.Dropout(0.3, noise_shape=None, seed=None, name="dropout_layer_1")(hl_6)
hl_7 = keras.layers.Dense(32, activation="relu", name="hl_7")(dropout_layer_1)
output_layer = keras.layers.Dense(1, activation="sigmoid", name="output_layer")(hl_7)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss = keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["acc"]
)

NameError: name 'shape' is not defined

In [85]:
model.fit(
    X_tensors, y_tensors, epochs=2, validation_split=.3, batch_size=256
)

Epoch 1/2


ValueError: in user code:

    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:754 train_step
        y_pred = self(x, training=True)
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\crisp\Documents\anaconda3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:234 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_6 is incompatible with the layer: : expected min_ndim=3, found ndim=2. Full shape received: (None, 10000)


In [69]:
predictions = model.predict(X_tensors[:5])

In [74]:

predictions= tf.squeeze(tf.round(predictions))
predictions, y_tensors[:5]

(<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 0., 1., 0.], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 1, 0], dtype=int64)>)