In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input/"))

# Any results you write to the current directory are saved as output.

['aclimdb']


## Import Libraries

In [2]:
import random
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Define Parameters

In [3]:
VOCAB_SIZE = 10000
EPOCHS = 10
PADDING = 'post'
TRUNC = 'post'
MAXLENGTH = 1024
EMB_DIM = 16
BATCH_SIZE = 1536

## Load Data

### Define directories

In [4]:
directory = "../input/aclimdb/aclImdb/"

train_sent = []
test_sent = []
train_labels = []
test_labels = []


### Load from files

In [7]:
def load_data(train_test, label_type):
    path = os.path.join(directory, train_test, label_type)
    label = 1 if (label_type=='pos') else 0
    dir_list = os.listdir(path)
    for file in dir_list:
        f = open(os.path.join(path,file))
        sentence = f.read()
        if train_test == 'train':
            train_sent.append(sentence)
            train_labels.append(label)
        if train_test == 'test':
            test_sent.append(sentence)
            test_labels.append(label)
        f.close()

load_data('train', 'pos')
load_data('train', 'neg')
load_data('test', 'pos')
load_data('test', 'neg')

print (train_sent[0])
print (train_labels[0])
print (test_sent[0])
print (test_labels[0])

Though this movie has a first rate roster of fine actors, special effects that are excellent, and a story line that is full of surprises, it wasn't picked up for studio distribution and went directly to DVD. Perhaps it contains too much 'anti-police force' information, or perhaps it is juts one too many action flicks released during a glut, but whatever the reason the big screens missed the opportunity, fortunately the new concept of releasing direct to DVD allows us to enjoy it.<br /><br />The theme is old: rookie reporter uncovers an inner circle of cops that are corrupt - in this case the F.R.A.T. (First Response Assault and Tactical) team, a group of well trained policeman created to clean up the mythical city of Edison from its low point of crime, drugs, prostitution etc. Working undercover the temptation of pocketing the confiscated goods and money proves too much of an opportunity and now, 15 years after its formation, FRAT is responsible for murder, drug trafficking, terrorizin

### Tokenize the text data

In [8]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sent)
word_index = tokenizer.word_index

print(len(word_index))

88582


### Sentences to Sequences

In [9]:
train_seq = tokenizer.texts_to_sequences(train_sent)
train_seq = pad_sequences(train_seq, padding=PADDING, truncating=TRUNC, maxlen=MAXLENGTH)

test_seq = tokenizer.texts_to_sequences(test_sent)
test_seq = pad_sequences(test_seq, padding=PADDING, truncating=TRUNC, maxlen=MAXLENGTH)

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

print(train_seq.shape)
print(test_seq.shape)
print(train_labels.shape)
print(test_labels.shape)

(25000, 1024)
(25000, 1024)
(25000,)
(25000,)


## Define Model

In [10]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMB_DIM, input_length=MAXLENGTH),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(32, activation='tanh'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

## Compile Model

In [11]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1024, 16)          160000    
_________________________________________________________________
bidirectional (Bidirectional (None, 1024, 128)         41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 244,801
Trainable params: 244,801
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [12]:
history = model.fit(train_seq,
                    train_labels,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(test_seq, test_labels),
                    verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
 2048/25000 [=>............................] - ETA: 2:39 - loss: 0.6384 - acc: 0.6919

KeyboardInterrupt: 

## Plot Data

In [13]:
acc = history.history['acc']
val_acc = history.history['val_acc']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(EPOCHS)

plt.plot(epochs, acc, 'r')
plt.plot(epochs, val_acc, 'b')
plt.xlabel('EPOCHS')
plt.ylabel('Accuracies')
plt.legend('Train Acc', 'Val Acc')
plt.figure()

plt.plot(epochs, loss, 'r')
plt.plot(epochs, val_loss, 'b')
plt.xlabel('EPOCHS')
plt.ylabel('Losses')
plt.legend('Train Loss', 'Val Loss')
plt.figure()

NameError: name 'history' is not defined