In [2]:
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds
import tensorflow as tf

In [4]:
!unzip -q //content/drive/MyDrive/datasets/IMDBDataset.csv.zip
!ls

 drive	'IMDB Dataset.csv'   sample_data


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv('/content/IMDB Dataset.csv', encoding='latin-1')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
test_idx = np.random.randint(0, high=len(df), size=10000)
train = df.drop(test_idx, axis=0)
test = df.loc[test_idx]
X_train, y_train = train.pop('review'), train.pop('sentiment')
X_test, y_test = test.pop('review'), test.pop('sentiment')

In [8]:
X_train

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
5        Probably my all-time favorite movie, a story o...
                               ...                        
49993    Robert Colomb has two full-time jobs. He's kno...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 40946, dtype: object

In [9]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(df.iloc[:, 0].values)

In [10]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])

In [11]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
# test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [12]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'with', 'for', 'movie', 'but'],
      dtype='<U14')

In [13]:
model = tf.keras.Sequential([encoder,
                             tf.keras.layers.Embedding(
                                 input_dim=len(encoder.get_vocabulary()),
                                 output_dim=64,
                                 # Use masking to handle the variable sequence lengths
                                 mask_zero=True
                             ),
                             tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, dropout=.3,return_sequences=True)),
                             tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=.3)),
                             tf.keras.layers.Dense(32, activation='relu'),
                             tf.keras.layers.Dense(1)
                            ])

In [14]:
# predict on a sample text without padding.

sample_text = ('The movie was very best')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[-0.00533431]


In [3]:
# predict on a sample text with padding
padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

NameError: ignored

In [16]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 64)          64000     
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         49920     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 216,897
Trainable params: 216,897
Non-trainable params: 0
__________________________________________________

In [None]:
history = model.fit(X_train.values, pd.factorize(y_train)[0], epochs=10,
#                     validation_data=test_dataset,  
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [19]:
pd.factorize(y_test)[0]

array([0, 0, 0, ..., 0, 1, 0])

In [20]:
test_loss, test_acc = model.evaluate(X_test,pd.factorize(y_test)[0])

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 0.3983665108680725
Test Accuracy: 0.8489000201225281


In [1]:
plt.figure(figsize=(16,8))
plt.subplot(1,2,1)
plot_graphs(history, 'accuracy')
plt.ylim(None,1)
plt.subplot(1,2,2)
plot_graphs(history, 'loss')
plt.ylim(0,None)

NameError: ignored

In [22]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))

In [23]:
predictions

array([[0.33877072]], dtype=float32)