In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import os

In [2]:
# Read movie_data.csv (UTF-8 encoded) into a pandas DataFrame.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [3]:
# Remove the 'sentiment' column from df and keep it as a separate label Series.
# NOTE: pop() mutates df in place, so re-running this cell without reloading
# df raises KeyError because the column is already gone.
target = df.pop('sentiment')

target 是 Series,df 是 DataFrame;`df.values` 和 `target.values` 返回的都是 NumPy 数组。

In [4]:
# Step 1: Create a dataset
# Each element is a (features, label) pair sliced row-wise from the
# DataFrame values and the label values.

ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.values, target.values))

## inspection: for three examples, print the first 50 bytes of the first
## feature value together with the label.
for ex in ds_raw.take(3):
    tf.print(ex[0].numpy()[0][:50], ex[1])
# `ex` still refers to the last example taken by the loop above.
type(ex[0].numpy()[0])

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


bytes

In [5]:
# Fix the global TensorFlow seed before shuffling.
tf.random.set_seed(1)

# reshuffle_each_iteration=False keeps a single fixed order, which the
# take()/skip() splits below rely on to stay disjoint between iterations.
# NOTE: this re-binds ds_raw, so re-running the cell stacks another shuffle.
ds_raw = ds_raw.shuffle(
    50000, reshuffle_each_iteration=False)

# First 25,000 examples -> test; from the remainder, the next 20,000 ->
# train and whatever is left -> validation.
ds_raw_test = ds_raw.take(25000)
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

In [6]:
# Inspect the dataset type returned by take().
type(ds_raw_test)

tensorflow.python.data.ops.dataset_ops.TakeDataset

In [7]:
## Step 2: find unique tokens (words)

from collections import Counter

# Use tfds.features.text.Tokenizer when that attribute path exists; on
# AttributeError fall back to the same class under tfds.deprecated.text.
try:
    tokenizer = tfds.features.text.Tokenizer()
except AttributeError:
    tokenizer = tfds.deprecated.text.Tokenizer()
    
# Token -> occurrence count, accumulated over the training split only.
token_counts = Counter()

for example in ds_raw_train:
    # example[0].numpy()[0] is the first feature value of the example.
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    token_counts.update(tokens)
    
# The number of distinct tokens seen is reported as the vocabulary size.
print('Vocab-size:', len(token_counts))

Vocab-size: 87007


In [68]:
# Confirm the container type holding the token counts.
print(type(token_counts))

<class 'collections.Counter'>


In [9]:
## Step 3: encoding each unique token into integers

# Build the encoder from token_counts. Use tfds.features.text when that
# attribute path exists; on AttributeError fall back to tfds.deprecated.text.
try:
    encoder = tfds.features.text.TokenTextEncoder(token_counts)
except AttributeError:
    encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

# Sanity check: encode a short sentence; as the last expression of the cell,
# the resulting list of integer ids is displayed.
example_str = 'This is an example!'
encoder.encode(example_str)

[232, 9, 270, 1123]

In [11]:
## Step 3-A: define the function for transformation

def encode(text_tensor, label):
    """Encode the first element of ``text_tensor`` with the module-level encoder.

    Args:
        text_tensor: Object exposing ``.numpy()``; element 0 of that result
            is the text handed to ``encoder.encode``.
        label: Passed through unchanged.

    Returns:
        A ``(encoded_text, label)`` tuple.
    """
    raw_review = text_tensor.numpy()[0]
    return encoder.encode(raw_review), label

## Step 3-B: wrap the encode function to a TF Op.
def encode_map_fn(text, label):
    """Wrap ``encode`` in ``tf.py_function`` so it can be used with ``Dataset.map``.

    Args:
        text: Text tensor forwarded to ``encode``.
        label: Label tensor forwarded to ``encode``.

    Returns:
        The result of ``tf.py_function``, declared as two ``tf.int64`` outputs.
    """
    output_dtypes = (tf.int64, tf.int64)
    return tf.py_function(encode, inp=[text, label], Tout=output_dtypes)

In [31]:
# Apply the encoding op to each raw split (train, valid, test, in that order).
ds_train, ds_valid, ds_test = (
    raw_split.map(encode_map_fn)
    for raw_split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

# Peek at three shuffled training examples and report their sequence shapes.
tf.random.set_seed(1)
for example in ds_train.shuffle(1000).take(3):
    print('Sequence length:', example[0].shape)

# `example` is the last item yielded by the loop: show its label, then its ids.
print(example[1], example[0], sep='\n')

Sequence length: (24,)
Sequence length: (179,)
Sequence length: (262,)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(
[15465  7185  1017   212    96    49   534  6664     6    14 18451  7125
    32  1571   308 11258   100    14  8382 15596    80   668  3025   117
   731     9 13897   642     6    14  7742   515    96  7789   318   318
   636     8  2448    96   370  1854   280    14  1280    12   117  4998
 16602    34     6   228  1566     9  7379  3203 12692    76   425  1689
     8  1043  1591    96  1159   841   420   370  1854   280   117  1659
  1410    56  3038   308  4069    96  7379  3203 12692  8067 19255 19380
    39    76  1953    96   173  4973    14  1820   425  1964    32    75
   684   436  2155     8   977   257   148  5571   148   534    13    14
  1679    25  1159  1097     8    75   436  9953     6  4912    13   442
  1681   148     8   283  1138    35   704  8537  1820    32     8   503
     2   436 15438   249   104   156     6    35  6542    96   459    96
    14

In [13]:
# ## Take a small subset

# ds_subset = ds_train.take(8)
# for example in ds_subset:
#     print('Individual size:', example[0].shape)

# ## batching the datasets
# ds_batched = ds_subset.padded_batch(
#     4, padded_shapes=([-1], []))

# for batch in ds_batched:
#     print('Batch dimension:', batch[0].shape)


tensorflow.python.data.ops.dataset_ops.TakeDataset

In [14]:
## batching the datasets
# Every split gets the same treatment: batches of 32, with the sequence
# dimension padded per batch ([-1]) and the scalar label left as-is ([]).
train_data, valid_data, test_data = (
    encoded_split.padded_batch(32, padded_shapes=([-1], []))
    for encoded_split in (ds_train, ds_valid, ds_test)
)

In [15]:
# Inspect the dataset type produced by padded_batch().
type(train_data)

tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset

In [16]:
# Number of distinct tokens held in token_counts.
len(token_counts)

87007

# 模型建立

In [47]:
embedding_dim = 20
# Two ids beyond the counted tokens -- presumably reserved for padding and
# out-of-vocabulary tokens; TODO confirm against the encoder.
vocab_size = len(token_counts) + 2

tf.random.set_seed(1)

## build the model
# Assembled layer by layer: embedding -> bidirectional LSTM -> dense head
# with a single sigmoid output.
bi_lstm_model = tf.keras.Sequential()
bi_lstm_model.add(tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embedding_dim, name='embed-layer'))
bi_lstm_model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, name='lstm-layer'), name='bidir-lstm'))
bi_lstm_model.add(tf.keras.layers.Dense(64, activation='relu'))
bi_lstm_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

bi_lstm_model.summary()

## compile and train:
# from_logits=False because the last layer already applies a sigmoid.
bi_lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

history = bi_lstm_model.fit(train_data, validation_data=valid_data, epochs=10)

## evaluate on the test data
# evaluate() returns [loss, accuracy]; index 1 is the accuracy metric.
test_results = bi_lstm_model.evaluate(test_data)
print('Test Acc.: {:.2f}%'.format(test_results[1]*100))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1740180   
_________________________________________________________________
bidir-lstm (Bidirectional)   (None, 128)               43520     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,792,021
Trainable params: 1,792,021
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Acc.: 83.84%


In [88]:
# Location of the saved Bi-LSTM model file.
model_path = 'models/Bidir-LSTM-full-length-seq.h5'

# exist_ok=True replaces the separate exists()/mkdir() check.
os.makedirs('models', exist_ok=True)

# Save only when no file exists yet: a fresh Restart & Run All then produces
# the file that load_model() needs, while a previously saved model is never
# overwritten by a re-run.
if not os.path.exists(model_path):
    bi_lstm_model.save(model_path)

new_model = tf.keras.models.load_model(model_path)


In [93]:
my = 'the people is so ugly'
# Wrap the encoded ids in an outer list so the array has a leading batch
# axis of size 1.
a = np.array([encoder.encode(my)])
result = new_model.predict(x=a)

# result[0][0] is the model output for the single input; 0.5 is the cut-off
# used to pick the message.
print('it is a good sentence!' if result[0][0] >= 0.5
      else 'it is a bad sentence!')

it is a bad sentence!


In [65]:
import json
import pickle

# Persist the token counts to disk.
with open('myCounter.pickle', 'wb') as outputfile:
    pickle.dump(token_counts, outputfile)

# Read the file back in binary *read* mode. Opening it with 'wb' here would
# truncate the file that was just written and make pickle.load() fail on a
# write-only handle.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickle files you created yourself.
with open('myCounter.pickle', 'rb') as inputfile:
    restored_token_counts = pickle.load(inputfile)

# The round-trip must reproduce the original counts exactly.
assert restored_token_counts == token_counts