<a href="https://colab.research.google.com/github/BillFei/tensorflow/blob/master/keras-text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Install the exact library versions this notebook was written against.
# Both packages are pinned; only the first install uses -q to quiet pip's output.
!pip install -q tensorflow==2.0.0-beta1
!pip install numpy==1.16.2



In [32]:


from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import numpy as np
from tensorflow import keras

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.datasets import imdb



# Report the versions of the libraries in use, one per line
# (TensorFlow, then its bundled Keras, then NumPy).
for library in (tf, keras, np):
  print(library.__version__)

2.0.0-beta1
2.2.4-tf
1.16.2


In [0]:
def create_ngram_set(input_list, ngram_value):
  """Return the set of distinct n-grams found in a sequence.

  Args:
    input_list: sliceable sequence of tokens (e.g. a list of word ids).
    ngram_value: size n of the n-grams to extract.

  Returns:
    A set of tuples, each of length `ngram_value`, one per distinct run of
    consecutive tokens. Empty when the sequence is shorter than
    `ngram_value` (or when `ngram_value` is 0).

  Example:
    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2))
    [(1, 4), (4, 1), (4, 9), (9, 4)]
  """
  # One copy of the sequence per position inside the n-gram, each shifted
  # one token further to the left than the previous one.
  shifted_views = []
  for offset in range(ngram_value):
    shifted_views.append(input_list[offset:])
  # Zipping the shifted copies lines up consecutive tokens; zip stops at the
  # shortest copy, so incomplete trailing windows are dropped.
  return set(zip(*shifted_views))

def add_ngram(sequences, token_indice, ngram_range=2):
  """Augment each sequence by appending the ids of the n-grams it contains.

  Args:
    sequences: iterable of token sequences (lists, tuples or arrays of ids).
      The inputs are not modified.
    token_indice: dict mapping an n-gram (tuple of token ids) to the integer
      id that represents it.
    ngram_range: largest n-gram size to look up; sizes 2..ngram_range are
      scanned. With a value below 2 the sequences are only copied.

  Returns:
    A list of new lists: the original tokens followed by the id of every
    known n-gram, in order of n-gram size and then position.

  Example:
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
  """
  new_sequences = []
  for input_list in sequences:
    # Freeze the original tokens: windows must slide over these only, never
    # over n-gram ids appended by an earlier (smaller n) pass.
    tokens = list(input_list)
    # list(...) also guarantees an appendable copy for tuple/array inputs.
    new_list = list(tokens)
    for ngram_value in range(2, ngram_range + 1):
      for i in range(len(tokens) - ngram_value + 1):
        ngram = tuple(tokens[i:i + ngram_value])
        if ngram in token_indice:
          new_list.append(token_indice[ngram])
    new_sequences.append(new_list)
  return new_sequences



In [34]:
# --- Hyperparameters -------------------------------------------------------
ngram_range = 1        # values above 1 enable the n-gram augmentation cell
max_features = 20000   # num_words passed to the IMDB loader; also the Embedding input size
maxlen = 400           # length every sequence is padded/truncated to
batch_size = 32        # passed to model.fit
embedding_dims = 50    # output size of the Embedding layer
epochs = 5             # passed to model.fit

# --- Data ------------------------------------------------------------------
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# dtype=int makes np.mean report a whole number of tokens.
avg_train_len = np.mean([len(seq) for seq in x_train], dtype=int)
print('Average train sequence length: {}'.format(avg_train_len))
avg_test_len = np.mean([len(seq) for seq in x_test], dtype=int)
print('Average test sequence length: {}'.format(avg_test_len))

Loading data...
25000 train sequences
25000 test sequences
Average train sequence length: 238
Average test sequence length: 230


In [0]:
# Optional n-gram augmentation; skipped entirely when ngram_range is 1.
# NOTE: this cell overwrites max_features, x_train and x_test, so it should
# be run at most once per fresh load of the data.
if ngram_range > 1:
  print('Adding {}-gram features'.format(ngram_range))

  # Collect every distinct n-gram (n = 2..ngram_range) seen in the training set.
  ngram_set = set()
  for train_sequence in x_train:
    for n in range(2, ngram_range + 1):
      ngrams_in_sequence = create_ngram_set(train_sequence, ngram_value=n)
      ngram_set.update(ngrams_in_sequence)

  # Assign each n-gram an integer id starting just above max_features,
  # so the new ids sit above the word-id range.
  start_index = max_features + 1
  token_indice = {}
  for offset, ngram in enumerate(ngram_set):
    token_indice[ngram] = offset + start_index
  # Reverse lookup: id -> n-gram.
  indice_token = {index: ngram for ngram, index in token_indice.items()}

  # max_features becomes the highest id in the dataset plus one.
  max_features = np.max(list(indice_token)) + 1

  # Append the matching n-gram ids to every train and test sequence.
  x_train = add_ngram(x_train, token_indice, ngram_range)
  x_test = add_ngram(x_test, token_indice, ngram_range)

  avg_train_len = np.mean([len(seq) for seq in x_train], dtype=int)
  print('Average train sequence length: {}'.format(avg_train_len))
  avg_test_len = np.mean([len(seq) for seq in x_test], dtype=int)
  print('Average test sequence length: {}'.format(avg_test_len))

In [35]:
print('Pad sequences (samples x time)')
# Bring both splits to a fixed width of maxlen so they form 2-D arrays
# of shape (samples, maxlen).
x_train, x_test = [
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
]
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [37]:
print('Build model...')
model = Sequential([
    # Map each of the max_features ids to an embedding_dims-sized vector.
    Embedding(max_features, embedding_dims, input_length=maxlen),
    # Average the embeddings across the sequence into one vector per sample.
    GlobalAveragePooling1D(),
    # Single sigmoid unit: one output per sample for the binary label.
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# NOTE: the test split doubles as validation data here, so the reported
# validation metrics are not from an independent hold-out set.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))


Build model...


W0731 12:13:22.461377 139981818615680 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f4fbf798080>

In [38]:
# Run the trained model over the padded test set. The final layer is a
# single sigmoid unit, so there is one output value per sample.
predictions = model.predict(x_test)
# Display the model output for the first test sample.
predictions[0]

array([0.20629251], dtype=float32)