In [57]:
import tensorflow as tf
print(tf.__version__)

from tensorflow import feature_column

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku
from tensorflow.keras.utils import plot_model

import pandas as pd
from sklearn.model_selection import train_test_split

2.0.0-beta1


In [58]:
DATA_PATH = 'C:\SoloLearnMachineLearning\Stackoverflow\TextDataset.csv'

#it is just two column csv, like:
# text;label
# A wiki is run using wiki software;0
# otherwise known as a wiki engine.;1

dataframe = pd.read_csv(DATA_PATH, delimiter = ';')
dataframe.head()

Unnamed: 0,text,label
0,A wiki is run using wiki software,0
1,otherwise known as a wiki engine.,1
2,A wiki engine is a type of content management ...,1
3,"but it differs from most other such systems,in...",0
4,in that the content is created without any def...,0


In [59]:
# Preprocessing before feature_clolumn includes
# - getting the vocabulary
# - tokenization, which means only splitting on tokens. Encoding sentences with vocablary will be done by feature_column!
# - padding
# - truncating

# Build vacabulary
vocab_size = 100
oov_tok = '<OOV>'

sentences = dataframe['text'].to_list()

tokenizer = Tokenizer(num_words = vocab_size, oov_token="<OOV>")

tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# if word_index shorter then default value of vocab_size we'll save actual size
vocab_size=len(word_index)
print("vocab_size = word_index = ",len(word_index))

vocab_size = word_index =  59


In [60]:
# Split sentensec on tokens. here token = word
# text_to_word_sequence() has good default filter for charachters include basic punctuation, tabs, and newlines
dataframe['text'] = dataframe['text'].apply(tf.keras.preprocessing.text.text_to_word_sequence)

In [61]:
dataframe.head()

Unnamed: 0,text,label
0,"[a, wiki, is, run, using, wiki, software]",0
1,"[otherwise, known, as, a, wiki, engine]",1
2,"[a, wiki, engine, is, a, type, of, content, ma...",1
3,"[but, it, differs, from, most, other, such, sy...",0
4,"[in, that, the, content, is, created, without,...",0


In [62]:
max_length = 6

# paddind and trancating setnences
# do that directly with strings without using tokenizer.texts_to_sequences()
# the feature_colunm will convert strings into numbers
dataframe['text']=dataframe['text'].apply(lambda x, N=max_length: (x + N * [''])[:N])
dataframe['text']=dataframe['text'].apply(lambda x, N=max_length: x[:N])
dataframe.head()

Unnamed: 0,text,label
0,"[a, wiki, is, run, using, wiki]",0
1,"[otherwise, known, as, a, wiki, engine]",1
2,"[a, wiki, engine, is, a, type]",1
3,"[but, it, differs, from, most, other]",0
4,"[in, that, the, content, is, created]",0


In [63]:
# Define method to create tf.data dataset from Pandas Dataframe
def df_to_dataset(dataframe, label_column, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    #labels = dataframe.pop(label_column)
    labels = dataframe[label_column]

    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [64]:
# Split dataframe into train and validation sets
train_df, val_df = train_test_split(dataframe, test_size=0.2)

print(len(train_df), 'train examples')
print(len(val_df), 'validation examples')

7 train examples
2 validation examples


In [101]:
batch_size = 32
ds = df_to_dataset(dataframe, 'label',shuffle=False,batch_size=batch_size)

train_ds = df_to_dataset(train_df, 'label', batch_size=batch_size)
val_ds = df_to_dataset(val_df, 'label', shuffle=False, batch_size=batch_size)

In [102]:
# and small batch for demo
example_batch = next(iter(ds))[0]
example_batch

{'text': <tf.Tensor: id=8795, shape=(9, 6), dtype=string, numpy=
 array([[b'a', b'wiki', b'is', b'run', b'using', b'wiki'],
        [b'otherwise', b'known', b'as', b'a', b'wiki', b'engine'],
        [b'a', b'wiki', b'engine', b'is', b'a', b'type'],
        [b'but', b'it', b'differs', b'from', b'most', b'other'],
        [b'in', b'that', b'the', b'content', b'is', b'created'],
        [b'and', b'wikis', b'have', b'little', b'inherent', b'structure'],
        [b'allowing', b'structure', b'to', b'emerge', b'according', b'to'],
        [b'there', b'are', b'dozens', b'of', b'different', b'wiki'],
        [b'both', b'standalone', b'and', b'part', b'of', b'other']],
       dtype=object)>,
 'label': <tf.Tensor: id=8794, shape=(9,), dtype=int32, numpy=array([0, 1, 1, 0, 0, 1, 0, 1, 0])>}

In [80]:
# Helper method to print exxample outputs of for defined feature_column

def demo(feature_column):
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())
    
def seqdemo(feature_column):
    feature_layer = tf.keras.experimental.SequenceFeatures(feature_column)
    print(feature_layer(example_batch))

In [81]:
# Define categorical colunm for our text feature, which is preprocessed into lists of tokens
# Note that key name should be the same as original column name in dataframe
text_column = feature_column.sequence_categorical_column_with_vocabulary_list(key='text', 
                                                                     vocabulary_list=list(word_index))

# arguemnt dimention here is exactly the dimension of the space in which tokens will be presented during model's learning
# see the tutorial at https://www.tensorflow.org/beta/tutorials/text/word_embeddings
text_embedding = feature_column.embedding_column(text_column, dimension=8)
print(seqdemo(text_embedding))

(<tf.Tensor: id=5218, shape=(9, 6, 8), dtype=float32, numpy=
array([[[ 0.02763646,  0.46926948,  0.16602302,  0.41979697,
          0.10070847, -0.25982472,  0.56686956,  0.24198762],
        [-0.17580262, -0.0545252 ,  0.11080994, -0.01236732,
         -0.08224947,  0.28997687, -0.44878346,  0.23482987],
        [ 0.25610265, -0.28358257,  0.4709215 ,  0.36946535,
         -0.04738319, -0.37916708, -0.6918726 ,  0.15008691],
        [-0.10372087, -0.15801448,  0.17832626, -0.00092938,
         -0.35034904, -0.42338422,  0.2239229 ,  0.31570607],
        [-0.52049744,  0.4207177 ,  0.06991487,  0.25887436,
         -0.4963163 ,  0.29882333,  0.25331378, -0.25571042],
        [-0.17580262, -0.0545252 ,  0.11080994, -0.01236732,
         -0.08224947,  0.28997687, -0.44878346,  0.23482987]],

       [[-0.4570017 , -0.29725   , -0.26551938, -0.31475785,
          0.3716718 ,  0.5617713 , -0.25350043, -0.3703429 ],
        [-0.6737758 , -0.46808305, -0.70588297,  0.32659894,
          0.489

In [87]:
# The define the layers and model it self
# This example uses Keras Functional API instead of Sequential just for more generallity

# Define SequenceFeatures layer to pass feature_columns into Keras model
sequence_feature_layer = tf.keras.experimental.SequenceFeatures(text_embedding)

In [97]:
# Define inputs for each feature column. See
# см. https://github.com/tensorflow/tensorflow/issues/27416#issuecomment-502218673
feature_layer_inputs = {}
sequence_feature_layer_inputs = {}

# Here we have just one column

sequence_feature_layer_inputs['text'] = tf.keras.Input(shape=(max_length,), name='text', dtype=tf.string)
print(sequence_feature_layer_inputs)

{'text': <tf.Tensor 'text_8:0' shape=(None, 6) dtype=string>}


In [98]:
sequence_feature_layer_outputs, _ = sequence_feature_layer(sequence_feature_layer_inputs)
print(sequence_feature_layer_outputs)

Tensor("sequence_features_1_6/Identity:0", shape=(None, None, 8), dtype=float32)


In [99]:
# Define outputs of SequenceFeatures layer 
# And accually use them as first layer of the model

# note here that SequenceFeatures layer produce tuple of two tensors as output. We need just first to pass next.
sequence_feature_layer_outputs, _ = sequence_feature_layer(sequence_feature_layer_inputs)
print(sequence_feature_layer_outputs)
# Add consequences layers. See https://keras.io/getting-started/functional-api-guide/

# Conv1D and MaxPooling1D will learn features from words order
x = tf.keras.layers.Conv1D(8,4)(sequence_feature_layer_outputs)
x = tf.keras.layers.MaxPooling1D(2)(x)

x = tf.keras.layers.Dense(256, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)

# This example supposes binary classification, as labels are 0 or 1
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs=[v for v in sequence_feature_layer_inputs.values()], outputs=x)

model.summary()

# This example supposes binary classification, as labels are 0 or 1
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy']
              #run_eagerly=True
             )

Tensor("sequence_features_1_7/Identity:0", shape=(None, None, 8), dtype=float32)
Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            [(None, 6)]               0         
_________________________________________________________________
sequence_features_1 (Sequenc ((None, None, 8), (None,) 472       
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 8)           264       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, None, 8)           0         
_________________________________________________________________
dense_18 (Dense)             (None, None, 256)         2304      
_________________________________________________________________
dropout_9 (Dropout)          (None, None, 256)         0         
____________________________________________

In [106]:
# Note that fit() method looking up features in train_ds and valdation_ds by name in 
# tf.keras.Input(shape=(max_length,), name='text'

# This model of cause will learn nothing because of fake data.

num_epochs = 5
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=num_epochs,
                    verbose=1
                    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
