In [2]:
from char_hdfs import HDFSLog
import tensorflow as tf
import numpy as np
tf.random.set_seed(123)

In [3]:
chars_in_line = 256
lines_in_seq = 64

In [4]:
hdfslog = HDFSLog(padded_char_len=chars_in_line, 
                  padded_seq_len=lines_in_seq,
                 train_ratio=0.9)
x_train, y_train, x_test, y_test, tk = hdfslog.get_train_test_data(ablation=100)

total number of lines in the log file: 11175629
starting training the tokenizer:
ending tokenizer training: 231.57511067390442
vocabulary size: 51
vocabulary size: 51
starting text to number conversion
ending text to number conversion: 207.94819831848145
ending padding characters: 58.48859786987305
padded_txt_to_num shape: (11175629, 256)
completed:  0
ending blk sequencing: 0.0010023117065429688
completed:  1000000
ending blk sequencing: 1.8446159362792969
completed:  2000000
ending blk sequencing: 3.6759893894195557
completed:  3000000
ending blk sequencing: 5.741600275039673
completed:  4000000
ending blk sequencing: 7.617547035217285
completed:  5000000
ending blk sequencing: 9.592900514602661
completed:  6000000
ending blk sequencing: 11.543816328048706
completed:  7000000
ending blk sequencing: 13.49677300453186
completed:  8000000
ending blk sequencing: 15.458702802658081
completed:  9000000
ending blk sequencing: 17.349704265594482
completed:  10000000
ending blk sequencing: 19

In [5]:
vocab_size = len(tk.word_index)
print(f'vocab_size: {vocab_size}')
char_onehot = vocab_size

vocab_size: 51


In [6]:
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))
for char, i in tk.word_index.items(): # from 1 to 51
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

In [7]:
print(embedding_weights.shape) # first row all 0 for PAD and last row for UNK
embedding_weights

(52, 51)


array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [8]:
embedding_weights[4]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
input_size = [64, 256]
embedding_size = 51

In [10]:
embedding_layer = tf.keras.layers.Embedding(vocab_size+1,
                                            embedding_size,
                                            input_length=input_size,
                                            weights = [embedding_weights])

In [11]:
embedding_layer

<tensorflow.python.keras.layers.embeddings.Embedding at 0x21e30daac40>

In [12]:
print(x_train.shape)
print(y_train.shape)

(200, 64, 256)
(200,)


In [13]:
print(f'features: {x_train[0]}, label: {y_train[0]}')

features: [[ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]], label: 0


In [20]:
for x, y in zip(x_train, y_train):
    print(f'features: {x}, label: {y}')

features: [[ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]], label: 0
features: [[ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]], label: 1
features: [[ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]], label: 1
features: [[ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]], label: 0
features: [[ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]], label: 1
features: [[ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 [ 4 12  3 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0 

In [14]:
tflogs = tf.data.Dataset.from_tensor_slices((x_train, y_train))
tflogs

<TensorSliceDataset shapes: ((64, 256), ()), types: (tf.int32, tf.int64)>

In [14]:
# tflogs = tflogs.map(lambda x, y: (tf.one_hot(x, depth=char_onehot), y))
# tflogs

In [63]:
# tflogs = tflogs.map(lambda x, y: (tf.reshape(x, (64, 13056)), y))
# tflogs

In [None]:
# tflogs = tflogs.map(lambda x_batch, y_batch: (tf.one_hot(x_batch, depth=char_onehot), y_batch))
# tflogs

In [15]:
batch_size = 32
tflogs = tflogs.batch(batch_size)
tflogs

<BatchDataset shapes: ((None, 64, 256), (None,)), types: (tf.int32, tf.int64)>

In [65]:
# tflogs= tflogs.shuffle(64)

In [None]:
# tflogs = tflogs.map(lambda x_batch, y_batch: (tf.one_hot(x_batch, depth=51), y_batch))
# tflogs

In [16]:
for x in tflogs.take(1):
    print(x[0].shape,  x[1].shape)

(32, 64, 256) (32,)


In [17]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
B = 32
tflogs = tf.data.Dataset.from_tensor_slices((x_train, y_train))
tflogs = tflogs.shuffle(buffer_size=len(y_train))
tflogs = tflogs.batch(B, drop_remainder=True)
tflogs = tflogs.prefetch(buffer_size=AUTOTUNE)

In [18]:
print('################ Character  Embedding##############################')
inputs = tf.keras.layers.Input(batch_shape=(B, 64, 256), dtype='float64' )
x = tf.keras.layers.Embedding(input_dim=vocab_size+1,
                                output_dim=embedding_size,
                                input_length=256,
                                weights = [embedding_weights],
                                )(inputs)
#At first, we embeds all characters into an arbitraryvector space with the dimension of𝐶𝑒, which is the character em-bedding matrix’s dimensions, producing an intermediate tensorshape of(𝐵,𝐿𝑠,𝐿𝑙,𝐶𝑒).
print('input data with char embedding: - (𝐵,𝐿𝑠,𝐿𝑙,𝐶𝑒)', x.shape)
print('################ Log event  Embedding##############################')
x = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same')(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same')(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same')(x)
# x = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(x)
print('after conv : -  (𝐵,𝐿𝑠,𝐿𝑙,𝐶𝑙)', x.shape)
# The output is then aggregated by taking the maximums in the third dimension,delivering the intermediate tensor shape of(𝐵,𝐿𝑠,𝐶𝑙)
#(𝐵,𝐿𝑠,𝐶𝑙)
# x = tf.keras.layers.Maximum
x = tf.keras.layers.MaxPooling2D(pool_size=(1, 256) )(x)
print('max pooling in the 3rd dimension (𝐵,𝐿𝑠,1, 𝐶𝑙)', x.shape)
x = tf.reshape(x, (B, 64, 64))
print('(𝐵,𝐿𝑠,𝐶𝑙): -', x.shape)

print('################ Seqence Embedding##############################')
x = tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='same', )(x)
x = tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='same', )(x)
x = tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='same', )(x)
# x = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(x)
print('after conv : -  (𝐵,𝐿𝑠,𝐶𝑠))', x.shape)
x = tf.keras.layers.MaxPooling1D(pool_size=(64) )(x)
print('max pooling in the 3rd dimension (𝐵,1, 𝐶𝑙)', x.shape)
x = tf.keras.layers.Dense(1024)(x)
x = tf.keras.layers.Dense(1024)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
print(model.summary())
model.compile(optimizer='adam', 
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])


################ Character  Embedding##############################
input data with char embedding: - (𝐵,𝐿𝑠,𝐿𝑙,𝐶𝑒) (32, 64, 256, 51)
################ Log event  Embedding##############################
after conv : -  (𝐵,𝐿𝑠,𝐿𝑙,𝐶𝑙) (32, 64, 256, 64)
max pooling in the 3rd dimension (𝐵,𝐿𝑠,1, 𝐶𝑙) (32, 64, 1, 64)
(𝐵,𝐿𝑠,𝐶𝑙): - (32, 64, 64)
################ Seqence Embedding##############################
after conv : -  (𝐵,𝐿𝑠,𝐶𝑠)) (32, 64, 256)
max pooling in the 3rd dimension (𝐵,1, 𝐶𝑙) (32, 1, 256)
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(32, 64, 256)]           0         
_________________________________________________________________
embedding_1 (Embedding)      (32, 64, 256, 51)         2652      
_________________________________________________________________
conv1d (Conv1D)              (32, 64, 256, 64)         9856      
__________________

In [19]:
model.fit(tflogs, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x224efa4ab20>

In [124]:
emb = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size+1,
                                embedding_size,
                                input_length=256,
                                weights = [embedding_weights],
                                input_shape=[64, 256 ]),    
])
# print(model.summary())
event_emb = tf.keras.layers.Conv1D(filters=64, kernel_size=3)(emb)
# model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
#               optimizer='adam', metrics=['accuracy'])
# model.fit(tflogs, epochs=1)

AttributeError: 'Sequential' object has no attribute 'shape'

In [122]:
# input_shape=[57, 230, 51])
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size+1,
                                embedding_size,
                                input_length=256,
                                weights = [embedding_weights],
                                input_shape=[64, 256 ]),
    # tf.keras.layers.Conv1D(64, 3, padding='same',  input_shape=[64, 256, 51]),
    tf.keras.layers.Conv1D(64, 3, padding='same', ),
    # tf.keras.layers.Maximum(),
    # tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1)
])
print(model.summary())
# model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
#               optimizer='adam', metrics=['accuracy'])
# model.fit(tflogs, epochs=1)

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_30 (Embedding)     (None, 64, 256, 51)       2652      
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 64, 256, 64)       9856      
_________________________________________________________________
flatten_23 (Flatten)         (None, 1048576)           0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 1048577   
Total params: 1,061,085
Trainable params: 1,061,085
Non-trainable params: 0
_________________________________________________________________
None


In [76]:
model.fit(tflogs, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x1761a36e640>

In [None]:
m2 = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(filters=8, kernel_size=64, strides=1, padding='causal',
                           input_shape=[lines_in_seq, chars_in_line]), 
    # tf.keras.layers.Conv1D(filters=16, kernel_size=3, strides=1, padding='causal',
    #                        ),
    # tf.keras.layers.Conv1D(filters=32, kernel_size=2, strides=1, padding='causal',
                           # ),
    tf.keras.layers.Flatten(), 
    tf.keras.layers.Dense(1)
])
print(m2.summary())
m2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer='adam', metrics=['accuracy'])
history = m2.fit(tflogs, epochs=3)

In [None]:
model_1 = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(filters=8, kernel_size=6,  input_shape=[lines_in_seq, chars_in_line]),
    # tf.keras.layers.Conv1D(filters=256, kernel_size=2, padding='same', strides=1, activation='relu'),
    # # tf.keras.layers.MaxPooling1D(pool_size=2),
    # tf.keras.layers.MaxPooling1D(),
    # tf.keras.layers.Conv1D(filters=256, kernel_size=2, padding='same', strides=1, activation='relu'),
    # tf.keras.layers.Conv1D(filters=1024, kernel_size=2,  input_shape=[lines_in_seq, chars_in_line]),
    # tf.keras.layers.Conv1D(filters=1024, kernel_size=2,  input_shape=[lines_in_seq, chars_in_line]),
    # tf.keras.layers.Conv1D(filters=1024, kernel_size=2,  input_shape=[lines_in_seq, chars_in_line]),
    # tf.keras.layers.Conv1D(filters=1024, kernel_size=2,  input_shape=[lines_in_seq, chars_in_line]),
    # tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Flatten(),      
    # tf.keras.layers.Dense(100,),
    tf.keras.layers.Dense(1)
])
print(model_1.summary())
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer='adam', metrics=['accuracy'])
history = model_1.fit(tflogs, epochs=3)

WARNING:tensorflow:Model was constructed with shape (None, 57, 231, 51) for input Tensor("conv1d_10_input:0", shape=(None, 57, 231, 51), dtype=float32), but it was called on an input with incompatible shape (None, 57, 230, 51).
WARNING:tensorflow:Model was constructed with shape (None, 57, 231, 51) for input Tensor("conv1d_10_input:0", shape=(None, 57, 231, 51), dtype=float32), but it was called on an input with incompatible shape (None, 57, 230, 51).

InvalidArgumentError:  Incompatible shapes: [32,57,229,1] vs. [32,1]

ValueError: logits and labels must have the same shape ((None, 57, 229, 1) vs (None, 1))

InvalidArgumentError:  Incompatible shapes: [32,1] vs. [32,57,229,1]

ValueError: Input 0 of layer max_pooling1d is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: [None, 64, 254, 128]