In [1]:
import random
# from hdflogv2 import HDFSLogv2
from hdflogv3 import HDFSLogv3
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
import pickle
from keras.preprocessing.sequence import pad_sequences
# Seed every random number generator this notebook imports so that a
# Restart & Run All gives repeatable results. `random` is imported above
# and was previously left unseeded.
SEED = 123
random.seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [2]:
def test_model(ablation=1000, batch_size=32, epochs=3, filters=64, kernel_size=3, dense_neurons=2048, 
              conv1d_set1=1, conv1d_set2=1, maxpool_1=False, hdfs_obJ_full_name=None):
    # from hdflogv2 import HDFSLogv2
    import tensorflow as tf
    import numpy as np
    tf.random.set_seed(123)
    import pickle
    with open(hdfs_obJ_full_name, 'rb') as f:
        hdfslogs = pickle.load(f)    
    # x_train, y_train, x_val, y_val, x_test, y_test 
    train_data, val_data, test_data = hdfslogs.get_tensor_train_val_test(ablation=ablation, batch_size=batch_size)    
    print(train_data)
    B = batch_size
    # B = train_data.element_spec[0].shape[0]
    seq_len = train_data.element_spec[0].shape[1]
    char_len = train_data.element_spec[0].shape[2]
    
    tk = hdfslogs.tk    
    vocab_size = len(tk.word_index)
    print(f'vocab_size: {vocab_size}')
    char_onehot = vocab_size
    
    embedding_weights = []
    embedding_weights.append(np.zeros(vocab_size))
    for char, i in tk.word_index.items(): # from 1 to 51
        onehot = np.zeros(vocab_size)
        onehot[i-1] = 1
        embedding_weights.append(onehot)
    embedding_weights = np.array(embedding_weights)
    
    
    
    input_size = [seq_len, char_len]
    embedding_size = vocab_size

    embedding_layer = tf.keras.layers.Embedding(vocab_size+1,
                                                embedding_size,
                                                input_length=input_size,
                                                weights = [embedding_weights])
    
    
    
    
    inputs = tf.keras.layers.Input(batch_shape=(B, seq_len, char_len), dtype='float64' )
    x = tf.keras.layers.Embedding(input_dim=vocab_size+1,
                                    output_dim=embedding_size,
                                    input_length=char_len,
                                    weights = [embedding_weights],
                                    )(inputs)
    for _ in range(conv1d_set1):
        x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
    if maxpool_1:
        x = tf.keras.layers.MaxPooling2D(pool_size=(1, char_len))(x)
        x = tf.reshape(x, (B, seq_len, filters))        
        for _ in range(conv1d_set2):
            x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
        x = tf.keras.layers.MaxPooling1D(pool_size=(seq_len) )(x)    
    if not maxpool_1:
        x = tf.keras.layers.Flatten()(x)       
    x = tf.keras.layers.Dense(dense_neurons)(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    outputs = tf.reshape(outputs, (B, 1))
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    print(model.summary())
    model.compile(optimizer='adam', 
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    hist = model.fit(train_data, validation_data=test_data, epochs=epochs) 
    return model, hist

In [None]:
# Pass the batch size through the `batch_size` keyword that `test_model` declares.
test_model(ablation=10000, batch_size=32, kernel_size=3, epochs=16, dense_neurons=2048, conv1d_set1=3,conv1d_set2=3, maxpool_1=True,
          hdfs_obJ_full_name='data\\hdfsobj_32_176_time_ip.pkl')

train_hdfs_anomaly:, 8000, val_hdfs_anomaly:, 1000, test_hdfs_anomaly:, 1000, train_hdfs_normal:, 8000, val_hdfs_normal:, 1000, test_hdfs_normal:, 1000, train: hdfs_anomaly    8000
hdfs_normal     8000
Name: label, dtype: int64
val: hdfs_anomaly    1000
hdfs_normal     1000
Name: label, dtype: int64
test: hdfs_anomaly    1000
hdfs_normal     1000
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(32, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(32, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(32, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 176
num classes, train_data.element_spec[1].shape:  (32,)
length of val_data: 62
length of train_data - (num_seq_per_cls 

In [3]:
### going back to same setting as exp5

# Rebuild the log object with 64 lines per sequence and 256 characters per
# line, with timestamps and IP addresses removed, then persist it.
# NOTE(review): presumably save_hdfs_log_obj() writes the pickle that is
# loaded below -- confirm the file name it produces.
hlog_meta_64_256_no_meta = HDFSLogv3( debug=True )
res_64_256_no_meta = hlog_meta_64_256_no_meta.get_tensor_train_val_test(padded_seq_len=64, padded_char_len=256, 
                                                                        hdfs_rm_time_stamp=True, hdfs_rm_ip_address=True, 
                                                                        train_ratio=0.8, )
hlog_meta_64_256_no_meta.save_hdfs_log_obj()
### checking with a lower dataset
# Pass the batch size through the `batch_size` keyword that `test_model` declares.
test_model(ablation=1000, batch_size=32, kernel_size=3, epochs=16, dense_neurons=2048, conv1d_set1=3,conv1d_set2=3, maxpool_1=True,
          hdfs_obJ_full_name='data\\hdfsobj_64_256_no_meta.pkl')

total number of lines in the log file: 11175629
RAM usage:  100232088
loaded logs in memory in time: 5.029593229293823
loaded cleaned logs with blk_id  in memory: 88.31090927124023
RAM usage:  100232088
loaded cleaned logs without blkid in memory: 0.5669658184051514
RAM usage:  100232088
starting training the tokenizer:
ending tokenizer training: 64.23376607894897
RAM usage:  48
vocabulary size: 42
starting text to number conversion
completed:  0
time : 0.0010180473327636719
completed:  1000000
time : 12.662117958068848
completed:  2000000
time : 24.251131057739258
completed:  3000000
time : 37.0912082195282
completed:  4000000
time : 49.49365592002869
completed:  5000000
time : 61.82384181022644
completed:  6000000
time : 74.26996159553528
completed:  7000000
time : 87.01436758041382
completed:  8000000
time : 99.84939408302307
completed:  9000000
time : 112.75847816467285
completed:  10000000
time : 125.596120595932
completed:  11000000
time : 138.3616623878479
ending text to number 

FileNotFoundError: [Errno 2] No such file or directory: 'data\\hdfsobj_64_256_time_ip.pkl'

In [4]:
# Pass the batch size through the `batch_size` keyword that `test_model` declares.
test_model(ablation=1000, batch_size=32, kernel_size=3, epochs=16, dense_neurons=2048, conv1d_set1=3,conv1d_set2=3, maxpool_1=True,
          hdfs_obJ_full_name='data\\hdfsobj_64_256_no_meta.pkl')

train_hdfs_anomaly:, 800, val_hdfs_anomaly:, 100, test_hdfs_anomaly:, 100, train_hdfs_normal:, 800, val_hdfs_normal:, 100, test_hdfs_normal:, 100, train: hdfs_anomaly    800
hdfs_normal     800
Name: label, dtype: int64
val: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
test: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(32, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(32, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(32, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 256
num classes, train_data.element_spec[1].shape:  (32,)
length of val_data: 6
length of train_data - (num_seq_per_cls * num_class)/

(<keras.engine.functional.Functional at 0x268e27e8af0>,
 <keras.callbacks.History at 0x268e2d81af0>)

In [5]:
# Earlier runs used 4000 training and 4000 test sequences per class.
# With ablation=5000 and train_ratio 0.8 the split is 4000 train, 500 val
# and 500 test per class, so training accuracy should be unchanged while
# validation accuracy is expected to improve.
# Pass the batch size through the `batch_size` keyword that `test_model` declares.
test_model(ablation=5000, batch_size=32, kernel_size=3, epochs=16, dense_neurons=2048, conv1d_set1=3,conv1d_set2=3, maxpool_1=True,
          hdfs_obJ_full_name='data\\hdfsobj_64_256_no_meta.pkl')

train_hdfs_anomaly:, 4000, val_hdfs_anomaly:, 500, test_hdfs_anomaly:, 500, train_hdfs_normal:, 4000, val_hdfs_normal:, 500, test_hdfs_normal:, 500, train: hdfs_anomaly    4000
hdfs_normal     4000
Name: label, dtype: int64
val: hdfs_anomaly    500
hdfs_normal     500
Name: label, dtype: int64
test: hdfs_anomaly    500
hdfs_normal     500
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(32, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(32, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(32, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 256
num classes, train_data.element_spec[1].shape:  (32,)
length of val_data: 31
length of train_data - (num_seq_per_cls * num_cl

(<keras.engine.functional.Functional at 0x268e0b9bbb0>,
 <keras.callbacks.History at 0x268e0b945e0>)

In [6]:
# Rebuild the log object with 32 lines per sequence and 256 characters per
# line, with timestamps and IP addresses removed, then persist it.
# NOTE(review): presumably save_hdfs_log_obj() writes the pickle that is
# loaded below -- confirm the file name it produces.
hlog_meta_32_256_no_meta = HDFSLogv3( debug=True )
res_32_256_no_meta = hlog_meta_32_256_no_meta.get_tensor_train_val_test(padded_seq_len=32, padded_char_len=256, 
                                                                        hdfs_rm_time_stamp=True, hdfs_rm_ip_address=True, 
                                                                        train_ratio=0.8, )
hlog_meta_32_256_no_meta.save_hdfs_log_obj()
### checking with a lower dataset
# Pass the batch size through the `batch_size` keyword that `test_model` declares.
test_model(ablation=4000, batch_size=32, kernel_size=3, epochs=16, dense_neurons=2048, conv1d_set1=3,conv1d_set2=3, maxpool_1=True,
          hdfs_obJ_full_name='data\\hdfsobj_32_256_no_meta.pkl')

total number of lines in the log file: 11175629
RAM usage:  100232088
loaded logs in memory in time: 4.933324337005615
loaded cleaned logs with blk_id  in memory: 85.37875890731812
RAM usage:  100232088
loaded cleaned logs without blkid in memory: 0.5400216579437256
RAM usage:  100232088
starting training the tokenizer:
ending tokenizer training: 69.99258422851562
RAM usage:  48
vocabulary size: 42
starting text to number conversion
completed:  0
time : 0.0010149478912353516
completed:  1000000
time : 12.627539873123169
completed:  2000000
time : 24.247183799743652
completed:  3000000
time : 37.220115661621094
completed:  4000000
time : 49.73269605636597
completed:  5000000
time : 62.300745725631714
completed:  6000000
time : 74.95747375488281
completed:  7000000
time : 87.89728498458862
completed:  8000000
time : 100.81793713569641
completed:  9000000
time : 113.7482979297638
completed:  10000000
time : 126.63212776184082
completed:  11000000
time : 139.6336259841919
ending text to nu

(<keras.engine.functional.Functional at 0x271cdf9cd60>,
 <keras.callbacks.History at 0x268e0ba0700>)

In [7]:
# Rebuild the log object with 32 lines per sequence and 256 characters per
# line, this time keeping timestamps and IP addresses, then persist it.
# NOTE(review): presumably save_hdfs_log_obj() writes the pickle that is
# loaded below -- confirm the file name it produces.
hlog_meta_32_256_time_ip = HDFSLogv3( debug=True )
res_32_256_time_ip = hlog_meta_32_256_time_ip.get_tensor_train_val_test(padded_seq_len=32, padded_char_len=256, 
                                                                        hdfs_rm_time_stamp=False, hdfs_rm_ip_address=False, 
                                                                        train_ratio=0.8, )
hlog_meta_32_256_time_ip.save_hdfs_log_obj()
### checking with a lower dataset
# Pass the batch size through the `batch_size` keyword that `test_model` declares.
test_model(ablation=4000, batch_size=32, kernel_size=3, epochs=16, dense_neurons=2048, conv1d_set1=3,conv1d_set2=3, maxpool_1=True,
          hdfs_obJ_full_name='data\\hdfsobj_32_256_time_ip.pkl')

total number of lines in the log file: 11175629
RAM usage:  100232088
loaded logs in memory in time: 5.099388360977173
loaded cleaned logs with blk_id  in memory: 202.86901998519897
RAM usage:  100232088
loaded cleaned logs without blkid in memory: 0.5159964561462402
RAM usage:  100232088
starting training the tokenizer:
ending tokenizer training: 104.92351031303406
RAM usage:  48
vocabulary size: 42
starting text to number conversion
completed:  0
time : 0.0009996891021728516
completed:  1000000
time : 16.80386734008789
completed:  2000000
time : 32.978227615356445
completed:  3000000
time : 49.299583435058594
completed:  4000000
time : 66.06988310813904
completed:  5000000
time : 82.42038655281067
completed:  6000000
time : 99.1083152294159
completed:  7000000
time : 115.91031098365784
completed:  8000000
time : 132.6258533000946
completed:  9000000
time : 149.34427404403687
completed:  10000000
time : 165.959894657135
completed:  11000000
time : 182.95886516571045
ending text to num

(<keras.engine.functional.Functional at 0x271d56be5e0>,
 <keras.callbacks.History at 0x271ce5fc490>)

In [None]:
# Pass the batch size through the `batch_size` keyword that `test_model` declares.
test_model(ablation=8000, batch_size=32, kernel_size=3, epochs=16, dense_neurons=2048, conv1d_set1=3,conv1d_set2=3, maxpool_1=True,
          hdfs_obJ_full_name='data\\hdfsobj_32_256_time_ip.pkl')

train_hdfs_anomaly:, 6400, val_hdfs_anomaly:, 800, test_hdfs_anomaly:, 800, train_hdfs_normal:, 6400, val_hdfs_normal:, 800, test_hdfs_normal:, 800, train: hdfs_anomaly    6400
hdfs_normal     6400
Name: label, dtype: int64
val: hdfs_anomaly    800
hdfs_normal     800
Name: label, dtype: int64
test: hdfs_anomaly    800
hdfs_normal     800
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(32, 32, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(32, 32, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(32, 32, 256), dtype=tf.int32, name=None), TensorSpec(shape=(32,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 256
num classes, train_data.element_spec[1].shape:  (32,)
length of val_data: 50
length of train_data - (num_seq_per_cls * num_cl