In [1]:
import random
# from hdflogv2 import HDFSLogv2
from hdflogv3 import HDFSLogv3
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
import pickle
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
# Seed every RNG imported above from one constant so repeated runs are comparable.
SEED = 123
random.seed(SEED)  # `random` is imported in this cell but was previously left unseeded
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [2]:
def test_model(ablation=1000, batch_size=32, epochs=3, filters=64, kernel_size=3, dense_neurons=2048,
               conv1d_set1=1, conv1d_set2=1, maxpool_1=False, hdfs_obJ_full_name=None,
               train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Train a character-level Conv1D binary classifier on a pickled HDFS log object.

    The pickled object must expose ``get_tensor_train_val_test(...)`` returning
    ``(train, val, test)`` batched datasets whose inputs have shape
    ``(batch, seq_len, char_len)``, and a ``tk`` tokenizer with a ``word_index``
    mapping.

    Args:
        ablation: Forwarded to ``get_tensor_train_val_test``. Presumably the
            number of samples drawn per class -- confirm against HDFSLogv3.
        batch_size: Forwarded to the dataset builder and used as the fixed
            batch dimension of the model input.
        epochs: Maximum number of training epochs (early stopping may end sooner).
        filters: Number of filters in every Conv1D layer.
        kernel_size: Kernel size of every Conv1D layer.
        dense_neurons: Width of the hidden Dense layer.
        conv1d_set1: Number of Conv1D layers applied along the character axis.
        conv1d_set2: Number of Conv1D layers applied along the sequence axis
            (only used when ``maxpool_1`` is true).
        maxpool_1: If true, max-pool over characters, convolve over the
            sequence axis and max-pool again; otherwise flatten after the
            first convolution stack.
        hdfs_obJ_full_name: Path of the pickled HDFS log object.
        train_ratio, val_ratio, test_ratio: Forwarded to the dataset builder.

    Returns:
        Tuple ``(model, hist)``: the trained Keras model and the ``History``
        object returned by ``model.fit``.

    Raises:
        TypeError: If ``hdfs_obJ_full_name`` is None.
    """
    # Function-scope imports keep the function runnable regardless of which
    # notebook cells were executed before it.
    import pickle

    import numpy as np
    import tensorflow as tf
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

    if hdfs_obJ_full_name is None:
        raise TypeError('hdfs_obJ_full_name must be the path of a pickled HDFS log object, not None')

    # Re-seed on every call so successive experiments start from the same state.
    tf.random.set_seed(123)

    # SECURITY: pickle.load can execute arbitrary code -- only load files you created yourself.
    with open(hdfs_obJ_full_name, 'rb') as f:
        hdfslogs = pickle.load(f)

    train_data, val_data, test_data = hdfslogs.get_tensor_train_val_test(
        ablation=ablation,
        batch_size=batch_size,
        train_ratio=train_ratio,
        val_ratio=val_ratio,
        test_ratio=test_ratio,
    )
    print(train_data)

    # Static input geometry comes from the dataset spec: (batch, seq_len, char_len).
    input_spec = train_data.element_spec[0]
    seq_len = input_spec.shape[1]
    char_len = input_spec.shape[2]

    tk = hdfslogs.tk
    vocab_size = len(tk.word_index)
    print(f'vocab_size: {vocab_size}')

    # One-hot embedding table: row 0 (padding id) stays all-zero and the row of
    # token id i has a single 1 in column i-1. Rows are addressed by token id,
    # so the table is correct whatever order word_index iterates in.
    embedding_size = vocab_size
    embedding_weights = np.zeros((vocab_size + 1, embedding_size))
    for token_id in tk.word_index.values():
        embedding_weights[token_id, token_id - 1] = 1

    # Token ids are integers, so declare the input as int32 rather than float64.
    inputs = tf.keras.layers.Input(batch_shape=(batch_size, seq_len, char_len), dtype='int32')
    x = tf.keras.layers.Embedding(input_dim=vocab_size + 1,
                                  output_dim=embedding_size,
                                  input_length=char_len,
                                  weights=[embedding_weights],
                                  )(inputs)

    # NOTE(review): the Conv1D and hidden Dense layers are created without an
    # activation argument -- confirm that a purely linear stack is intended.
    for _ in range(conv1d_set1):
        x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)

    if maxpool_1:
        # Collapse the character axis, then convolve and pool along the sequence axis.
        x = tf.keras.layers.MaxPooling2D(pool_size=(1, char_len))(x)
        x = tf.reshape(x, (batch_size, seq_len, filters))
        for _ in range(conv1d_set2):
            x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
        x = tf.keras.layers.MaxPooling1D(pool_size=seq_len)(x)
    else:
        x = tf.keras.layers.Flatten()(x)

    x = tf.keras.layers.Dense(dense_neurons)(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    outputs = tf.reshape(outputs, (batch_size, 1))

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.summary()  # summary() prints itself; wrapping it in print() only adds a stray "None"

    # The output layer already applies a sigmoid, so the loss must receive
    # probabilities (from_logits=False); from_logits=True would apply it twice.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy'])

    ################# Early stopping #####################
    monitor_metric = 'val_accuracy'
    LR = ReduceLROnPlateau(monitor=monitor_metric, factor=0.5, patience=2, cooldown=1, verbose=1)
    earlystop = EarlyStopping(monitor=monitor_metric, min_delta=0, patience=3, verbose=1)
    callbacks_list = [LR, earlystop]

    # Validate on the validation split: steering the LR schedule and early
    # stopping with test_data would leak the test set into model selection.
    hist = model.fit(train_data, validation_data=val_data, epochs=epochs, callbacks=callbacks_list)
    return model, hist

In [3]:
### early stop
# Forward slashes keep the relative path valid on both Windows and POSIX.
test_model(
    ablation=5000,
    batch_size=256,
    kernel_size=3,
    epochs=16,
    dense_neurons=2048,
    conv1d_set1=3,
    conv1d_set2=3,
    maxpool_1=True,
    hdfs_obJ_full_name='data/hdfsobj_64_256_no_meta.pkl',
)

train_hdfs_anomaly:, 4000, val_hdfs_anomaly:, 500, test_hdfs_anomaly:, 500, train_hdfs_normal:, 4000, val_hdfs_normal:, 500, test_hdfs_normal:, 500, train: hdfs_anomaly    4000
hdfs_normal     4000
Name: label, dtype: int64
val: hdfs_anomaly    500
hdfs_normal     500
Name: label, dtype: int64
test: hdfs_anomaly    500
hdfs_normal     500
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(256, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(256, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(256, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 256
num classes, train_data.element_spec[1].shape:  (256,)
length of val_data: 3
length of train_data - (num_seq_per_cls * 

(<keras.engine.functional.Functional at 0x1cb92595a30>,
 <keras.callbacks.History at 0x1cb925bd850>)

In [3]:
# 64x256 padding, no metadata: ablation=12000 with a 35/35/30 split
# (the run output reports 4200 / 4200 / 3600 samples per class for train / val / test).
# Forward slashes keep the relative path valid on both Windows and POSIX.
test_model(
    ablation=12000,
    batch_size=256,
    kernel_size=3,
    epochs=16,
    dense_neurons=2048,
    conv1d_set1=3,
    conv1d_set2=3,
    maxpool_1=True,
    hdfs_obJ_full_name='data/hdfsobj_64_256_no_meta.pkl',
    train_ratio=0.35,
    val_ratio=0.35,
    test_ratio=0.30,
)

train_hdfs_anomaly:, 4200, val_hdfs_anomaly:, 4200, test_hdfs_anomaly:, 3600, train_hdfs_normal:, 4200, val_hdfs_normal:, 4200, test_hdfs_normal:, 3600, train: hdfs_anomaly    4200
hdfs_normal     4200
Name: label, dtype: int64
val: hdfs_anomaly    4200
hdfs_normal     4200
Name: label, dtype: int64
test: hdfs_anomaly    3600
hdfs_normal     3600
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(256, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(256, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(256, 64, 256), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 256
num classes, train_data.element_spec[1].shape:  (256,)
length of val_data: 32
length of train_data - (num_seq_p

(<keras.engine.functional.Functional at 0x18274f67ee0>,
 <keras.callbacks.History at 0x181a867fb80>)

In [5]:
# Build the 64x256 log object with hdfs_rm_time_stamp / hdfs_rm_ip_address disabled,
# save it, then train on it with the same 35/35/30 split as the no-metadata run.
hlog_meta_64_256_time_ip = HDFSLogv3(debug=True)
res_64_256_time_ip = hlog_meta_64_256_time_ip.get_tensor_train_val_test(
    padded_seq_len=64,
    padded_char_len=256,
    hdfs_rm_time_stamp=False,
    hdfs_rm_ip_address=False,
    train_ratio=0.8,
)
hlog_meta_64_256_time_ip.save_hdfs_log_obj()

# NOTE(review): presumably save_hdfs_log_obj() writes the pickle loaded below -- confirm.
# Forward slashes keep the relative path valid on both Windows and POSIX.
test_model(
    ablation=12000,
    batch_size=256,
    kernel_size=3,
    epochs=16,
    dense_neurons=2048,
    conv1d_set1=3,
    conv1d_set2=3,
    maxpool_1=True,
    hdfs_obJ_full_name='data/hdfsobj_64_256_time_ip.pkl',
    train_ratio=0.35,
    val_ratio=0.35,
    test_ratio=0.30,
)

total number of lines in the log file: 11175629
RAM usage:  100232088
loaded logs in memory in time: 4.986243724822998
loaded cleaned logs with blk_id  in memory: 202.5606620311737
RAM usage:  100232088
loaded cleaned logs without blkid in memory: 0.6442639827728271
RAM usage:  100232088
starting training the tokenizer:
ending tokenizer training: 109.69133853912354
RAM usage:  48
vocabulary size: 42
starting text to number conversion
completed:  0
time : 0.0
completed:  1000000
time : 17.371644973754883
completed:  2000000
time : 34.09197378158569
completed:  3000000
time : 51.08361482620239
completed:  4000000
time : 68.5211124420166
completed:  5000000
time : 85.38474225997925
completed:  6000000
time : 102.4377372264862
completed:  7000000
time : 121.05865240097046
completed:  8000000
time : 138.07822132110596
completed:  9000000
time : 155.37119960784912
completed:  10000000
time : 173.60666227340698
completed:  11000000
time : 192.4757251739502
ending text to number conversion: 19

(<keras.engine.functional.Functional at 0x18b619743a0>,
 <keras.callbacks.History at 0x181fa63fd30>)

In [6]:
##### with 1 K data ##################
# Forward slashes keep the relative path valid on both Windows and POSIX.
test_model(
    ablation=1000,
    batch_size=256,
    kernel_size=3,
    epochs=16,
    dense_neurons=2048,
    conv1d_set1=3,
    conv1d_set2=3,
    maxpool_1=True,
    hdfs_obJ_full_name='data/hdfsobj_32_64_time_ip.pkl',
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
)

train_hdfs_anomaly:, 800, val_hdfs_anomaly:, 100, test_hdfs_anomaly:, 100, train_hdfs_normal:, 800, val_hdfs_normal:, 100, test_hdfs_normal:, 100, train: hdfs_anomaly    800
hdfs_normal     800
Name: label, dtype: int64
val: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
test: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 64), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 64), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 64), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 64
num classes, train_data.element_spec[1].shape:  (256,)
length of val_data: 0
length of train_data - (num_seq_per_cls * num_clas

(<keras.engine.functional.Functional at 0x1875aae4d60>,
 <keras.callbacks.History at 0x1875aaca550>)

In [8]:
# 32x64 padding, no metadata, ablation=1000.
# Forward slashes keep the relative path valid on both Windows and POSIX.
test_model(
    ablation=1000,
    batch_size=256,
    kernel_size=3,
    epochs=16,
    dense_neurons=2048,
    conv1d_set1=3,
    conv1d_set2=3,
    maxpool_1=True,
    hdfs_obJ_full_name='data/hdfsobj_32_64_no_meta.pkl',
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
)

train_hdfs_anomaly:, 800, val_hdfs_anomaly:, 100, test_hdfs_anomaly:, 100, train_hdfs_normal:, 800, val_hdfs_normal:, 100, test_hdfs_normal:, 100, train: hdfs_anomaly    800
hdfs_normal     800
Name: label, dtype: int64
val: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
test: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 64), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 64), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 64), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 64
num classes, train_data.element_spec[1].shape:  (256,)
length of val_data: 0
length of train_data - (num_seq_per_cls * num_clas

(<keras.engine.functional.Functional at 0x1875d7c5a30>,
 <keras.callbacks.History at 0x18753248250>)

In [9]:
# 32x176 padding, time/ip variant, ablation=1000.
# Forward slashes keep the relative path valid on both Windows and POSIX.
test_model(
    ablation=1000,
    batch_size=256,
    kernel_size=3,
    epochs=16,
    dense_neurons=2048,
    conv1d_set1=3,
    conv1d_set2=3,
    maxpool_1=True,
    hdfs_obJ_full_name='data/hdfsobj_32_176_time_ip.pkl',
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
)

train_hdfs_anomaly:, 800, val_hdfs_anomaly:, 100, test_hdfs_anomaly:, 100, train_hdfs_normal:, 800, val_hdfs_normal:, 100, test_hdfs_normal:, 100, train: hdfs_anomaly    800
hdfs_normal     800
Name: label, dtype: int64
val: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
test: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 176
num classes, train_data.element_spec[1].shape:  (256,)
length of val_data: 0
length of train_data - (num_seq_per_cls * num_

(<keras.engine.functional.Functional at 0x1875db49310>,
 <keras.callbacks.History at 0x1875db57670>)

In [10]:
# 32x176 padding, no metadata, ablation=1000.
# Forward slashes keep the relative path valid on both Windows and POSIX.
test_model(
    ablation=1000,
    batch_size=256,
    kernel_size=3,
    epochs=16,
    dense_neurons=2048,
    conv1d_set1=3,
    conv1d_set2=3,
    maxpool_1=True,
    hdfs_obJ_full_name='data/hdfsobj_32_176_no_meta.pkl',
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
)

train_hdfs_anomaly:, 800, val_hdfs_anomaly:, 100, test_hdfs_anomaly:, 100, train_hdfs_normal:, 800, val_hdfs_normal:, 100, test_hdfs_normal:, 100, train: hdfs_anomaly    800
hdfs_normal     800
Name: label, dtype: int64
val: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
test: hdfs_anomaly    100
hdfs_normal     100
Name: label, dtype: int64
train_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
val_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
test_data <BatchDataset element_spec=(TensorSpec(shape=(256, 32, 176), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>
char in lines, train_data.element_spec[0].shape[2] 176
num classes, train_data.element_spec[1].shape:  (256,)
length of val_data: 0
length of train_data - (num_seq_per_cls * num_

(<keras.engine.functional.Functional at 0x187503d3df0>,
 <keras.callbacks.History at 0x187503d7e50>)

In [13]:
import os
# Relative path of the pickled log object; forward slashes work on Windows and POSIX alike.
hdfs_obJ_full_name = 'data/hdfsobj_32_176_time_ip.pkl'

In [15]:
# Resolve the relative pickle path against the current working directory.
p = os.path.abspath(hdfs_obJ_full_name)

In [16]:
# Display the resolved path; `p` was already produced by os.path.abspath in the previous cell.
os.path.abspath(p)

'C:\\Users\\Bhujay_ROG\\MyDev\\OCLog\\oclog\\hdfs\\data\\hdfsobj_32_176_time_ip.pkl'