In [18]:
import os
import sys
import re
import psutil
import time
import pickle 
import numpy as np
import pandas as pd
from collections import OrderedDict
from itertools  import groupby
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.utils import shuffle
import random
random.seed(123)

In [2]:
Logpath='C:\ML_data\Logs'
labelpath='C:\ML_data\Logs'
# print(os.listdir(Logpath))
logfilename='BGL.log'
bglfile = os.path.join(Logpath, logfilename)
print(os.path.exists(bglfile))
with open(bglfile, 'r',  encoding='utf8') as f:
    bglraw = f.readlines()
print(f'total loglines: {len(bglraw)}')

True
total loglines: 4747963


In [3]:
# get all the alerts in the list. total number of alerts should match with the 
#  total number of loglines
alerts =  [l.split()[8] for l in bglraw]
print('alerts',len(alerts))
unique_alerts = set(alerts)
print(f'unique_alerts: {unique_alerts}')
negative_alerts = ['FATAL', 'SEVERE', 'WARNING', 'Kill', 'FAILURE', 'ERROR']
sequence_len = 32
sequences = [bglraw[i * sequence_len:(i + 1) * sequence_len] for i in range((len(bglraw)) // sequence_len )] 
print(len(sequences))

stime = time.time()
labelled_sequences = []
for seq in sequences:    
    label = 'INFO'
    for s in seq:
        if s.split()[8] in negative_alerts:
            label = s.split()[8]
    labelled_sequences.append((seq, label))          
etime = time.time()
print(f'elapsed time: {etime - stime}')
df = pd.DataFrame(labelled_sequences, columns=['sequence', 'label'])
print(df.label.value_counts())

alerts 4747963
148373
elapsed time: 3.3453094959259033
INFO       114115
FATAL       28439
ERROR        4049
SEVERE        628
Kill          165
FAILURE        75
Name: label, dtype: int64


In [4]:
def clean_bgl(txt_line, clean_part_1=True, clean_part_2=True, clean_time_1=True, clean_part_4=True, clean_time_2=True, clean_part_6=True):
    part_1 = ''
    part_2 = ''
    time_1 = ''
    part_4 = ''
    time_2 = ''
    part_6 = ''
    if clean_part_1:
        part_1 = '^-\s|^\w+\s'
    if clean_part_2:
        part_2 = '\d{10}\s'
    if clean_time_1:
        time_1 = '\d{4}\.\d{2}\.\d{2}\s'
    if clean_part_4:
        part_4 = '\w\d{2}-\w\d-\w{2}-\w:\w\d{2}-\w\d{2}\s'
    if clean_time_2:
        time_2 = '\d{4}-\d{2}-\d{2}-\d{2}\.\d{2}\.\d{2}\.\d{6}\s'
    if clean_part_6:
        part_6 = 'RAS'
    part_7 = '[\n]'
    signs_n_punctuations = '\]|\[|\)|\(|\=|\,|\;|\/|\{|\}[$]|[@]|[#]|[%]|[_]|[*]|[&]|[ï]|[ã]|[`]|[ð]|[-]|[\x0f]|[\x00]|[\x10]|[\x98]|[ç]|[:]|\''
#     signs_n_punctuations = '\]|\[|\)|\(|\=|\,|\;|\/|\{|\}\$\@\#\%\_\*\&\|ï|ã|`|ð|\-'
#     signs_n_punctuations = '\]|\[|\)|\(|\=|\,|\;|\/|\{|\}[$]|[@]|[#]|[%]|[_]|[*]|[&]|[ï]|[ã]|`|ð|[-]|[\x0f]|[\x00]|[\x10]|[\x98]|[ç]|[:]'
    white_space = '\s'
    multiple_dots = '\.+?'
    pat =f'{part_1}|{part_2}|{time_1}|{part_4}|{time_2}|{part_6}\s|{part_7}|{signs_n_punctuations}|{white_space}|{multiple_dots}'
#     signs_n_punctuations = '\]|\[|\)|\(|\=|\,|\;|\/'
#     pat =f'{part_1}|{part_2}|{time_1}|{part_4}|{time_2}|{part_6}\s|{part_7}|{signs_n_punctuations}'
    s = re.sub(pat, '', txt_line)
    return s

In [5]:
cleaned_labelled_sequences = []
for sequence, label in labelled_sequences:
    cleaned_seq = []
    for line in sequence:
        cleaned_line = clean_bgl(line)
        cleaned_line = cleaned_line.lower()
        cleaned_seq.append(cleaned_line)
    cleaned_labelled_sequences.append((cleaned_seq, label)) 
# cleaned_labelled_sequences[0]

In [6]:
whole_text_for_training = [line for sequence, _ in cleaned_labelled_sequences for line in sequence]
len(whole_text_for_training)
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(whole_text_for_training)
print('character vocabulary', len(tk.word_index))
# print(tk.word_index)

character vocabulary 50


In [7]:
padded_char_len=64
padding_style='post'
truncating='pre'
num_sequences = []
for seq, label in cleaned_labelled_sequences:
    num_seq = []
    for sline in seq:
        try:        
            num_line = tk.texts_to_sequences([sline])
            padded_num_line = pad_sequences(num_line, maxlen=padded_char_len, 
                                                  padding=padding_style, truncating=truncating)
            num_seq.append(padded_num_line[0])
        except Exception as e:
            print(e)
            print('line:', sline)   
            break
    num_sequences.append((num_seq, label)) 
# num_sequences[0]
print(len(num_sequences))

In [10]:
numdf = pd.DataFrame(num_sequences, columns=['seq', 'label'])
# numdf.head()
numdf["label"].replace({"INFO": "0", "FATAL": "1", "ERROR": "2", 
                     "WARNING": "3", "SEVERE": "4", "Kill": "5",
                    "FAILURE": "6"}, inplace=True)
print(numdf.label.value_counts())

0    114115
1     28439
2      4049
3       902
4       628
5       165
6        75
Name: label, dtype: int64


In [14]:
def train_test_split_class(bgldf, ablation = 4000, train_test_ratio = 0.8, label='NORMALBGL'):
    train_data=None
    test_data=None
    train_cnt = round(ablation * train_test_ratio)
    test_cnt = round(ablation * (1 - train_test_ratio))

    if train_cnt <= bgldf[bgldf.label==label].count()[0] :
      train_data = bgldf[bgldf.label==label][0:train_cnt]
    else:
        print(f'{label} class does not have {train_cnt} records, it has only {bgldf[bgldf.label==label].count()[0]} records')
    if test_cnt <= bgldf[bgldf.label==label].count()[0] :
      test_data = bgldf[bgldf.label==label][train_cnt:ablation]
    else:
        print(f'{label} class does not have {test_cnt} records, it has only {bgldf[bgldf.label==label].count()[0]} records')
    if train_data is not None:
        print(f'train_{label}:, {train_data.count()[0]}')
    if test_data is not None:
        print(f'test_{label}:, {test_data.count()[0]}')
    return train_data, test_data


# classes = ['NORMALBGL', 'FATALBGL', 'ERRORBGL', 'WARNINGBGL','SEVEREBGL', 'KillBGL', 'FAILUREBGL', ] 
classes = ['0', '1', '2', '3','4', '5', '6', ] 
def train_test_multi_class(bgldf, ablation=4000, train_test_ratio=0.8, classes=classes):
    train_data = []
    test_data = []
    for class_name in classes:
            trdata, tsdata = train_test_split_class(bgldf, ablation=ablation, 
                                                    train_test_ratio=train_test_ratio, 
                                                    label=class_name)
            if trdata is not None: train_data.append(trdata)
            if tsdata is not None: test_data.append(tsdata)
    
    train_df = pd.concat(train_data)
    test_df = pd.concat(test_data)
    return train_df, test_df    

In [13]:
train_df, test_df = train_test_multi_class(numdf, ablation=100)
train_df.label.value_counts()

train_0:, 80
test_0:, 20
train_1:, 80
test_1:, 20
train_2:, 80
test_2:, 20
train_3:, 80
test_3:, 20
train_4:, 80
test_4:, 20
train_5:, 80
test_5:, 20
6 class does not have 80 records, it has only 75 records
test_6:, 0


0    80
1    80
2    80
3    80
4    80
5    80
Name: label, dtype: int64

In [16]:
x_train = list(train_df.seq.values)
y_train = list(train_df.label.values)
y_train = to_categorical(y_train)
print(y_train[:2])
x_test = list(test_df.seq.values)
y_test = list(test_df.label.values)
y_test = to_categorical(y_test)
print(y_test[:2])
print(y_train[80:82])

[[1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]]
[[0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]]


In [17]:
vocab_size = len(tk.word_index)
vocab_size = len(tk.word_index)
print(f'vocab_size: {vocab_size}')
char_onehot = vocab_size

vocab_size: 50


In [20]:
B=32
# train_data = tf.data.Dataset.from_tensor_slices(x_train)
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.shuffle(buffer_size=y_train.shape[0]).batch(B, drop_remainder=True)
print(train_data)
    
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_data = test_data.shuffle(buffer_size=y_test.shape[0]).batch(B, drop_remainder=True)
print(test_data)

print(train_data.element_spec[0].shape[2])
print(train_data.element_spec[1].shape[1])

<BatchDataset shapes: ((32, 32, 64), (32, 6)), types: (tf.int32, tf.float32)>
<BatchDataset shapes: ((32, 32, 64), (32, 6)), types: (tf.int32, tf.float32)>
64
6


In [21]:
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))
for char, i in tk.word_index.items(): # from 1 to 51
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)
    
input_size =[ train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]]
embedding_size = vocab_size

embedding_layer = tf.keras.layers.Embedding(vocab_size+1,
                                                embedding_size,
                                                input_length=input_size,
                                                weights = [embedding_weights])

In [22]:
conv1d_set1 = 3
conv1d_set2 = 3
dense_neurons=2048
filters=64
kernel_size=3
maxpool_1=True
epochs=25

inputs = tf.keras.layers.Input(batch_shape=(B, train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]), dtype='float64' )
x = tf.keras.layers.Embedding(input_dim=vocab_size+1,
                                output_dim=embedding_size,
                                input_length=train_data.element_spec[0].shape[2],
                                weights = [embedding_weights],
                                )(inputs)
for _ in range(conv1d_set1):
    x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
if maxpool_1:
    x = tf.keras.layers.MaxPooling2D(pool_size=(1, train_data.element_spec[0].shape[2]))(x)
    x = tf.reshape(x, (B, train_data.element_spec[0].shape[1], filters))        
    for _ in range(conv1d_set2):
        x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
    x = tf.keras.layers.MaxPooling1D(pool_size=(train_data.element_spec[0].shape[1]) )(x)
    x = tf.reshape(x, (B, filters))
if not maxpool_1:
    x = tf.keras.layers.Flatten()(x)       
x = tf.keras.layers.Dense(dense_neurons)(x)
outputs = tf.keras.layers.Dense(train_data.element_spec[1].shape[1], activation='softmax')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
print(model.summary())
model.compile(optimizer='adam', 
              loss='categorical_crossentropy',
          metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
hist = model.fit(train_data, validation_data=test_data, epochs=epochs) 

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(32, 32, 64)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (32, 32, 64, 50)          2550      
_________________________________________________________________
conv1d (Conv1D)              (32, 32, 64, 64)          9664      
_________________________________________________________________
conv1d_1 (Conv1D)            (32, 32, 64, 64)          12352     
_________________________________________________________________
conv1d_2 (Conv1D)            (32, 32, 64, 64)          12352     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (32, 32, 1, 64)           0         
_________________________________________________________________
tf_op_layer_Reshape (TensorF [(32, 32, 64)]           

In [23]:
train_df, test_df = train_test_multi_class(numdf, ablation=1000)
train_df.label.value_counts()

train_0:, 800
test_0:, 200
train_1:, 800
test_1:, 200
train_2:, 800
test_2:, 200
train_3:, 800
test_3:, 102
4 class does not have 800 records, it has only 628 records
test_4:, 0
5 class does not have 800 records, it has only 165 records
5 class does not have 200 records, it has only 165 records
6 class does not have 800 records, it has only 75 records
6 class does not have 200 records, it has only 75 records


0    800
1    800
2    800
3    800
Name: label, dtype: int64

In [24]:
x_train = list(train_df.seq.values)
y_train = list(train_df.label.values)
y_train = to_categorical(y_train)
print(y_train[:2])
x_test = list(test_df.seq.values)
y_test = list(test_df.label.values)
y_test = to_categorical(y_test)
print(y_test[:2])

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]]


In [25]:
B=32
# train_data = tf.data.Dataset.from_tensor_slices(x_train)
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.shuffle(buffer_size=y_train.shape[0]).batch(B, drop_remainder=True)
print(train_data)
    
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_data = test_data.shuffle(buffer_size=y_test.shape[0]).batch(B, drop_remainder=True)
print(test_data)

<BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>
<BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>


In [26]:
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))
for char, i in tk.word_index.items(): # from 1 to 51
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)
    
input_size =[ train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]]
embedding_size = vocab_size

embedding_layer = tf.keras.layers.Embedding(vocab_size+1,
                                                embedding_size,
                                                input_length=input_size,
                                                weights = [embedding_weights])

In [27]:
conv1d_set1 = 3
conv1d_set2 = 3
dense_neurons=2048
filters=64
kernel_size=3
maxpool_1=True
epochs=25

inputs = tf.keras.layers.Input(batch_shape=(B, train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]), dtype='float64' )
x = tf.keras.layers.Embedding(input_dim=vocab_size+1,
                                output_dim=embedding_size,
                                input_length=train_data.element_spec[0].shape[2],
                                weights = [embedding_weights],
                                )(inputs)
for _ in range(conv1d_set1):
    x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
if maxpool_1:
    x = tf.keras.layers.MaxPooling2D(pool_size=(1, train_data.element_spec[0].shape[2]))(x)
    x = tf.reshape(x, (B, train_data.element_spec[0].shape[1], filters))        
    for _ in range(conv1d_set2):
        x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
    x = tf.keras.layers.MaxPooling1D(pool_size=(train_data.element_spec[0].shape[1]) )(x)
    x = tf.reshape(x, (B, filters))
if not maxpool_1:
    x = tf.keras.layers.Flatten()(x)       
x = tf.keras.layers.Dense(dense_neurons)(x)
outputs = tf.keras.layers.Dense(train_data.element_spec[1].shape[1], activation='softmax')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
print(model.summary())
model.compile(optimizer='adam', 
              loss='categorical_crossentropy',
          metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
hist = model.fit(train_data, validation_data=test_data, epochs=epochs) 

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(32, 32, 64)]            0         
_________________________________________________________________
embedding_3 (Embedding)      (32, 32, 64, 50)          2550      
_________________________________________________________________
conv1d_6 (Conv1D)            (32, 32, 64, 64)          9664      
_________________________________________________________________
conv1d_7 (Conv1D)            (32, 32, 64, 64)          12352     
_________________________________________________________________
conv1d_8 (Conv1D)            (32, 32, 64, 64)          12352     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (32, 32, 1, 64)           0         
_________________________________________________________________
tf_op_layer_Reshape_2 (Tenso [(32, 32, 64)]           

In [29]:
os.path.exists(Logpath)

True