In [1]:
#@title Import Libraries
import numpy as np
import json
import re
import tensorflow as tf
import pandas as pd
import io
import math
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 1500)

In [6]:
#@title Upload Log Files
from google.colab import files
uploaded = files.upload()


Saving Good.log to Good.log
Saving node124.log to node124.log
Saving node125.log to node125.log


In [0]:
#@title Mount Drive
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#@title Preprocess Raw Log Data
#Preprocess the raw log file data

def pre_process_features(logs):
  time_from_start = {}    #Time elapsed since node was booted
  time_in_ms = {}         #Time in milliseconds
  text = {}               #Actual Log text
  text_code = {}          #Log text encoded in numeric form
  
  i = 0
  
  last_was_tx = False
  last_was_rx = False
  
  exp = re.compile(r"\:")
  exp_match_hdlc = re.compile(r"hdlc")
  
  #In the following code we are extracting the relevant parts of the log file data and storing them in a pandas series
  
  for line in logs:
    
    m = exp_match_hdlc.search(line[72 : 90])
    
    #We then encode the log text in a numeric format
    
    if m:
      
      time_from_start[i] = line[52 : 63]
      time_in_ms[i] = (int(time_from_start[i][8 :]) * 1) + (int(time_from_start[i][5 : 7]) * 1000) + (int(time_from_start[i][2 : 4]) * 60000) + (int(time_from_start[i][: 1]) * 3600000)
      
      text[i] = (exp.split(line[63:], maxsplit = 1)[-1]).strip()#[:-1]
      
      if (text[i] == "** Waiting for HDLC event (60000ms) **"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 0
        
      elif(text[i] == "disconnectLink 0x0"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 1
        
      elif(text[i] == "-- HDLC locked --"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 2
        
      elif (text[i] == "Connect HDLC link"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 3
        
      elif (text[i][: 2] == "tx"):
        if (last_was_tx):
          i = i - 1
        
        text_code[i] = 4
        text[i] = "[tx block]"
        last_was_tx = True
        
      elif (text[i][: 2] == "rx"):
        if (last_was_rx):
          i = i - 1
        
        text_code[i] = 5
        text[i] = "[rx block]"
        last_was_rx = True
        
      elif (text[i] == "ACTION_WAIT_FOR_RESPONSE"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 6
        
      elif (text[i] == "HDLC_TYPE_UA"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 7
        
      elif (text[i] == "!! Result == ResultOK !!"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 8
        
      elif (text[i] == "Passing IFrame up the stack to the DLMS task."):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 9
        
      elif (text[i] == "-- unlocking HDLC --"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 10
        
      elif (text[i] == "Disconnect HDLC link"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 11
            
      elif (text[i] == "HDLC_TYPE_I"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 12
        
      else: #text is anything other than the above
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 13
      
      text_code[i] = text_code[i] * 1
      
      i = i + 1
  
  return pd.Series(time_from_start), pd.Series(time_in_ms), pd.Series(text), pd.Series(text_code)

In [0]:
def pre_process_features(logs):   #Preprocess the entire log file
  time_from_start = {}    #Time elapsed since node was booted
  time_in_ms = {}         #Time in milliseconds
  text = {}               #Actual Log text
  text_code = {}          #Log text encoded in numeric form
  line_numbers = {}
  
  i = 0
  
  line_number = 0
  
  last_was_tx = False
  last_was_rx = False
  
  exp = re.compile(r"\:")
  exp_match_hdlc = re.compile(r"hdlc")
  
  #In the following code we are extracting the relevant parts of the log file data and storing them in a pandas series
  
  for line in logs:
    
    line_number = line_number + 1
    
    m = exp_match_hdlc.search(line[72 : 90])
    
    #We then encode the log text in a numeric format
    
    if m:
      
      line_numbers[i] = line_number
      
      time_from_start[i] = line[52 : 63]
      time_in_ms[i] = (int(time_from_start[i][8 :]) * 1) + (int(time_from_start[i][5 : 7]) * 1000) + (int(time_from_start[i][2 : 4]) * 60000) + (int(time_from_start[i][: 1]) * 3600000)
      
      text[i] = (exp.split(line[63:], maxsplit = 1)[-1]).strip()#[:-1]
      
      if (text[i] == "** Waiting for HDLC event (60000ms) **"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 0
        
      elif(text[i] == "disconnectLink 0x0"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 1
        
      elif(text[i] == "-- HDLC locked --"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 2
        
      elif (text[i] == "Connect HDLC link"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 3
        
      elif (text[i][: 2] == "tx"):
        if (last_was_tx):
          i = i - 1
        
        text_code[i] = 4
        text[i] = "[tx block]"
        last_was_tx = True
        
      elif (text[i][: 2] == "rx"):
        if (last_was_rx):
          i = i - 1
        
        text_code[i] = 5
        text[i] = "[rx block]"
        last_was_rx = True
        
      elif (text[i] == "ACTION_WAIT_FOR_RESPONSE"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 6
        
      elif (text[i] == "HDLC_TYPE_UA"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 7
        
      elif (text[i] == "!! Result == ResultOK !!"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 8
        
      elif (text[i] == "Passing IFrame up the stack to the DLMS task."):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 9
        
      elif (text[i] == "-- unlocking HDLC --"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 10
        
      elif (text[i] == "Disconnect HDLC link"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 11
            
      elif (text[i] == "HDLC_TYPE_I"):
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 12
        
      else: #text is anything other than the above
        last_was_tx = False
        last_was_rx = False
        text_code[i] = 13
      
      text_code[i] = text_code[i] * 1
      
      i = i + 1
  
  return pd.Series(line_numbers), pd.Series(time_from_start), pd.Series(time_in_ms), pd.Series(text), pd.Series(text_code)

def create_batches(df, windows, input, output):
  ## Create X         
  x_data = df[:size_train-1] # Select the data
  if((x_data.size % windows != 0)):
     x_data = x_data[: x_data.size - (x_data.size % windows)]
  X_batches = x_data.reshape(-1, windows, input)  # Reshape the data 
  ## Create y
  y_data = df[n_output:size_train]
  if((y_data.size % windows != 0)):
     y_data = y_data[: y_data.size - (y_data.size % windows)]
  y_batches = y_data.reshape(-1, windows, output)
  
  return X_batches, y_batches

def create_series_array_dict(arr):
  
  d = {}
  
  arr = arr[:arr.size - arr.size % (test_windows + 1)]
  
  for i in range(0, arr.size - (test_windows + 1)):
    d[i] = arr[i : (i + (test_windows + 1))]
  
  return d

def create_test_batches (d):
  d_x = {}
  d_y = {}
  
  for i in range(len(d)):
    d_x[i], d_y[i] = create_batches(df = d[i], windows = test_windows,input = 1, output = 1)
  
  return d_x, d_y

def log_alert(line_number):
  print("\nPassed threshold at around line", line_number)
  return

In [0]:
#@title Save the preprocessed data in a Pandas DataFrame
#This part is saving the preprocessed log data into pandas dataframes

path1='node124.log'
path2='node125.log'
path_good = 'Good.log'

log_data1=open(path1,'r')
log_data2=open(path2,'r')
log_data_good = open(path_good, 'r')

line_numbers, time_from_start, time_in_ms, text, text_code = pre_process_features(log_data1)

df1 = pd.DataFrame({'line_numbers': line_numbers, 'time_from_start': time_from_start, 'time_in_ms': time_in_ms, 'text': text, 'text_code': text_code})

line_numbers, time_from_start, time_in_ms, text, text_code = pre_process_features(log_data2)

df2 = pd.DataFrame({'line_numbers': line_numbers, 'time_from_start': time_from_start, 'time_in_ms': time_in_ms, 'text': text, 'text_code': text_code})

line_numbers, time_from_start, time_in_ms, text, text_code = pre_process_features(log_data_good)

df_good = pd.DataFrame({'line_numbers': line_numbers, 'time_from_start': time_from_start, 'time_in_ms': time_in_ms, 'text': text, 'text_code': text_code})

In [8]:
path_good = 'Good.log'
log_data_good = open(path_good, 'r')

line_numbers, time_from_start, time_in_ms, text, text_code = pre_process_features(log_data_good)
df_good = pd.DataFrame({'line_numbers': line_numbers, 'time_from_start': time_from_start, 'time_in_ms': time_in_ms, 'text': text, 'text_code': text_code})

series = np.array(df_good["text_code"])   #Convert the data to a numpy array

n_windows = 20       #number of data elements in a batch
n_input =  1         #for reshaping the data
n_output = 1         #for reshaping the data
size_train = int(len(series) * 4 / 5)

## Split data
train = series[:size_train]
test = series[size_train:]

#To reshape the data into batches

X_batches, y_batches = create_batches(df = train,
                                      windows = n_windows,
                                      input = n_input,
                                      output = n_output)

X_test, y_test = create_batches(df = test, windows = n_windows, input = n_input, output = n_output)

tf.reset_default_graph()    #Reset to the initial graph

r_neuron = 240              #The number of neurons
#num_layers = 10            #The number of hidden layers

#Construct the tensors
X = tf.placeholder(tf.float32, [None, n_windows, n_input])
y = tf.placeholder(tf.float32, [None, n_windows, n_output])
                                                               #These are placeholders which are simply variables to hold values

#Create the model

cell = tf.keras.layers.SimpleRNNCell(units=r_neuron, activation=tf.nn.relu) 
#cell = tf.nn.rnn_cell.LSTMCell(num_units = r_neuron, activation = tf.nn.relu)

#Creating the RNN LSTM cell itself

#cells = tf.keras.layers.StackedRNNCells(cell for _ in range(num_layers))      #Create a multi LSTM RNN

rnn_output, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

#Creates the RNN and returns an output

stacked_rnn_output = tf.reshape(rnn_output, [-1, r_neuron])         #Flattens the tensor to 1D
stacked_outputs = tf.layers.dense(stacked_rnn_output, n_output)
outputs = tf.reshape(stacked_outputs, [-1, n_windows, n_output])   

#Applying transformations to the output received into a usable form

## 3. Loss + optimization
learning_rate = 0.002     #The learning rate
 
#loss = tf.reduce_sum(tf.square(outputs - y)) 
loss = tf.losses.mean_squared_error(y, outputs)      #To calculate Loss on the training set
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)       #The optimizer used  
training_op = optimizer.minimize(loss)             #Create the Training operation

round_op = tf.round(y)    #To round values off
clip_op = tf.clip_by_value(t = y, clip_value_min = 0, clip_value_max = 13)     #To clip values off
loss_test = tf.losses.mean_squared_error(tf.round(y), tf.round(X))     #To calculate Loss on the test set

init = tf.global_variables_initializer()    #To Initialize and reset the model

saver = tf.train.Saver()    #To save the model

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
#@title Initialize Model and Weights
with tf.Session() as sess:
  init.run()
  save_path = saver.save(sess, "/tmp/model.ckpt")

In [11]:
#@title Train Existing Saved Model
epochs = 7500 #@param {type:"integer"}

with tf.Session() as sess:
  saver.restore(sess, "/tmp/model.ckpt")
  
  for epoch in range(epochs):
    sess.run(training_op, feed_dict={X: X_batches, y: y_batches})
    if epoch % 150 == 0:
      mse = loss.eval(feed_dict={X: X_batches, y: y_batches})
      print(epoch, "\tMSE:", mse)
  
  save_path = saver.save(sess, "/tmp/model.ckpt")
  
  y_pred = sess.run(outputs, feed_dict={X: X_test})
  y_pred = sess.run(round_op, feed_dict={y: y_pred})
  y_pred = sess.run(clip_op, feed_dict={y: y_pred})
  test_mse = loss_test.eval(feed_dict={X: y_pred, y: y_test})
  print("Test MSE:", test_mse)
  

INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
0 	MSE: 1.4040793
150 	MSE: 1.3731235
300 	MSE: 1.372521
450 	MSE: 1.3112212
600 	MSE: 1.2905693
750 	MSE: 1.2699399
900 	MSE: 1.2449094
1050 	MSE: 1.3117474
1200 	MSE: 1.2106551
1350 	MSE: 1.1941807
1500 	MSE: 1.1833115
1650 	MSE: 1.1672306
1800 	MSE: 1.1559066
1950 	MSE: 1.1442839
2100 	MSE: 1.1382657
2250 	MSE: 1.1324183
2400 	MSE: 1.1125761
2550 	MSE: 1.103614
2700 	MSE: 1.1221315
2850 	MSE: 1.0979841
3000 	MSE: 1.076002
3150 	MSE: 1.0723139
3300 	MSE: 1.0661075
3450 	MSE: 1.0509571
3600 	MSE: 1.0444288
3750 	MSE: 1.037027
3900 	MSE: 1.0332007
4050 	MSE: 1.0357717
4200 	MSE: 1.0535682
4350 	MSE: 1.0279149
4500 	MSE: 1.0197554
4650 	MSE: 1.0158005
4800 	MSE: 1.0045161
4950 	MSE: 1.0018698
5100 	MSE: 0.99926555
5250 	MSE: 1.0497575
5400 	MSE: 0.9977267
5550 	MSE: 0.9961215
5700 	MSE: 0.9953775
5850 	MSE: 0.9913754
6000 	MSE: 0.9876031
6150 	MSE: 0.9810082
6300 	MSE: 0.99627316
6450 	MSE: 0.97546136
6600 	MSE: 0.9751365
6750 	M

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [0]:
path='node125.log'
log_data=open(path,'r')

line_numbers, time_from_start, time_in_ms, text, text_code = pre_process_features(log_data)
df = pd.DataFrame({'line_numbers': line_numbers, 'time_from_start': time_from_start, 'time_in_ms': time_in_ms, 'text': text, 'text_code': text_code})

test_series = np.array(df["text_code"])

test_windows = 20

series_array_dict = create_series_array_dict(test_series)

n = len(series_array_dict)

test_batch_dict_x, test_batch_dict_y = create_test_batches(series_array_dict)

threshold = 25

with tf.Session() as sess:
  saver.restore(sess, "/tmp/model.ckpt")
  
  logs_stabilized = True
  
  last_passed_threshold_line = 0
  
  y_test_predictions_dict = {}
  
  for i in range(n):
    
    line_number = i + (test_windows + 1)
    
    y_test_predictions_dict[i] = sess.run(outputs, feed_dict={X: test_batch_dict_x[i]})
    y_test_predictions_dict[i] = sess.run(round_op, feed_dict={y: y_test_predictions_dict[i]})
    test_mse = loss_test.eval(feed_dict={X: y_test_predictions_dict[i], y: test_batch_dict_y[i]})
    
    if(test_mse < 10):
      logs_stabilized = True
    
    print("MSE:", test_mse, "\t\tat line\t", df["line_numbers"][i], "\tof the entire log file and at line\t", line_number, "\tof the HDLC logs")
    
    if ((test_mse > threshold) & (logs_stabilized)):
      logs_stabilized = False
      log_alert(df["line_numbers"][i])

MSE: 1.0 		at line	 236 	of the entire log file and at line	 21 	of the HDLC logs
MSE: 3.9 		at line	 239 	of the entire log file and at line	 22 	of the HDLC logs
MSE: 0.9 		at line	 288 	of the entire log file and at line	 23 	of the HDLC logs
MSE: 0.25 		at line	 289 	of the entire log file and at line	 24 	of the HDLC logs
MSE: 0.75 		at line	 291 	of the entire log file and at line	 25 	of the HDLC logs
MSE: 1.05 		at line	 292 	of the entire log file and at line	 26 	of the HDLC logs
MSE: 1.0 		at line	 303 	of the entire log file and at line	 27 	of the HDLC logs
MSE: 0.85 		at line	 304 	of the entire log file and at line	 28 	of the HDLC logs
MSE: 0.05 		at line	 306 	of the entire log file and at line	 29 	of the HDLC logs
MSE: 1.75 		at line	 307 	of the entire log file and at line	 30 	of the HDLC logs
MSE: 1.5 		at line	 318 	of the entire log file and at line	 31 	of the HDLC logs
MSE: 2.1 		at line	 319 	of the entire log file and at line	 32 	of the HDLC logs
MSE: 2.65 