In [112]:
import random
from hdfslogs import HDFSLog
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

In [2]:
# creating an instance of the hdfslogs, we will have to inspect the sequence and the loglines later to 
# tune the values of sequence length and the characters in line. 
hlogs = HDFSLog(train_ratio=0.9, padded_char_len=128, padded_seq_len=32,)

In [3]:
# get_train_test_data_text is to see the texts in sequence , for model training get_train_test_data_num will be used
# the ablation parameter allows to take only 10 number each from positive and negative sequences
x_train, y_train, x_test, y_test = hlogs.get_train_test_data_text(ablation=10)

total number of lines in the log file: 11175629
RAM usage:  91719104
starting training the tokenizer:
ending tokenizer training: 230.0534029006958
vocabulary size: 51
RAM usage:  48
vocabulary size: 51
completed:  0
ending blk sequencing: 0.0
completed:  1000000
ending blk sequencing: 2.0652763843536377
completed:  2000000
ending blk sequencing: 4.019099473953247
completed:  3000000
ending blk sequencing: 5.840413808822632
completed:  4000000
ending blk sequencing: 7.860861301422119
completed:  5000000
ending blk sequencing: 9.660545110702515
completed:  6000000
ending blk sequencing: 11.43095850944519
completed:  7000000
ending blk sequencing: 13.614781141281128
completed:  8000000
ending blk sequencing: 15.432470321655273
completed:  9000000
ending blk sequencing: 17.181137800216675
completed:  10000000
ending blk sequencing: 19.0540292263031
completed:  11000000
ending blk sequencing: 21.314929008483887
RAM usage:  184627341
RAM usage:  16
getting ablation data: 10
10 16828
free ram

In [4]:
# running get_train_test_data_text will provide us seq_of_log_texts
# we can load it in a dataframe to observe how a positve and negative sequence looks line
df = hlogs.seq_of_log_texts
df

Unnamed: 0,BlockId,LogSequence,Label
0,blk_-1608999687919862906,[081109 203518 143 info dfs.datanode$dataxceiv...,0
1,blk_7503483334202473044,[081109 203520 142 info dfs.datanode$dataxceiv...,0
2,blk_-3544583377289625738,[081109 203521 145 info dfs.datanode$dataxceiv...,1
3,blk_-9073992586687739851,[081109 203523 143 info dfs.datanode$dataxceiv...,0
4,blk_7854771516489510256,[081109 203529 148 info dfs.datanode$dataxceiv...,0
...,...,...,...
575056,blk_1019720114020043203,[081111 110351 27174 info dfs.datanode$dataxce...,0
575057,blk_-2683116845478050414,[081111 110359 26685 info dfs.datanode$dataxce...,0
575058,blk_5595059397348477632,[081111 110402 27311 info dfs.datanode$dataxce...,0
575059,blk_1513937873877967730,[081111 110412 27231 info dfs.datanode$dataxce...,0


In [6]:
# lets filter only those sequences which are anomalous
dfpos=None
dfpos= df[df.Label == 1]
# filterted rows will have the original index , for ease of slicing we reset the index staring fron 0
dfpos = dfpos.reset_index()
# slice a single sequence
single_sequence = dfpos.iloc[0:1, 2:]['LogSequence'].to_list()
# observe its length
len(single_sequence[0])
# take all the values in a list and see the total number
pos_seqs = dfpos.LogSequence.values
print('total positive sequences', len(pos_seqs))

In [61]:
# see the average, max length of the positive sequence also see how many sequences are having length higher than the average or a cutoff value we decide
from  statistics import mean, mode
seq_len = [len(seq) for seq in  pos_seqs]
print('average len of seq: ',mean(seq_len))
print('most frequent len of seq: ', mode(seq_len))
print('max len of seq: ', max(seq_len))
# print(seq_len)
from collections import Counter
cnt = Counter(seq_len)
print('seq len distribution: ', cnt)
def check_losses(seqlen):
    num_of_datapoints = []
    for k,v in cnt.items():
        if k > seqlen:
            num_of_datapoints.append(v)
    print(f'{sum(num_of_datapoints)} number of seq whose len is higer than {seqlen} ', )
# put a cutoff value as the param to the function to check how many sequences will be stripped off if we decide to keep the max of the sequence 
# equal to the cutoff value
check_losses(32)

average len of seq:  17.119016510274378
most frequent len of seq:  20
max len of seq:  284
seq len distribution:  Counter({20: 3717, 4: 3225, 2: 2950, 27: 1036, 26: 755, 21: 616, 29: 612, 22: 408, 28: 361, 41: 358, 30: 349, 24: 341, 23: 289, 25: 245, 33: 228, 32: 198, 42: 176, 31: 171, 38: 156, 14: 133, 39: 87, 16: 87, 34: 65, 40: 42, 43: 42, 37: 35, 36: 32, 35: 18, 44: 16, 17: 12, 19: 11, 15: 11, 8: 10, 13: 10, 45: 7, 3: 6, 46: 4, 48: 2, 53: 2, 12: 2, 223: 1, 61: 1, 51: 1, 50: 1, 52: 1, 54: 1, 56: 1, 55: 1, 230: 1, 278: 1, 49: 1, 57: 1, 284: 1})
1283 number of seq whose len is higer than 32 


In [None]:
# we observe that with 64 as the max len , only 4 numbers of sequences will be truncated
# with 40- 50 it is going to be 800 to 21 
# with 32 we  around 1238 number of sequences will be truncated
# So if there is a sequence which has 64 lines, as per the 'pre' truncate configuration initial 32 lines will be truncated
# where as if there is a sequence which is having 16 lines, last 16 lines will be filled with zero
# if we decide to take 32 as the max lentgh , we have not estimated how many require padding but 1238 total lines will be truncated 
# .... many sequences will be impacted by the truncating

In [73]:
# how a positive sequence look like? what makes it anomalous? 
pseq = pos_seqs[1]
print('len of seqence', len(pseq))
for i,l in enumerate(pseq):
    print(f'{i}:  {l}')

len of seqence 40
0:  081109 203530 149 info dfs.datanode$dataxceiver: receiving block blk_-8531310335568756456 src: /10.251.203.149:53912 dest: /10.251.203.149:50010
1:  081109 203530 28 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand/_temporary/_task_200811092030_0001_m_000007_0/part-00007. blk_-8531310335568756456
2:  081109 203532 146 info dfs.datanode$dataxceiver: receiving block blk_-8531310335568756456 src: /10.251.106.10:36502 dest: /10.251.106.10:50010
3:  081109 203532 151 info dfs.datanode$dataxceiver: receiving block blk_-8531310335568756456 src: /10.251.203.149:59042 dest: /10.251.203.149:50010
4:  081109 203628 150 info dfs.datanode$packetresponder: packetresponder 0 for block blk_-8531310335568756456 terminating
5:  081109 203628 150 info dfs.datanode$packetresponder: received block blk_-8531310335568756456 of size 67108864 from /10.251.106.10
6:  081109 203628 152 info dfs.datanode$packetresponder: packetresponder 2 for block blk_-853131033556875

In [337]:
# we can observe that following parts of the texts might be the reason for this sequence being anomaly
#type_1_line: warn dfs.datanode$dataxceiver ..... got exception while serving blk
type_1_line = '081109 213852 2573 warn dfs.datanode$dataxceiver: 10.251.203.149:50010:got exception while serving blk_-8531310335568756456 to /10.251.39.144:'
print(f'len of the type_1_line: : {len(type_1_line)}')
# type_2_line: info dfs.fsnamesystem: block* namesystem.addstoredblock: addstoredblock request received for blk_-8531310335568756456 on 10.251.106.10:50010 size 67108864 but it does not belong to any file
type_2_line = '081110 104628 27 info dfs.fsnamesystem: block* namesystem.addstoredblock: addstoredblock request received for blk_-8531310335568756456 on 10.251.106.10:50010 size 67108864 but it does not belong to any file.'
print(f'len of the type_2_line: : {len(type_2_line)}')
# first line severity level is warn line toward the end having info severity
#############################################
#On the otherhand the initial lines in the sequence are actually indicating sucess and normal bevaviour
# verification succeeded for blk_-8531310335568756456
# These lines are at the begining of the sequences , upto 16th line they are normal.
# Indicates that in a sequence of 40 lines, even if set maxlen = 32 , initial 8 lines will be truncated, retaining 8 normal and rest of the abnormal lines. 
################################
# We can't pad the log text directly, the pad_sequence method does not seem to work on raw text as shown in the value error statement
# pad_sequences(pseq, maxlen=32, truncating='pre', padding='post')
# ValueError: invalid literal for int() with base 10: '47038 dest: /10.251.71.193:50010'
# so we converted each characters of  both the type_1_line and type_2_line to number: 
# type_1_line = pseq[16:17]
# print('type_1_line: ', type_1_line)
tknum = tk.texts_to_sequences([type_1_line])
print('type_1_line line after num conversion: ', tknum)
print('len of type_1_line  after num conversion', len(tknum))
# type_2_line = pseq[-3:-2]
# print('type_2_line: ', type_2_line)
tknum = tk.texts_to_sequences([type_2_line])
print('type_2_line after num conversion: ', tknum)
print('len of type_2_line after num conversion', len(tknum))

len of the type_1_line: : 142
len of the type_2_line: : 207
type_1_line line after num conversion:  [[4, 12, 3, 3, 4, 21, 2, 7, 3, 16, 12, 10, 7, 2, 7, 10, 19, 16, 2, 43, 11, 24, 17, 2, 6, 22, 8, 14, 6, 11, 13, 11, 17, 9, 6, 5, 36, 6, 11, 13, 11, 41, 26, 5, 20, 33, 5, 24, 28, 2, 3, 4, 14, 7, 10, 3, 14, 7, 4, 16, 14, 3, 15, 21, 28, 10, 4, 4, 3, 4, 28, 37, 9, 13, 2, 5, 41, 26, 5, 31, 13, 20, 9, 17, 2, 43, 42, 20, 23, 5, 2, 8, 5, 24, 33, 20, 17, 37, 2, 27, 23, 25, 32, 35, 12, 10, 16, 3, 16, 3, 4, 16, 16, 10, 10, 18, 12, 19, 10, 18, 15, 10, 18, 2, 13, 9, 2, 30, 3, 4, 14, 7, 10, 3, 14, 16, 21, 14, 3, 15, 15, 28]]
len of type_1_line  after num conversion 1
type_2_line after num conversion:  [[4, 12, 3, 3, 3, 4, 2, 3, 4, 15, 18, 7, 12, 2, 7, 19, 2, 20, 17, 22, 9, 2, 6, 22, 8, 14, 22, 8, 17, 11, 29, 5, 8, 34, 8, 13, 5, 29, 28, 2, 27, 23, 9, 26, 25, 39, 2, 17, 11, 29, 5, 8, 34, 8, 13, 5, 29, 14, 11, 6, 6, 8, 13, 9, 24, 5, 6, 27, 23, 9, 26, 25, 28, 2, 11, 6, 6, 8, 13, 9, 24, 5, 6, 27, 23, 9, 26,

In [None]:
# type_1_line which contains more negative words e.g.,  'warn', and  'exception' is shorter length, 142 characters compared to the type_2_line whose length is 207
# actually for this type_1_line, only the severity and the message part is only sufficient numbers, meta information e.g., time stamp and other numercial information is not playing nay role
# The timestamp and blk_id, source(dfs.datanode$dataxceiver) and the ip address are potentially not carrying any valuable information that can indicate anomaly. We will have to reflect on this. We have a potential to save around 50-60 chacters. 
# for type_2_line,  the possible anomalous part is " does not belong to any file". This appears at the end of the line.
# This indicates if we have to adopt a truncating strategy, it is better to truncate the first part of the message which carries time, source and severtiy. 
# The valualbe part we loose is severity.However for type 2, severity='info' does not indicate anomaly.  for type1, severity='warn' will be retained as long as the maxlen is around 140
# checking another sequence just to ensure repeatable pattern for type_1_line and  type_2_line in an anomalous sequnce of lines

In [124]:
def inspect_sequence_by_seq_len(slen, print_indexes=False):
    seq_len_40_indexes = [i for i,v in enumerate(pos_seqs) if len(v)==slen]
    if print_indexes:
        print('seq_len_40_indexes: ', seq_len_40_indexes)
    print('number of indexes: ', len(seq_len_40_indexes))
    randomly_selected_index = random.choice(seq_len_40_indexes)
    print('chosen random index: ', randomly_selected_index)
    pseq = pos_seqs[randomly_selected_index]
    print('len of seqence', len(pseq))
    for i,l in enumerate(pseq):
        print(f'{i}:  {l}')        

In [115]:
# let us check  when the sequence len = 40, whether type_1_line and type_2_line pattern appears. 
inspect_sequence_by_seq_len(40)
# checking another sequence just to ensure repeatable pattern for type_1_line and  type_2_line in an anomalous sequnce of lines
# refresh this cell, it will randomly show one sequence from the list of 42 sequence, all of these 42 sequences  have 40 lines

seq_len_40_indexes:  [1, 2, 5, 7, 6201, 6206, 6209, 6381, 6394, 6398, 6399, 6403, 6404, 6417, 6426, 6428, 6458, 6481, 6513, 6636, 6673, 6734, 6739, 6747, 6755, 6862, 6945, 6969, 6971, 6981, 6984, 7062, 7072, 7248, 7273, 7325, 7327, 7398, 7446, 7475, 7582, 13751]
number of indexes:  42
chosen random index:  7325
len of seqence 40
0:  081110 122453 11016 info dfs.datanode$dataxceiver: receiving block blk_-8833916647701959447 src: /10.251.123.33:60036 dest: /10.251.123.33:50010
1:  081110 122453 30 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/randtxt2/_temporary/_task_200811101024_0002_m_001489_0/part-01489. blk_-8833916647701959447
2:  081110 122453 8602 info dfs.datanode$dataxceiver: receiving block blk_-8833916647701959447 src: /10.251.123.33:55473 dest: /10.251.123.33:50010
3:  081110 122454 10971 info dfs.datanode$dataxceiver: receiving block blk_-8833916647701959447 src: /10.250.15.240:56807 dest: /10.250.15.240:50010
4:  081110 122544 10972 info dfs.datanode$p

In [111]:
### for sequnce index 6969 we observed type_1_lines are 13, 32
### this sequence has one additonal lines which has negative words potentiall caryying anomalous information
# type_3_line:
# 34:  081111 023021 27 info dfs.fsnamesystem: block* namesystem.delete: blk_-702618009724393925 is added to invalidset of 10.250.6.214:50010
# 35:  081111 023021 27 info dfs.fsnamesystem: block* namesystem.delete: blk_-702618009724393925 is added to invalidset of 10.251.123.33:50010
# 36:  081111 023021 27 info dfs.fsnamesystem: block* namesystem.delete: blk_-702618009724393925 is added to invalidset of 10.251.70.37:50010
# type_3_line
type_3_line = '081111 023021 27 info dfs.fsnamesystem: block* namesystem.delete: blk_-702618009724393925 is added to invalidset of 10.251.70.37:50010'
print(f'length of type_3_line: {len(type_3_line)}')
# we observed few sequences e.g index 6755, which is having only type_3_line but does not have type_1 or type_2 line
# mostly i observed that there is at least one 'warn' severity with type_1_line then last few lines will be type_3_line 
#length of type_3_line = 134

length of type_3_line: 134


In [126]:
# lets check when the seq len is 29
inspect_sequence_by_seq_len(29)

number of indexes:  612
chosen random index:  4057
len of seqence 29
0:  081110 013053 5573 info dfs.datanode$dataxceiver: receiving block blk_9220264484457339243 src: /10.251.75.163:38278 dest: /10.251.75.163:50010
1:  081110 013054 28 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/randtxt/_temporary/_task_200811092030_0003_m_001179_0/part-01179. blk_9220264484457339243
2:  081110 013054 5483 info dfs.datanode$dataxceiver: receiving block blk_9220264484457339243 src: /10.251.75.163:57319 dest: /10.251.75.163:50010
3:  081110 013054 5611 info dfs.datanode$dataxceiver: receiving block blk_9220264484457339243 src: /10.251.71.68:37545 dest: /10.251.71.68:50010
4:  081110 013125 27 info dfs.fsnamesystem: block* namesystem.addstoredblock: blockmap updated: 10.251.39.192:50010 is added to blk_9220264484457339243 size 67108864
5:  081110 013125 5612 info dfs.datanode$packetresponder: packetresponder 0 for block blk_9220264484457339243 terminating
6:  081110 013125 5612 inf

In [117]:
# for index 12107,  we found all lines are having severity 'info' only one line is having 'warn'
type_4_line = '081111 075722 19 warn dfs.fsdataset: unexpected error trying to delete block blk_-8678353350316563645. blockinfo not found in volumemap.'
print(f'lenght of type_4_line: {len(type_4_line)}')
# type_4_line length is 136
# for index 12767, 4981 it contains type_1 , type_3 and type_4
# 14:  081110 054026 6976 warn dfs.datanode$dataxceiver: 10.250.15.67:50010:got exception while serving blk_8107047540607272383 to /10.251.203.4:
# 22:  081110 103047 34 info dfs.fsnamesystem: block* namesystem.delete: blk_8107047540607272383 is added to invalidset of 10.250.10.176:50010
# 28:  081110 103806 19 warn dfs.fsdataset: unexpected error trying to delete block blk_8107047540607272383. blockinfo not found in volumemap.

# for index 200 there is no 'warn' severity , all are 'info' level severity , yet it is anomalous since there is type_3_line. 
# 23:  081110 103057 30 info dfs.fsnamesystem: block* namesystem.delete: blk_-8213344449220111733 is added to invalidset of 10.251.193.224:50010
# 24:  081110 103057 30 info dfs.fsnamesystem: block* namesystem.delete: blk_-8213344449220111733 is added to invalidset of 10.251.194.147:50010
# 25:  081110 103057 30 info dfs.fsnamesystem: block* namesystem.delete: blk_-8213344449220111733 is added to invalidset of 10.251.30.134:50010

lenght of type_4_line: 136


In [149]:
# lets check those with length 20 , it is the highest ranked. 
inspect_sequence_by_seq_len(20)

number of indexes:  3717
chosen random index:  7655
len of seqence 20
0:  081110 142509 12093 info dfs.datanode$dataxceiver: receiving block blk_8839449461991208613 src: /10.251.203.149:34091 dest: /10.251.203.149:50010
1:  081110 142509 12276 info dfs.datanode$dataxceiver: receiving block blk_8839449461991208613 src: /10.251.203.149:53303 dest: /10.251.203.149:50010
2:  081110 142509 26 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/sortrand2/_temporary/_task_200811101024_0003_r_000356_0/part-00356. blk_8839449461991208613
3:  081110 142510 12031 info dfs.datanode$dataxceiver: receiving block blk_8839449461991208613 src: /10.251.122.65:52701 dest: /10.251.122.65:50010
4:  081110 142527 12032 info dfs.datanode$packetresponder: packetresponder 0 for block blk_8839449461991208613 terminating
5:  081110 142527 12032 info dfs.datanode$packetresponder: received block blk_8839449461991208613 of size 67108864 from /10.251.122.65
6:  081110 142527 12094 info dfs.datanode$pa

In [178]:
# for seq len 20 mostly it is only type_4_line error
# let us check those with low sequece length stariting from 2 
inspect_sequence_by_seq_len(2)

number of indexes:  2950
chosen random index:  11112
len of seqence 2
0:  081111 041153 19740 info dfs.datanode$dataxceiver: receiving block blk_252726837417797968 src: /10.251.91.84:37319 dest: /10.251.91.84:50010
1:  081111 041153 27 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/randtxt4/_temporary/_task_200811101024_0010_m_001177_0/part-01177. blk_252726837417797968


In [None]:
# there is no error as such but it seems that this block is not showing a stanard lifecycle. So anything this short is anomalous

In [213]:
# lets continue with 4 since 3 is not presenetn in the counter distrubution 
inspect_sequence_by_seq_len(4)

number of indexes:  3225
chosen random index:  8894
len of seqence 4
0:  081110 212832 14963 info dfs.datanode$dataxceiver: receiving block blk_-3062036611573455407 src: /10.251.89.155:33118 dest: /10.251.89.155:50010
1:  081110 212832 15064 info dfs.datanode$dataxceiver: receiving block blk_-3062036611573455407 src: /10.251.89.155:41623 dest: /10.251.89.155:50010
2:  081110 212832 35 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand3/_temporary/_task_200811101024_0005_m_000773_0/part-00773. blk_-3062036611573455407
3:  081110 212835 15064 info dfs.datanode$dataxceiver: writeblock blk_-3062036611573455407 received exception java.io.ioexception: could not read from stream


In [408]:
type_5_line = '081110 013835 5479 info dfs.datanode$dataxceiver: writeblock blk_4010936997666386560 received exception java.io.ioexception: could not read from stream'
print(f'length of type_5_line: {len(type_5_line)}')
# for index - 15033 , severity level is normal but there negative words in the message part e.g., 'exception', 'could not read' . 
# as i randomly tested it for 10 times , this pattern repeats

length of type_5_line: 151


In [227]:
inspect_sequence_by_seq_len(8)

number of indexes:  10
chosen random index:  10846
len of seqence 8
0:  081111 033441 18517 info dfs.datanode$dataxceiver: receiving block blk_-7228834955740084829 src: /10.251.203.246:40239 dest: /10.251.203.246:50010
1:  081111 033441 19081 info dfs.datanode$dataxceiver: receiving block blk_-7228834955740084829 src: /10.251.203.246:45900 dest: /10.251.203.246:50010
2:  081111 033441 34 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand4/_temporary/_task_200811101024_0009_m_001861_1/part-01861. blk_-7228834955740084829
3:  081111 033541 18538 info dfs.datanode$packetresponder: packetresponder 2 for block blk_-7228834955740084829 terminating
4:  081111 033541 18538 info dfs.datanode$packetresponder: packetresponder blk_-7228834955740084829 2 exception java.io.eofexception
5:  081111 033541 19081 info dfs.datanode$dataxceiver: writeblock blk_-7228834955740084829 received exception java.net.sockettimeoutexception
6:  081111 033606 18517 info dfs.datanode$blockreceive

In [409]:
# for index 4, 
type_6_line = '081109 205627 784 info dfs.datanode$blockreceiver: exception in receiveblock for block blk_1528078116812077719 java.io.ioexception: connection reset by peer'
print(f'length of type_6_line: {len(type_6_line)}')
# either type_5 or both type_5 and type6 or both appears . Tested several times and seen this pattern  

length of type_6_line: 156


In [245]:
inspect_sequence_by_seq_len(12)

number of indexes:  2
chosen random index:  10737
len of seqence 12
0:  081111 033052 18866 info dfs.datanode$dataxceiver: receiving block blk_4069624653932564389 src: /10.251.30.85:54603 dest: /10.251.30.85:50010
1:  081111 033052 18901 info dfs.datanode$dataxceiver: receiving block blk_4069624653932564389 src: /10.251.38.197:58105 dest: /10.251.38.197:50010
2:  081111 033052 18937 info dfs.datanode$dataxceiver: receiving block blk_4069624653932564389 src: /10.251.30.85:59419 dest: /10.251.30.85:50010
3:  081111 033052 28 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand4/_temporary/_task_200811101024_0009_m_001768_1/part-01768. blk_4069624653932564389
4:  081111 033737 18901 info dfs.datanode$blockreceiver: exception in receiveblock for block blk_4069624653932564389 java.io.eofexception
5:  081111 033737 18901 info dfs.datanode$dataxceiver: writeblock blk_4069624653932564389 received exception java.io.eofexception
6:  081111 033737 18902 info dfs.datanode$packet

In [410]:
type_7_line = '081111 033714 18697 info dfs.datanode$blockreceiver: exception in receiveblock for block blk_-370867073208864215 java.io.eofexception'
print(f'length of type_7_line: {len(type_7_line)}')
type_8_line = '081111 033714 18776 info dfs.datanode$packetresponder: packetresponder blk_-370867073208864215 1 exception java.io.interruptedioexception: interruped while waiting for io on channel java.nio.channels.socketchannel[connected local=/10.251.126.227:37330 remote=/10.251.30.179:50010]. 36411 millis timeout left.'
print(f'length of type_8_line: {len(type_8_line)}')

length of type_7_line: 133
length of type_8_line: 308


In [255]:
inspect_sequence_by_seq_len(13)

number of indexes:  10
chosen random index:  10078
len of seqence 13
0:  081110 230556 16924 info dfs.datanode$dataxceiver: receiving block blk_3709643589615046481 src: /10.251.193.224:50642 dest: /10.251.193.224:50010
1:  081110 230556 27 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/randtxt3/_temporary/_task_200811101024_0007_m_001769_1/part-01769. blk_3709643589615046481
2:  081110 230557 12687 info dfs.datanode$dataxceiver: receiving block blk_3709643589615046481 src: /10.251.122.79:44803 dest: /10.251.122.79:50010
3:  081110 230557 16918 info dfs.datanode$dataxceiver: receiving block blk_3709643589615046481 src: /10.251.193.224:39077 dest: /10.251.193.224:50010
4:  081110 230657 16918 info dfs.datanode$dataxceiver: writeblock blk_3709643589615046481 received exception java.net.sockettimeoutexception: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.socketchannel[connected local=/10.251.122.79:44803 remote=/10.251.123.

In [411]:
type_9_line = '081110 230655 16876 info dfs.datanode$dataxceiver: writeblock blk_-539148231797001003 received exception java.net.sockettimeoutexception: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.socketchannel[connected local=/10.251.194.213:57652 remote=/10.251.123.33:50010]'
print(f'length of type_9_line: {len(type_9_line)}')
type_10_line = '081110 230705 12694 info dfs.datanode$packetresponder: packetresponder 0 for block blk_-539148231797001003 interrupted.'
print(f'length of type_10_line: {len(type_10_line)}')

length of type_9_line: 310
length of type_10_line: 119


In [263]:
inspect_sequence_by_seq_len(14)

number of indexes:  133
chosen random index:  11018
len of seqence 14
0:  081111 040050 19349 info dfs.datanode$dataxceiver: receiving block blk_-5153761630904559883 src: /10.251.214.175:45215 dest: /10.251.214.175:50010
1:  081111 040050 34 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/randtxt4/_temporary/_task_200811101024_0010_m_000496_0/part-00496. blk_-5153761630904559883
2:  081111 040053 18809 info dfs.datanode$dataxceiver: receiving block blk_-5153761630904559883 src: /10.251.214.112:53071 dest: /10.251.214.112:50010
3:  081111 040053 19473 info dfs.datanode$dataxceiver: receiving block blk_-5153761630904559883 src: /10.251.214.175:56235 dest: /10.251.214.175:50010
4:  081111 040145 18810 info dfs.datanode$packetresponder: packetresponder 0 for block blk_-5153761630904559883 terminating
5:  081111 040145 18810 info dfs.datanode$packetresponder: received block blk_-5153761630904559883 of size 67108864 from /10.251.214.112
6:  081111 040145 19350 info dfs.dat

In [412]:
type_11_line = '081111 085518 33 warn dfs.fsnamesystem: block* namesystem.addstoredblock: redundant addstoredblock request received for blk_-8803822423732992516 on 10.251.74.79:50010 size 3558167'
print(f'length of type_11_line: {len(type_11_line)}')

length of type_11_line: 179


In [278]:
inspect_sequence_by_seq_len(15)

number of indexes:  11
chosen random index:  15539
len of seqence 15
0:  081111 080829 34 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand7/_temporary/_task_200811101024_0014_m_000287_0/part-00287. blk_2429961036831384938
1:  081111 080830 24123 info dfs.datanode$dataxceiver: receiving block blk_2429961036831384938 src: /10.251.111.209:45842 dest: /10.251.111.209:50010
2:  081111 080830 24159 info dfs.datanode$dataxceiver: receiving block blk_2429961036831384938 src: /10.251.111.209:60820 dest: /10.251.111.209:50010
3:  081111 080830 24196 info dfs.datanode$dataxceiver: receiving block blk_2429961036831384938 src: /10.251.127.47:42224 dest: /10.251.127.47:50010
4:  081111 080833 24197 info dfs.datanode$packetresponder: packetresponder 0 for block blk_2429961036831384938 terminating
5:  081111 080833 24197 info dfs.datanode$packetresponder: received block blk_2429961036831384938 of size 3549062 from /10.251.127.47
6:  081111 080833 26 info dfs.fsnamesystem: block*

In [413]:
# for index - 15539 ,nothing seems to be an error . however we get lines with terminating and hence marking it as type_12
type_12_line = '081111 083917 24782 info dfs.datanode$packetresponder: packetresponder 2 for block blk_6956386314858320497 terminating'

In [291]:
inspect_sequence_by_seq_len(17)

number of indexes:  12
chosen random index:  16837
len of seqence 17
0:  081111 110413 26687 info dfs.datanode$dataxceiver: receiving block blk_-9128742458709757181 src: /10.251.203.179:60748 dest: /10.251.203.179:50010
1:  081111 110413 26949 info dfs.datanode$dataxceiver: receiving block blk_-9128742458709757181 src: /10.251.203.179:38219 dest: /10.251.203.179:50010
2:  081111 110413 27424 info dfs.datanode$dataxceiver: receiving block blk_-9128742458709757181 src: /10.250.14.224:43759 dest: /10.250.14.224:50010
3:  081111 110413 33 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/randtxt9/_temporary/_task_200811101024_0016_m_002002_0/part-02002. blk_-9128742458709757181
4:  081111 110423 26688 info dfs.datanode$packetresponder: packetresponder 2 for block blk_-9128742458709757181 terminating
5:  081111 110423 26688 info dfs.datanode$packetresponder: received block blk_-9128742458709757181 of size 49648902 from /10.251.203.179
6:  081111 110423 26950 info dfs.datano

In [415]:
type_13_line = '081111 050934 16 warn dfs.pendingreplicationblocks$pendingreplicationmonitor: pendingreplicationmonitor timed out block blk_274401605260075764'
print(f'length of type_13_line: {len(type_13_line)}')
# apart fro type_13 , we have index-15546 where again nor error was found excep type_12 terminating statements
# for index - 16837 , 
type_14_line = '081109 213841 28 info dfs.fsnamesystem: block* namesystem.addstoredblock: addstoredblock request received for blk_5398314277015661293 on 10.251.123.33:50010 size 67108864 but it does not belong to any file.'
print(f'length of type_14_line: {len(type_14_line)}')

length of type_13_line: 142
length of type_14_line: 206


In [329]:
inspect_sequence_by_seq_len(19)

number of indexes:  11
chosen random index:  15903
len of seqence 19
0:  081111 085839 19415 info dfs.datanode$dataxceiver: receiving block blk_9173199815015538212 src: /10.251.39.160:45335 dest: /10.251.39.160:50010
1:  081111 085839 33 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand7/_temporary/_task_200811101024_0014_m_002012_0/part-02012. blk_9173199815015538212
2:  081111 085909 19416 info dfs.datanode$packetresponder: packetresponder 2 for block blk_9173199815015538212 terminating
3:  081111 085909 19416 info dfs.datanode$packetresponder: received block blk_9173199815015538212 of size 1781279 from /10.251.39.160
4:  081111 085909 19447 info dfs.datanode$dataxceiver: receiving block blk_9173199815015538212 src: /10.251.39.160:48163 dest: /10.251.39.160:50010
5:  081111 085909 19448 info dfs.datanode$packetresponder: packetresponder 1 for block blk_9173199815015538212 terminating
6:  081111 085909 19448 info dfs.datanode$packetresponder: received block blk_9

In [416]:
# for index 13540 type_12_line  - 4:  081111 065201 22049 info dfs.datanode$packetresponder: packetresponder 2 for block blk_-5236249422860631106 terminating
# for index 13540  - type_14_line - this pattern found with several indexes
type_15_line = '081111 080945 23627 info dfs.datanode$blockreceiver: receiving empty packet for block blk_2735204420813474626'
# type 15  i have lost it somehow, shoukd have noted the index , keeping it here , if we get it later , we must note the index

In [417]:
err_lines = [type_1_line, type_2_line, type_3_line, type_4_line, type_5_line, type_6_line, type_7_line, type_8_line, type_9_line, type_10_line, type_11_line, type_12_line, type_13_line, type_14_line, type_15_line]
for i, err in enumerate(err_lines):
    print(f'err-{i+1}:  length: {len(err)},  {err}')

err-1:  length: 142,  081109 213852 2573 warn dfs.datanode$dataxceiver: 10.251.203.149:50010:got exception while serving blk_-8531310335568756456 to /10.251.39.144:
err-2:  length: 207,  081110 104628 27 info dfs.fsnamesystem: block* namesystem.addstoredblock: addstoredblock request received for blk_-8531310335568756456 on 10.251.106.10:50010 size 67108864 but it does not belong to any file.
err-3:  length: 134,  081111 023021 27 info dfs.fsnamesystem: block* namesystem.delete: blk_-702618009724393925 is added to invalidset of 10.251.70.37:50010
err-4:  length: 136,  081111 075722 19 warn dfs.fsdataset: unexpected error trying to delete block blk_-8678353350316563645. blockinfo not found in volumemap.
err-5:  length: 151,  081110 013835 5479 info dfs.datanode$dataxceiver: writeblock blk_4010936997666386560 received exception java.io.ioexception: could not read from stream
err-6:  length: 156,  081109 205627 784 info dfs.datanode$blockreceiver: exception in receiveblock for block blk_15

In [None]:
# looking at this error lines we can see
# shortest of them is 122 . WHile the longest is 314 
# with 128 pre truncate we can retain at least the message part which anyways contain the anomaly information. 
# keeping the length more and having more padding is good or shorter line with more trancated character is good ?
# The 3rd startegy is to remove selected words and numbers - blk address and source and ip address are three good candidates for sure

In [443]:
import re
txt = type_9_line
def remove_unwanted_characters_n_words(txt_line):
    print(f'original Line: {txt_line}, original length: {len(txt_line)}' )   
    rm_time_stamp=True
    rm_msg_source=True
    rm_blk_ids_regex=False
    rm_ip_address=True
    rm_signs_n_punctuations=True
    rm_white_space=True

    time_stamp = ''
    msg_source = ''
    blk_ids_regex = ''
    ip_address = ''
    signs_n_punctuations = ''
    white_space = ''

    if rm_time_stamp:
        time_stamp = '^\d+\s\d+\s\d+' 
    if rm_msg_source:
        msg_source = 'dfs\.\w+[$]\w+:|dfs\.\w+:'
    if rm_blk_ids_regex:
       # blk_ids_regex = 'blk_-\d+\.?'
       blk_ids_regex = 'blk_-?\d+\.?'
    if rm_ip_address:
        ip_address = '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:*\d*'
    if rm_signs_n_punctuations:
        signs_n_punctuations = '\]|\[|\)|\(|\=|\,|\;|\/'
    if rm_white_space:
        white_space = '\s'

    pat = f'{time_stamp}|{msg_source}|{blk_ids_regex}|{ip_address}|{signs_n_punctuations}|{white_space}'
    # print('regex pattern:', pat)
    # s = re.sub('\]|\[|\)|\(|\=|\,|\;|\/|blk_-\d+\.*|\sdfs\.\w+[$]\w+:|\sdfs\.\w+:|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+', '', txt)
    # s = re.sub('\s|\]|\[|\)|\(|\=|\,|\;|\/|blk_-\d+\.*|dfs\.\w+[$]\w+:|dfs\.\w+:|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+|^\d+\s\d+\s\d+', '', txt)
    s = re.sub(pat, '', txt_line)
    print(f'cleaned line: {s},  cleaned length: {len(s)}')
    print()
    # print('length after regex removal: ', len(s))
    # s1 = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', txt))
    # print(s1)

In [444]:
 remove_unwanted_characters_n_words(type_6_line)

original Line: 081109 205627 784 info dfs.datanode$blockreceiver: exception in receiveblock for block blk_1528078116812077719 java.io.ioexception: connection reset by peer, original length: 156
cleaned line: infoexceptioninreceiveblockforblockblk_1528078116812077719java.io.ioexception:connectionresetbypeer,  cleaned length: 99



In [445]:
 remove_unwanted_characters_n_words(type_4_line)

original Line: 081111 075722 19 warn dfs.fsdataset: unexpected error trying to delete block blk_-8678353350316563645. blockinfo not found in volumemap., original length: 136
cleaned line: warnunexpectederrortryingtodeleteblockblk_-8678353350316563645.blockinfonotfoundinvolumemap.,  cleaned length: 92



In [446]:
for eline in err_lines:
    remove_unwanted_characters_n_words(eline)

original Line: 081109 213852 2573 warn dfs.datanode$dataxceiver: 10.251.203.149:50010:got exception while serving blk_-8531310335568756456 to /10.251.39.144:, original length: 142
cleaned line: warn:gotexceptionwhileservingblk_-8531310335568756456to,  cleaned length: 55

original Line: 081110 104628 27 info dfs.fsnamesystem: block* namesystem.addstoredblock: addstoredblock request received for blk_-8531310335568756456 on 10.251.106.10:50010 size 67108864 but it does not belong to any file., original length: 207
cleaned line: infoblock*namesystem.addstoredblock:addstoredblockrequestreceivedforblk_-8531310335568756456onsize67108864butitdoesnotbelongtoanyfile.,  cleaned length: 134

original Line: 081111 023021 27 info dfs.fsnamesystem: block* namesystem.delete: blk_-702618009724393925 is added to invalidset of 10.251.70.37:50010, original length: 134
cleaned line: infoblock*namesystem.delete:blk_-702618009724393925isaddedtoinvalidsetof,  cleaned length: 72

original Line: 081111 075722 1

In [40]:
pseq = pos_seqs[1001]
print('len of seqence', len(pseq))
print(pseq)
line = pseq[-3:-2]
print('anomalous line: ', line)
tknum = tk.texts_to_sequences(line)[0]
print('anomalous line after num conversion: ', tknum)
print('len of anomalus line after num conversion', len(tknum))

len of seqence 4
['081109 211131 1531 info dfs.datanode$dataxceiver: receiving block blk_4066061011255450889 src: /10.251.71.193:47038 dest: /10.251.71.193:50010', '081109 211131 1580 info dfs.datanode$dataxceiver: receiving block blk_4066061011255450889 src: /10.251.71.193:37194 dest: /10.251.71.193:50010', '081109 211131 30 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand/_temporary/_task_200811092030_0001_m_001316_0/part-01316. blk_4066061011255450889', '081109 211134 1580 info dfs.datanode$dataxceiver: writeblock blk_4066061011255450889 received exception java.io.ioexception: could not read from stream']
anomalous line:  ['081109 211131 1580 info dfs.datanode$dataxceiver: receiving block blk_4066061011255450889 src: /10.251.71.193:37194 dest: /10.251.71.193:50010']
anomalous line after num conversion:  [4, 12, 3, 3, 4, 21, 2, 7, 3, 3, 3, 16, 3, 2, 3, 10, 12, 4, 2, 20, 17, 22, 9, 2, 6, 22, 8, 14, 6, 11, 13, 11, 17, 9, 6, 5, 36, 6, 11, 13, 11, 41, 26, 5, 20, 33

In [46]:
pseq = pos_seqs[501]
print('len of seqence', len(pseq))
print(pseq)


len of seqence 2
['081109 205507 35 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand/_temporary/_task_200811092030_0001_m_000670_0/part-00670. blk_8554401366568742969', '081109 205507 901 info dfs.datanode$dataxceiver: receiving block blk_8554401366568742969 src: /10.251.214.67:39782 dest: /10.251.214.67:50010']


In [45]:
pseq = pos_seqs[1000]
print('len of seqence', len(pseq))
print(pseq)


len of seqence 2
['081109 211130 1528 info dfs.datanode$dataxceiver: receiving block blk_-440001254298295966 src: /10.251.30.101:37973 dest: /10.251.30.101:50010', '081109 211130 33 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand/_temporary/_task_200811092030_0001_m_001327_0/part-01327. blk_-440001254298295966']


In [48]:
pseq = pos_seqs[3000]
print('len of seqence', len(pseq))
print(pseq)
line = pseq[-3:-2]
print('anomalous line: ', line)
tknum = tk.texts_to_sequences(line)[0]
print('anomalous line after num conversion: ', tknum)
print('len of anomalus line after num conversion', len(tknum))

len of seqence 21
['081110 002208 33 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000035_0/part-00035. blk_1877206842059209018', '081110 002208 4778 info dfs.datanode$dataxceiver: receiving block blk_1877206842059209018 src: /10.250.7.230:53527 dest: /10.250.7.230:50010', '081110 002208 4888 info dfs.datanode$dataxceiver: receiving block blk_1877206842059209018 src: /10.250.7.230:49359 dest: /10.250.7.230:50010', '081110 002209 4426 info dfs.datanode$dataxceiver: receiving block blk_1877206842059209018 src: /10.250.15.67:40656 dest: /10.250.15.67:50010', '081110 002229 26 info dfs.fsnamesystem: block* namesystem.addstoredblock: blockmap updated: 10.250.7.230:50010 is added to blk_1877206842059209018 size 67108864', '081110 002229 28 info dfs.fsnamesystem: block* namesystem.addstoredblock: blockmap updated: 10.250.15.67:50010 is added to blk_1877206842059209018 size 67108864', '081110 002229 31 info dfs.fsnamesystem: bl

In [12]:
loglines = hlogs.logs

In [13]:
line_len = [len(l) for l in loglines]
print('average len of line: ',mean(line_len))
print('most frequent len of line: ', mode(line_len))
print('max len of line: ', max(line_len))

average len of line:  139.19857647386112
most frequent len of line:  142
max len of line:  320


In [14]:
l_cnt = Counter(line_len)
print('line_len  distribution: ', l_cnt)
def check_impact(Llen):
    num_of_datapoints = []
    for k,v in l_cnt.items():
        if k > Llen:
            num_of_datapoints.append(v)
    print('number of lines which will be affected: ', (num_of_datapoints))
check_impact(150)

line_len  distribution:  Counter({142: 879818, 144: 866983, 118: 807452, 161: 597148, 162: 581448, 119: 579315, 130: 577635, 143: 542696, 135: 520207, 136: 489953, 131: 481407, 141: 386080, 129: 343761, 117: 269537, 160: 267493, 140: 229223, 134: 207683, 145: 202068, 163: 182057, 137: 171304, 132: 146730, 146: 143714, 128: 141255, 171: 140138, 174: 124430, 125: 113859, 175: 112260, 139: 111106, 172: 100470, 124: 93900, 133: 91727, 126: 82295, 159: 80862, 127: 73393, 94: 59621, 95: 53255, 123: 51262, 170: 47663, 138: 47020, 116: 46582, 173: 29809, 122: 27594, 158: 14742, 176: 14617, 121: 10220, 93: 6337, 169: 4955, 120: 4791, 115: 4790, 157: 2477, 152: 1553, 153: 843, 151: 828, 92: 750, 109: 720, 156: 598, 206: 590, 207: 587, 168: 542, 114: 541, 110: 464, 180: 370, 181: 325, 155: 310, 108: 242, 150: 195, 179: 137, 154: 124, 205: 104, 182: 96, 113: 66, 91: 54, 167: 47, 178: 39, 149: 36, 107: 29, 164: 28, 90: 19, 308: 15, 310: 14, 165: 11, 204: 11, 312: 11, 309: 11, 203: 10, 148: 10, 112:

In [15]:
x_train[0]

['081111 075208 23798 info dfs.datanode$dataxceiver: receiving block blk_2613917614913604807 src: /10.251.43.210:43694 dest: /10.251.43.210:50010',
 '081111 075208 34 info dfs.fsnamesystem: block* namesystem.allocateblock: /user/root/rand6/_temporary/_task_200811101024_0013_m_001947_0/part-01947. blk_2613917614913604807',
 '081111 075209 23589 info dfs.datanode$dataxceiver: receiving block blk_2613917614913604807 src: /10.251.43.210:43962 dest: /10.251.43.210:50010',
 '081111 075209 23919 info dfs.datanode$dataxceiver: receiving block blk_2613917614913604807 src: /10.251.30.134:60664 dest: /10.251.30.134:50010',
 '081111 075247 23590 info dfs.datanode$packetresponder: packetresponder 1 for block blk_2613917614913604807 terminating',
 '081111 075247 23590 info dfs.datanode$packetresponder: received block blk_2613917614913604807 of size 67108864 from /10.251.43.210',
 '081111 075247 23799 info dfs.datanode$packetresponder: packetresponder 2 for block blk_2613917614913604807 terminating',

In [16]:
x_train_num, y_train_num, x_test_num, y_test_num = hlogs.get_train_test_data_num(ablation=10)

starting text to number conversion
ending text to number conversion: 205.1829378604889
ending padding characters: 55.43957161903381
padded_txt_to_num shape: (11175629, 150)
RAM usage:  6705377520
completed:  0
ending blk sequencing: 0.0
completed:  1000000
ending blk sequencing: 1.8177282810211182
completed:  2000000
ending blk sequencing: 3.8286449909210205
completed:  3000000
ending blk sequencing: 5.827388286590576
completed:  4000000
ending blk sequencing: 7.731710195541382
completed:  5000000
ending blk sequencing: 9.75119137763977
completed:  6000000
ending blk sequencing: 11.677578687667847
completed:  7000000
ending blk sequencing: 13.565303802490234
completed:  8000000
ending blk sequencing: 15.513335943222046
completed:  9000000
ending blk sequencing: 17.398203134536743
completed:  10000000
ending blk sequencing: 19.33232879638672
completed:  11000000
ending blk sequencing: 21.18190598487854
RAM usage:  184627341
RAM usage:  189227829
getting ablation data: 10
10 16828
length

In [17]:
x_train_num[0]

array([[ 4, 12,  3, ...,  0,  0,  0],
       [ 4, 12,  3, ...,  0,  0,  0],
       [14, 22,  8, ..., 18, 21, 16],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [18]:
x_train_num[0][0]

array([ 4, 12,  3,  3,  3,  3,  2,  3,  4,  3,  4,  3,  3,  2,  7, 18,  7,
        7,  3,  2, 20, 17, 22,  9,  2,  6, 22,  8, 14,  6, 11, 13, 11, 17,
        9,  6,  5, 36,  6, 11, 13, 11, 41, 26,  5, 20, 33,  5, 24, 28,  2,
       24,  5, 26,  5, 20, 33, 20, 17, 37,  2, 27, 23,  9, 26, 25,  2, 27,
       23, 25, 32, 35, 12,  3,  4, 15, 18, 21, 18,  4, 16,  4, 10,  7, 10,
       19, 10, 21, 18, 21, 16,  2,  8, 24, 26, 28,  2, 30,  3,  4, 14,  7,
       10,  3, 14, 19, 15, 14, 19, 21, 28, 16, 19, 12,  3,  7,  2,  6,  5,
        8, 13, 28,  2, 30,  3,  4, 14,  7, 10,  3, 14, 19, 15, 14, 19, 21,
       28, 10,  4,  4,  3,  4,  0,  0,  0,  0,  0,  0,  0,  0])

In [23]:
padded_text_to_num = hlogs.padded_txt_to_num

In [25]:
padded_text_to_num[0]

array([ 4, 12,  3,  3,  4, 21,  2,  7,  4, 16, 10,  3, 12,  2,  3, 15, 16,
        2, 20, 17, 22,  9,  2,  6, 22,  8, 14,  6, 11, 13, 11, 17,  9,  6,
        5, 36,  6, 11, 13, 11, 41, 26,  5, 20, 33,  5, 24, 28,  2, 24,  5,
       26,  5, 20, 33, 20, 17, 37,  2, 27, 23,  9, 26, 25,  2, 27, 23, 25,
       32, 35,  3, 18,  4, 12, 21, 21, 21, 18, 12, 19, 21,  3, 21, 12, 18,
        7, 21,  4, 18,  2,  8, 24, 26, 28,  2, 30,  3,  4, 14,  7, 10,  4,
       14,  3, 21, 14,  3,  4,  7, 28, 10, 15,  3,  4, 18,  2,  6,  5,  8,
       13, 28,  2, 30,  3,  4, 14,  7, 10,  4, 14,  3, 21, 14,  3,  4,  7,
       28, 10,  4,  4,  3,  4,  0,  0,  0,  0,  0,  0,  0,  0])

In [32]:
tk = hlogs.tk
tk.sequences_to_texts([padded_text_to_num[0]])

['0 8 1 1 0 9   2 0 3 5 1 8   1 4 3   i n f o   d f s . d a t a n o d e $ d a t a x c e i v e r :   r e c e i v i n g   b l o c k   b l k _ - 1 6 0 8 9 9 9 6 8 7 9 1 9 8 6 2 9 0 6   s r c :   / 1 0 . 2 5 0 . 1 9 . 1 0 2 : 5 4 1 0 6   d e s t :   / 1 0 . 2 5 0 . 1 9 . 1 0 2 : 5 0 0 1 0 UNK UNK UNK UNK UNK UNK UNK UNK']

In [35]:
loglines[0]

'081109 203518 143 info dfs.datanode$dataxceiver: receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010'

In [41]:
txt = '081109 214132 2617 warn dfs.datanode$dataxceiver: 10.251.67.4:50010'
len(txt)

67

In [None]:
df1 = hlogs.seq_of_log_nums
df1

In [447]:
d = {'a': 1, 'b':2}
d.items()

dict_items([('a', 1), ('b', 2)])