In [1]:
import re
import pandas as pd

In [2]:
# Relative path of the raw HDFS log that the rest of the notebook analyses.
file = "HDFS.log"

In [3]:
# Open the log for reading in text mode; `hdfs` is the handle later cells use.
hdfs = open(file, "r")

In [4]:
# Load every log line into memory (one string per line, newline included),
# then release the file handle straight away: nothing after this cell reads
# from `hdfs`, so keeping the descriptor open for the whole analysis is a leak.
# The `finally` guarantees the close even if the read itself fails.
# Any later `hdfs.close()` stays safe because closing a closed file is a no-op.
try:
    hdfs_list = hdfs.readlines()
finally:
    hdfs.close()

In [5]:
#for i in range(1,100):
    #print(hdfs_list[i])

## Total Number of log entries

In [6]:
# Number of lines read from the log, i.e. the number of log entries.
len(hdfs_list)

11175629

Here is an example of a log event from the HDFS log file and this shows what the event contains.

![log event breakdown](event_breakdown.png)

## Unique values for first 6 numerical digits (date)

This looks like the dates of the events. The date doesn't seem like it will help with finding anomalies if we have many entries on each date.

In [7]:
# Pull the leading six-digit field from every log line, in file order.
# `findall(...)[0]` raises IndexError on a line that does not start with six
# digits, so a malformed line fails loudly instead of being skipped.
date_pattern = re.compile('^[0-9]{6}')
first_six = [date_pattern.findall(line)[0] for line in hdfs_list]

In [8]:
# Collapse the extracted values to the distinct ones and print them
# (set order is arbitrary, so the printed order carries no meaning).
unique_first_six = list(set(first_six))
print(unique_first_six)
# Last expression: how many values were extracted in total (one per log line).
len(first_six)

['081109', '081111', '081110']


11175629

## Unique values for second 6 numerical digits (time?)

It looks like there are tons of different values here, so this probably won't be significant in determining anomalies, and we may not want to include it in our model.

In [9]:
# Capture the second six-digit field (the one following the leading
# six digits and a single space) from every log line, in file order.
# A line that does not fit the pattern raises IndexError at the `[0]`.
time_pattern = re.compile('^[0-9]{6} ([0-9]{6})')
second_six = [time_pattern.findall(line)[0] for line in hdfs_list]

In [10]:
# Count the distinct values of the second field; there are too many to print.
unique_second_six = list(set(second_six))
print(len(unique_second_six))
# Last expression: total number of extracted values (one per log line).
len(second_six)

85053


11175629

## Unique values for third number (code)

There are a lot of different codes here. We will have to count through these to see if there are any that are rare. 

We may want to convert these to integers.

In [11]:
# Capture the third whitespace-separated numeric field of every log line.
# The values are kept as strings; no integer conversion happens here.
#
# The pattern is a raw string: in a normal string literal `\d` is an invalid
# escape sequence, which newer Python versions warn about. The compiled regex
# is identical to the one the non-raw literal produced.
# `\d*` may match the empty string, so a line with two spaces after the time
# field would yield '' rather than raise.
code_pattern = re.compile(r'^[0-9]{6} [0-9]{6} (\d*) ')
third_num = [code_pattern.findall(line)[0] for line in hdfs_list]

In [12]:
# Count the distinct values of the third field.
unique_third_num = list(set(third_num))
print(len(unique_third_num))
# Last expression: total number of extracted values (one per log line).
len(third_num)

27799


11175629

## Unique values for first text statement (message)

In [13]:
# Take the first run of exactly four capital letters that is surrounded by
# single spaces on each log line. The pattern is fixed at four letters, so a
# five-letter token (e.g. ERROR) would not be captured by it.
# A line with no such token raises IndexError at the `[0]`.
level_pattern = re.compile(' ([A-Z][A-Z][A-Z][A-Z]) ')
caps = [level_pattern.findall(line)[0] for line in hdfs_list]

In [14]:
# Distinct four-letter tokens found, followed by the total number extracted.
unique_caps = list(set(caps))
print(unique_caps)
print(len(caps))

['INFO', 'WARN']
11175629


## Unique values for "dfs....:" statement (event type)

There are only 9 of these codes. These may be important for finding anomalies.

I think we could make a dataframe for each of these 9 codes; then we can find the specific information that is in each one.

In [15]:
# Extract the first "dfs.<something>:" token from every log line, keeping the
# trailing colon as part of the value.
#
# The pattern is a raw string: `\.` and `\S` are invalid escape sequences in a
# normal string literal and trigger a warning on newer Python versions. The
# compiled regex is unchanged.
# A line without such a token raises IndexError at the `[0]`.
component_pattern = re.compile(r'dfs\.\S*:')
dfs = [component_pattern.findall(line)[0] for line in hdfs_list]

In [16]:
# Number of distinct "dfs...:" tokens, the tokens themselves, and the total
# number of extracted values.
unique_dfs = list(set(dfs))
print(len(unique_dfs))
print(unique_dfs)
print(len(dfs))

9
['dfs.PendingReplicationBlocks$PendingReplicationMonitor:', 'dfs.DataNode$BlockReceiver:', 'dfs.FSDataset:', 'dfs.DataBlockScanner:', 'dfs.DataNode:', 'dfs.FSNamesystem:', 'dfs.DataNode$PacketResponder:', 'dfs.DataNode$DataXceiver:', 'dfs.DataNode$DataTransfer:']
11175629


## Labels information

It looks like the blocks are labelled, and each log entry contains at least one block. It looks like we will have to somehow find out which blocks are anomalies and which ones are normal.

When do we determine if a block is normal or an anomaly?

Blocks must appear in more than one event (there are far more events than anomaly labels), so are we finding events that are anomalies, or blocks that are anomalies?

In [17]:
# Load the per-block labels from the CSV into a DataFrame.
labels = pd.read_csv("anomaly_label.csv")

In [18]:
# Preview the first five rows of the label table.
labels.head()

Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal


In [19]:
# Summarise row count, column dtypes and non-null counts of the label table.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575061 entries, 0 to 575060
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   BlockId  575061 non-null  object
 1   Label    575061 non-null  object
dtypes: object(2)
memory usage: 8.8+ MB


## Finding the events with more than one block named in it:

This cell shows how many events mention more than one block. We can see that there are 1402056 events that mention more than one block. This doesn't check whether the multiple blocks mentioned are identical to each other or not.

In [52]:
# For every log line, collect all "blk_..." tokens and keep the token list
# only when the line contains more than one of them. Duplicates are NOT
# removed here, so a line naming the same block twice is still counted.
#
# The pattern is a raw string: `\S` is an invalid escape sequence in a normal
# string literal and triggers a warning on newer Python versions. The compiled
# regex is unchanged.
block_pattern = re.compile(r'blk_\S*')
multiple_blocks_test = [
    found for found in map(block_pattern.findall, hdfs_list) if len(found) > 1
]

In [57]:
# Number of log lines that contain more than one "blk_..." token.
len(multiple_blocks_test)

1402056

In [56]:
# Show nine sample entries (indices 1 through 9; the slice skips index 0).
multiple_blocks_test[1:10]

[['blk_-6899869435641005946', 'blk_-6899869435641005946'],
 ['blk_-8191677345482862686', 'blk_-8191677345482862686'],
 ['blk_-919439116365725304', 'blk_-919439116365725304'],
 ['blk_3557914126063085372', 'blk_3557914126063085372'],
 ['blk_3584224065406961324', 'blk_3584224065406961324'],
 ['blk_349284099419601276', 'blk_349284099419601276'],
 ['blk_-7057732666118938934', 'blk_-7057732666118938934'],
 ['blk_-5410886886439711883', 'blk_-5410886886439711883'],
 ['blk_-6739860488313725269', 'blk_-6739860488313725269']]

We can see from the cell above that when multiple blocks are mentioned, it looks like they are duplicates.

In the cell below we will check how many events that mention more than one block actually mention different unique blocks. We can see that there are no lines that mention more than one unique block, so we can focus on only the first block that is mentioned in each event (since any additional blocks mentioned are the same as the first one).

In [58]:
# Keep the raw log lines that mention at least two DIFFERENT "blk_..." tokens.
# A line qualifies exactly when the set of its tokens has more than one
# member, which is the same condition as "some token differs from the first
# one" but without the manual flag variable and inner loop.
#
# The pattern is a raw string: `\S` is an invalid escape sequence in a normal
# string literal and triggers a warning on newer Python versions. The compiled
# regex is unchanged.
block_pattern = re.compile(r'blk_\S*')
multiple_blocks = [
    line
    for line in hdfs_list
    if len(set(block_pattern.findall(line))) > 1
]

In [59]:
# Number of log lines that mention more than one distinct block token.
len(multiple_blocks)

0

## Determining the number of unique blocks mentioned in this entire HDFS file (block id's):

In [60]:
# Record the first "blk_..." token of every log line, in file order.
# A line with no such token raises IndexError at the `[0]`.
#
# The pattern is a raw string: `\S` is an invalid escape sequence in a normal
# string literal and triggers a warning on newer Python versions. The compiled
# regex is unchanged.
#
# NOTE(review): `\S*` takes every non-space character after "blk_", so any
# punctuation glued to an ID (for example a trailing period) would become part
# of the token and could inflate the number of distinct blocks. A stricter
# pattern such as r'blk_-?\d+' may be what is wanted - confirm against the
# data before changing it, since that would alter the counts.
block_pattern = re.compile(r'blk_\S*')
blocks = [block_pattern.findall(line)[0] for line in hdfs_list]

In [63]:
# Compare three sizes: extracted block tokens (one per log line), distinct
# block tokens, and rows in the label table.
unique_blocks = list(set(blocks))
print(len(blocks))
print(len(unique_blocks))
print(len(labels))

11175629
580406
575061


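We can see from the cell above that the number of unique blocks in the HDFS file (580,406) is within about 5,000 of the number of labelled blocks in the `anomaly_label.csv` file (575,061); the exact difference is 5,345. One possible cause worth checking is that the `blk_\S*` pattern also captures punctuation attached to a block ID, which would make the same block appear under more than one spelling.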

In [64]:
# Release the log file handle (calling close on an already-closed file is harmless).
hdfs.close()