In [1]:
import re
import pandas as pd

In [2]:
# Path of the raw HDFS log, resolved relative to the notebook's working directory.
file = "HDFS.log"

In [3]:
# Open the log for reading in text mode; the handle is consumed by the next cell.
hdfs = open(file, "r")

In [4]:
# Read every log line into memory, then release the file handle immediately
# instead of holding it open for the rest of the session. Closing a file that
# is already closed is a no-op, so the final `hdfs.close()` cell still works.
# Re-running this cell without re-opening the file now fails loudly rather
# than silently producing an empty list.
with hdfs:
    hdfs_list = hdfs.readlines()

In [5]:
#for i in range(1,100):
    #print(hdfs_list[i])

## Total Number of log entries

In [6]:
# Number of lines (log events) read from the file.
len(hdfs_list)

11175629

Here is an example of a log event from the HDFS log file and this shows what the event contains.

![log event breakdown](event_breakdown.png)

## Unique values for first 6 numerical digits (date)

These look like the dates of the events. The date doesn't seem like it will help with finding anomalies if we have many entries on each date.

In [7]:
# Leading six-digit field of every log line; a line that does not start with
# six digits raises IndexError.
first_six = [re.findall('^[0-9]{6}', line)[0] for line in hdfs_list]

In [8]:
# Distinct values of the first field, followed by a check that one value was
# extracted per log line.
unique_first_six = list(set(first_six))
print(unique_first_six)
len(first_six)

['081109', '081110', '081111']


11175629

## Unique values for second 6 numerical digits (time?)

It looks like there are tons of different values here, so this probably won't be significant in determining anomalies and we may not want to include it in our model.

In [9]:
# Second six-digit field of every log line (the one after the leading date
# field); a line that does not fit the pattern raises IndexError.
second_six = [re.findall('^[0-9]{6} ([0-9]{6})', line)[0] for line in hdfs_list]

In [10]:
# Number of distinct second-field values, then the number of lines parsed.
unique_second_six = list(set(second_six))
print(len(unique_second_six))
len(second_six)

85053


11175629

## Unique values for third number (code)

There are a lot of different codes here. We will have to count through these to see if there are any that are rare. 

We may want to convert these to integers.

In [11]:
# Third numeric field of every log line, kept as a string.
# The pattern is a raw string so `\d` is passed to the regex engine untouched
# (a non-raw '\d' is an invalid string escape and warns on current Python).
third_num = []
for line in hdfs_list:
    third_num.append(re.findall(r'^[0-9]{6} [0-9]{6} (\d*) ', line)[0])

In [12]:
# Number of distinct third-field values, then the number of lines parsed.
unique_third_num = list(set(third_num))
print(len(unique_third_num))
len(third_num)

27799


11175629

## Unique values for first text statement (message)

In [13]:
# First space-delimited run of exactly four capital letters on each line;
# a line without one raises IndexError.
caps = [re.findall(' ([A-Z][A-Z][A-Z][A-Z]) ', line)[0] for line in hdfs_list]

In [14]:
# Distinct four-capital tokens found, then the number of lines parsed.
unique_caps = list(set(caps))
print(unique_caps)
print(len(caps))

['WARN', 'INFO']
11175629


## Unique values for "dfs....:" statement (event type)

There are only 9 of these codes. These may be important for finding anomalies.

I think we could make a dataframe for each of these 9 codes, then we can find the specific information that is in each one.

In [15]:
# "dfs.<Component>:" token of every log line (the event type).
# Raw string: '\.' and '\S' are regex escapes, not Python string escapes.
dfs = []
for line in hdfs_list:
    dfs.append(re.findall(r'dfs\.\S*:', line)[0])

In [16]:
# Distinct event types: how many there are, what they are, and the number of
# lines parsed.
unique_dfs = list(set(dfs))
print(len(unique_dfs))
print(unique_dfs)
print(len(dfs))

9
['dfs.DataNode:', 'dfs.PendingReplicationBlocks$PendingReplicationMonitor:', 'dfs.DataNode$BlockReceiver:', 'dfs.FSNamesystem:', 'dfs.DataNode$DataXceiver:', 'dfs.DataBlockScanner:', 'dfs.FSDataset:', 'dfs.DataNode$DataTransfer:', 'dfs.DataNode$PacketResponder:']
11175629


## Labels information

It looks like the blocks are labelled, and each log entry contains at least one block. It looks like we will have to somehow find out which blocks are anomalies and which ones are normal.

When do we determine if a block is normal or an anomaly?

Blocks must be in more than one event (there are a lot more events than anomaly labels), so are we finding events that are anomalies? Or blocks that are anomalies?

In [17]:
# Load the block labels from the CSV in the working directory.
labels = pd.read_csv("anomaly_label.csv")

In [18]:
# Preview the first rows of the label table.
labels.head()

Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal


In [19]:
# Count the blocks labelled "Anomaly" with the vectorised Series.sum()
# rather than Python's built-in sum(), which walks the Series element by element.
num_anomaly = (labels['Label'] == "Anomaly").sum()

In [20]:
# Count the blocks labelled "Normal" with the vectorised Series.sum()
# rather than Python's built-in sum(), which walks the Series element by element.
num_normal = (labels['Label'] == "Normal").sum()

In [21]:
# Ratio of anomalous to normal blocks (not the anomaly share of all blocks).
num_anomaly/num_normal

0.030163572622410758

In [22]:
# Column dtypes, non-null counts and memory footprint of the label table.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575061 entries, 0 to 575060
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   BlockId  575061 non-null  object
 1   Label    575061 non-null  object
dtypes: object(2)
memory usage: 8.8+ MB


## Finding the events with more than one block named in it:

This cell shows how many events mention more than one block. We can see that there are 1402056 events that mention more than one block. This doesn't check whether the multiple blocks mentioned are identical to each other or not.

In [23]:
# Collect the block-id matches of every line that mentions "blk_..." more
# than once. Raw string: '\S' is a regex escape, not a Python string escape.
multiple_blocks_test = []
for line in hdfs_list:
    block_ids = re.findall(r'blk_\S*', line)
    if len(block_ids) > 1:
        multiple_blocks_test.append(block_ids)

In [24]:
# Number of events with more than one block-id match.
len(multiple_blocks_test)

1402056

In [25]:
# Peek at nine of the multi-match events (indices 1-9; the slice skips index 0).
multiple_blocks_test[1:10]

[['blk_-6899869435641005946', 'blk_-6899869435641005946'],
 ['blk_-8191677345482862686', 'blk_-8191677345482862686'],
 ['blk_-919439116365725304', 'blk_-919439116365725304'],
 ['blk_3557914126063085372', 'blk_3557914126063085372'],
 ['blk_3584224065406961324', 'blk_3584224065406961324'],
 ['blk_349284099419601276', 'blk_349284099419601276'],
 ['blk_-7057732666118938934', 'blk_-7057732666118938934'],
 ['blk_-5410886886439711883', 'blk_-5410886886439711883'],
 ['blk_-6739860488313725269', 'blk_-6739860488313725269']]

We can see from the cell above that when multiple blocks are mentioned, it looks like they are duplicates.

In the cell below we will check how many events that mention more than one block actually mention different unique blocks. We can see that there are no lines that mention more than one unique block, so we can focus on only the first block that is mentioned in each event (since any additional blocks mentioned are the same as the first one).

In [26]:
# Keep the lines whose block-id matches are not all identical, i.e. lines that
# mention at least two different blocks. More than one distinct match is
# exactly "some match differs from the first one", so the set-size check
# replaces the manual flag loop. Raw string for the regex escape '\S'.
multiple_blocks = []
for line in hdfs_list:
    block_ids = re.findall(r'blk_\S*', line)
    if len(set(block_ids)) > 1:
        multiple_blocks.append(line)

In [27]:
# Number of events that mention two or more different blocks.
len(multiple_blocks)

0

## Determining the number of unique blocks mentioned in this entire HDFS file (block id's):

In [28]:
# First "blk_..." token of every log line; a line without one raises
# IndexError. Raw string: '\S' is a regex escape, not a Python string escape.
blocks = []
for line in hdfs_list:
    blocks.append(re.findall(r'blk_\S*', line)[0])

In [29]:
# Compare three sizes: lines parsed, distinct block-id strings in the log,
# and rows in the label table.
unique_blocks = list(set(blocks))
print(len(blocks))
print(len(unique_blocks))
print(len(labels))

11175629
580406
575061


We can see from the cell above that the number of unique blocks in the HDFS file is within about 5000 of the number of labelled blocks in the `anomaly_label.csv` file.

## Planned data wrangling:
- Determine how many blocks have each attribute (how many INFO vs. WARN, how many of each event type, how many with each numerical code)
- Onehot encode the block logs event details for each type of log event (start with 9 dataframes)
- Combine the 9 dataframes using the block ID's to get one large dataframe with the following structure:

![blocks_df](blocks_df.png)

In [30]:
# Re-display the event types that the per-type lists are built from.
unique_dfs

['dfs.DataNode:',
 'dfs.PendingReplicationBlocks$PendingReplicationMonitor:',
 'dfs.DataNode$BlockReceiver:',
 'dfs.FSNamesystem:',
 'dfs.DataNode$DataXceiver:',
 'dfs.DataBlockScanner:',
 'dfs.FSDataset:',
 'dfs.DataNode$DataTransfer:',
 'dfs.DataNode$PacketResponder:']

## Creating a list of "dfs.FSNamesystem:" events

In [31]:
# Every log line containing the literal text "dfs.FSNamesystem:".
df_0_list = [line for line in hdfs_list if 'dfs.FSNamesystem:' in line]

In [32]:
# Number of "dfs.FSNamesystem:" events.
len(df_0_list)

3700245

## Creating a list of "dfs.DataBlockScanner:" events

In [33]:
# Every log line containing the literal text "dfs.DataBlockScanner:".
df_1_list = [line for line in hdfs_list if 'dfs.DataBlockScanner:' in line]

In [34]:
# Number of "dfs.DataBlockScanner:" events.
len(df_1_list)

120046

## Creating a list of "dfs.FSDataset:" events

In [35]:
# Every log line containing the literal text "dfs.FSDataset:".
df_2_list = [line for line in hdfs_list if 'dfs.FSDataset:' in line]

len(df_2_list)

1407597

## Creating a list of "dfs.PendingReplicationBlocks$PendingReplicationMonitor:" events

In [36]:
# Every log line containing the PendingReplicationMonitor event-type text.
df_3_list = [
    line for line in hdfs_list
    if 'dfs.PendingReplicationBlocks$PendingReplicationMonitor:' in line
]

len(df_3_list)

47

## Creating a list of "dfs.DataNode$DataTransfer:" events

In [37]:
# Every log line containing the literal text "dfs.DataNode$DataTransfer:".
df_4_list = [line for line in hdfs_list if 'dfs.DataNode$DataTransfer:' in line]

len(df_4_list)

6946

## Creating a list of "dfs.DataNode:" events

In [38]:
# Every log line containing the literal text "dfs.DataNode:". The trailing
# colon keeps the "dfs.DataNode$..." event types out of this list.
df_5_list = [line for line in hdfs_list if 'dfs.DataNode:' in line]

len(df_5_list)

7002

## Creating a list of "dfs.DataNode$DataXceiver:" events

In [39]:
# Every log line containing the literal text "dfs.DataNode$DataXceiver:".
df_6_list = [line for line in hdfs_list if 'dfs.DataNode$DataXceiver:' in line]

len(df_6_list)

2518678

## Create a list of "dfs.DataNode$BlockReceiver:" events

In [40]:
# Every log line containing the literal text "dfs.DataNode$BlockReceiver:".
df_7_list = [line for line in hdfs_list if 'dfs.DataNode$BlockReceiver:' in line]

len(df_7_list)

1718

## Create a list of "dfs.DataNode$PacketResponder:" events

In [41]:
# Every log line containing the literal text "dfs.DataNode$PacketResponder:".
df_8_list = [line for line in hdfs_list if 'dfs.DataNode$PacketResponder:' in line]

len(df_8_list)

3413350

In [42]:
# Sanity check: the nine per-event-type lists together should add up to the
# number of lines in the log.
sum(map(len, (df_0_list, df_1_list, df_2_list, df_3_list, df_4_list,
              df_5_list, df_6_list, df_7_list, df_8_list)))

11175629

In [43]:
# Reference value for the check above: total number of log lines.
len(hdfs_list)

11175629

## Create the first dataframe, for the first type of event

In [44]:
# Number of FSNamesystem events (uncomment the next line to inspect a sample line).
#df_0_list[0]
len(df_0_list)

3700245

The columns for this event will be:
- first 6 digits (date)
- second 6 digits
- third set of numerical digits
- INFO/WARN
- dfs.... (event type)
- block_id
- Block_asterisk


In [45]:
# Occurrence count of every whitespace-separated token across the
# FSNamesystem events.
word_count_0 = {}
for line in df_0_list:
    for token in line.split():
        word_count_0[token] = word_count_0.get(token, 0) + 1

#word_count_0

In [46]:
# Occurrences of the token "WARN" per FSNamesystem event.
word_count_0['WARN']/len(df_0_list)

0.0002634960658010483

In [47]:
# Tokens whose total count equals the number of events of this type
# (i.e. candidates for appearing once in every event).
n_events = len(df_0_list)
for token, occurrences in word_count_0.items():
    if occurrences == n_events:
        print(token)

dfs.FSNamesystem:
BLOCK*


In [48]:
# First token after "BLOCK* " for every FSNamesystem event.
# A line without that token contributes None instead of being skipped, so the
# list always has exactly one entry per event and stays aligned with the other
# df_0 column lists when they are combined into a dataframe.
# The two unused scratch lists of the original cell are dropped, and the
# pattern is a raw string because '\*' and '\S' are regex escapes.
df_0_blocks_asterisk = []
for line in df_0_list:
    found = re.findall(r'BLOCK\* (\S*)', line)
    df_0_blocks_asterisk.append(found[0] if found else None)


In [49]:
# Distinct "BLOCK*" sub-types found in the FSNamesystem events.
#unique_blocks_asterisk = list(set(df_0_blocks_asterisk))
#unique_blocks_asterisk
set(df_0_blocks_asterisk)

{'NameSystem.addStoredBlock:',
 'NameSystem.allocateBlock:',
 'NameSystem.delete:',
 'Removing',
 'ask'}

# Create lists of all the columns in the df_0 dataframe

In [50]:
# First "blk_..." token of each FSNamesystem event (raw-string regex).
df_0_block_id = []
for line in df_0_list:
    df_0_block_id.append(re.findall(r'blk_\S*', line)[0])
len(df_0_block_id)

3700245

In [51]:
# First whitespace-delimited token of each FSNamesystem event (raw-string regex).
df_0_first_digits = []
for line in df_0_list:
    df_0_first_digits.append(re.findall(r'\S*', line)[0])
len(df_0_first_digits)

3700245

In [52]:
# Second space-delimited token of each FSNamesystem event (raw-string regex).
df_0_second_digits = []
for line in df_0_list:
    df_0_second_digits.append(re.findall(r'\S* (\S*)', line)[0])
len(df_0_second_digits)

3700245

In [53]:
# Third space-delimited token of each FSNamesystem event (raw-string regex).
df_0_third_digits = []
for line in df_0_list:
    df_0_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])
len(df_0_third_digits)

3700245

In [54]:
# First run of four capital letters in each FSNamesystem event.
df_0_caps = [re.findall('[A-Z]{4}', line)[0] for line in df_0_list]
len(df_0_caps)

3700245

In [55]:
# "dfs.<Component>:" token of each FSNamesystem event (raw-string regex).
dfs_0 = []
for line in df_0_list:
    dfs_0.append(re.findall(r'dfs\.\S*:', line)[0])

len(dfs_0)

3700245

In [56]:
# First "blk_..." token of each FSNamesystem event; this is the list used as
# the block_id column of the df_0 dataframe. Raw-string regex.
block_id_0 = []
for line in df_0_list:
    block_id_0.append(re.findall(r'blk_\S*', line)[0])

In [57]:
# Column name -> per-event values for the FSNamesystem dataframe.
dict_0 = dict(
    block_id=block_id_0,
    first_digits=df_0_first_digits,
    second_digits=df_0_second_digits,
    third_digits=df_0_third_digits,
    message=df_0_caps,
    event_type=dfs_0,
    block_asterisk=df_0_blocks_asterisk,
)

In [58]:
# One row per FSNamesystem event; all column lists must have equal length.
df_0 = pd.DataFrame(dict_0)

In [59]:
# Preview the first rows of the FSNamesystem dataframe.
df_0.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk
0,blk_-1608999687919862906,81109,203518,35,INFO,dfs.FSNamesystem:,NameSystem.allocateBlock:
1,blk_-1608999687919862906,81109,203519,29,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:
2,blk_-1608999687919862906,81109,203519,30,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:
3,blk_-1608999687919862906,81109,203519,31,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:
4,blk_7503483334202473044,81109,203520,26,INFO,dfs.FSNamesystem:,NameSystem.allocateBlock:


## Creating the dataframe for df_1 type events

In [60]:
# Sample DataBlockScanner event.
df_1_list[0]

'081109 203552 13 INFO dfs.DataBlockScanner: Verification succeeded for blk_-9073992586687739851\n'

In [61]:
# Occurrence count of every whitespace-separated token across the
# DataBlockScanner events.
word_count_1 = {}
for line in df_1_list:
    for token in line.split():
        word_count_1[token] = word_count_1.get(token, 0) + 1

In [62]:
# Number of DataBlockScanner events, for comparison with the token counts.
len(df_1_list)

120046

In [63]:
# Tokens that occur more than 100,000 times across the df_1_list events.
# This is a "nearly every event" threshold, not an "every event" test: the
# comparison is against the constant 100000, not len(df_1_list).
for key in word_count_1:
    if word_count_1[key] > 100000 :
        print(key)

13
INFO
dfs.DataBlockScanner:
Verification
succeeded
for


In [64]:
# alternatives to 'Verification'
other_list = []
for i in df_1_list:
    if 'Verification' not in i:
        other_list.append(i)


In [65]:
# Show the non-'Verification' events.
other_list

['081111 043936 19784 WARN dfs.DataBlockScanner: Adding an already existing block blk_-2074647664485597823\n',
 '081111 043936 20189 WARN dfs.DataBlockScanner: Adding an already existing block blk_-2074647664485597823\n',
 '081111 043955 19904 WARN dfs.DataBlockScanner: Adding an already existing block blk_-7052911997539087826\n',
 '081111 043956 15685 WARN dfs.DataBlockScanner: Adding an already existing block blk_-7052911997539087826\n',
 '081111 070155 22669 WARN dfs.DataBlockScanner: Adding an already existing block blk_5697572983288390847\n',
 '081111 070155 22810 WARN dfs.DataBlockScanner: Adding an already existing block blk_5697572983288390847\n',
 '081111 073825 23413 WARN dfs.DataBlockScanner: Adding an already existing block blk_9176263204797010146\n',
 '081111 073825 23432 WARN dfs.DataBlockScanner: Adding an already existing block blk_9176263204797010146\n',
 '081111 073835 23186 WARN dfs.DataBlockScanner: Adding an already existing block blk_6647747942787068087\n',
 '0811

In [66]:
# DataBlockScanner events whose text contains 'Verification'.
ver_list = [line for line in df_1_list if 'Verification' in line]

In [67]:
# The two complementary filters should partition the event list.
len(other_list) + len(ver_list) == len(df_1_list)

True

The columns for the df_1 dataframe will be:
- block_id
- first 6 digits (date)
- second 6 digits
- third set of numerical digits
- INFO/WARN
- dfs.... (event type)
- block_id
- ver_add (verification or adding)

In [68]:
# Creating the column lists

df_1_block_id = []
df_1_first_digits = []
df_1_second_digits = []
df_1_third_digits = []
df_1_caps = []
dfs_1 = []
add_ver_1 = []

# One pass over the DataBlockScanner events, one regex per column; each `[0]`
# raises IndexError if a line lacks the field. Values are appended directly,
# so the notebook-level `caps` and `dfs` lists built earlier are no longer
# rebound to per-line strings. Patterns with backslashes are raw strings.
for line in df_1_list:
    df_1_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_1_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_1_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_1_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_1_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_1.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type
    add_ver_1.append(re.findall(r'dfs\.\S*: (\S*)', line)[0])        # first message word

In [69]:
# Column name -> per-event values, then the DataBlockScanner dataframe.
dict_1 = dict(
    block_id=df_1_block_id,
    first_digits=df_1_first_digits,
    second_digits=df_1_second_digits,
    third_digits=df_1_third_digits,
    message=df_1_caps,
    event_type=dfs_1,
    add_ver=add_ver_1,
)

df_1 = pd.DataFrame(dict_1)
df_1.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,add_ver
0,blk_-9073992586687739851,81109,203552,13,INFO,dfs.DataBlockScanner:,Verification
1,blk_-2917825689581470793,81109,203709,13,INFO,dfs.DataBlockScanner:,Verification
2,blk_-6370470857048627387,81109,203712,13,INFO,dfs.DataBlockScanner:,Verification
3,blk_-1470784088028862059,81109,203716,13,INFO,dfs.DataBlockScanner:,Verification
4,blk_-1548346834251221499,81109,203719,13,INFO,dfs.DataBlockScanner:,Verification


## Creating the dataframe for dfs_2 event type

In [70]:
# Sample FSDataset event.
df_2_list[0]

'081109 204524 19 INFO dfs.FSDataset: Deleting block blk_-8213344449220111733 file /mnt/hadoop/dfs/data/current/subdir39/blk_-8213344449220111733\n'

In [71]:
# Occurrence count of every whitespace-separated token across the
# FSDataset events.
word_count_2 = {}
for line in df_2_list:
    for token in line.split():
        word_count_2[token] = word_count_2.get(token, 0) + 1

In [72]:
# Tokens that occur more than 1,000,000 times across the df_2_list events.
# This is a "nearly every event" threshold, not an "every event" test: the
# comparison is against the constant 1000000, not len(df_2_list).
for key in word_count_2:
    if word_count_2[key] > 1000000:
        print(key)

19
INFO
dfs.FSDataset:
Deleting
block
file


In [73]:
# FSDataset events whose text does not contain 'Deleting'.
other = [line for line in df_2_list if 'Deleting' not in line]

In [74]:
# Print the non-'Deleting' events that do not contain 'Unexpected' either.
# (The unused `count` variable of the original cell is dropped.)
for line in other:
    if 'Unexpected' not in line:
        print(line)

081110 014606 5794 INFO dfs.FSDataset: Reopen Block blk_7008279672769077211

081111 043955 15685 INFO dfs.FSDataset: Reopen Block blk_-7052911997539087826

081111 043955 19904 INFO dfs.FSDataset: Reopen Block blk_-7052911997539087826

081111 071032 22899 INFO dfs.FSDataset: Reopen Block blk_8006271611835981128

081111 071111 22949 INFO dfs.FSDataset: Reopen Block blk_-6363674043695218814



In [75]:
# Number of FSDataset events.
len(df_2_list)

1407597

In [76]:
# First word after the "dfs.<Component>:" token of each FSDataset event, and
# the distinct values it takes. Raw-string regex ('\.' and '\S' are regex escapes).
first_word_list = []
for line in df_2_list:
    first_word_list.append(re.findall(r'dfs\.\S*: (\S*)', line)[0])

set(first_word_list)

{'Deleting', 'Reopen', 'Unexpected'}

In [77]:
# Creating the column lists

df_2_block_id = []
df_2_first_digits = []
df_2_second_digits = []
df_2_third_digits = []
df_2_caps = []
dfs_2 = []
dru_2 = []

# One pass over the FSDataset events, one regex per column; each `[0]` raises
# IndexError if a line lacks the field. Values are appended directly, so the
# notebook-level `caps` and `dfs` lists built earlier are no longer rebound to
# per-line strings. Patterns with backslashes are raw strings.
for line in df_2_list:
    df_2_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_2_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_2_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_2_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_2_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_2.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type
    dru_2.append(re.findall(r'dfs\.\S*: (\S*)', line)[0])            # first message word

In [78]:
# Distinct first message words among the FSDataset events.
set(dru_2)

{'Deleting', 'Reopen', 'Unexpected'}

In [79]:
# Column name -> per-event values, then the FSDataset dataframe.
dict_2 = dict(
    block_id=df_2_block_id,
    first_digits=df_2_first_digits,
    second_digits=df_2_second_digits,
    third_digits=df_2_third_digits,
    message=df_2_caps,
    event_type=dfs_2,
    dru=dru_2,
)

df_2 = pd.DataFrame(dict_2)
df_2.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,dru
0,blk_-8213344449220111733,81109,204524,19,INFO,dfs.FSDataset:,Deleting
1,blk_-6899869435641005946,81109,204600,19,INFO,dfs.FSDataset:,Deleting
2,blk_-8191677345482862686,81109,204603,19,INFO,dfs.FSDataset:,Deleting
3,blk_-919439116365725304,81109,204604,19,INFO,dfs.FSDataset:,Deleting
4,blk_3557914126063085372,81109,204623,19,INFO,dfs.FSDataset:,Deleting


## Create df_3 dataframe

All words after the event type are the same for every event in this list, so we will omit that as a column.

In [80]:
# Sample PendingReplicationMonitor event.
df_3_list[0]

'081110 104936 16 WARN dfs.PendingReplicationBlocks$PendingReplicationMonitor: PendingReplicationMonitor timed out block blk_-5057834626410636236\n'

In [81]:
# Creating the column lists

df_3_block_id = []
df_3_first_digits = []
df_3_second_digits = []
df_3_third_digits = []
df_3_caps = []
dfs_3 = []

# One pass over the PendingReplicationMonitor events, one regex per column;
# each `[0]` raises IndexError if a line lacks the field. Values are appended
# directly, so the notebook-level `caps` and `dfs` lists built earlier are no
# longer rebound to per-line strings. Patterns with backslashes are raw strings.
for line in df_3_list:
    df_3_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_3_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_3_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_3_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_3_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_3.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type

In [82]:
# Column name -> per-event values, then the PendingReplicationMonitor dataframe.
dict_3 = dict(
    block_id=df_3_block_id,
    first_digits=df_3_first_digits,
    second_digits=df_3_second_digits,
    third_digits=df_3_third_digits,
    message=df_3_caps,
    event_type=dfs_3,
)

df_3 = pd.DataFrame(dict_3)
df_3.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type
0,blk_-5057834626410636236,81110,104936,16,WARN,dfs.PendingReplicationBlocks$PendingReplicatio...
1,blk_-2526833798441848968,81110,212435,16,WARN,dfs.PendingReplicationBlocks$PendingReplicatio...
2,blk_-2610256770731596149,81110,212435,16,WARN,dfs.PendingReplicationBlocks$PendingReplicatio...
3,blk_-6538891058531862642,81110,212435,16,WARN,dfs.PendingReplicationBlocks$PendingReplicatio...
4,blk_-6836377952319775628,81110,212435,16,WARN,dfs.PendingReplicationBlocks$PendingReplicatio...


## Create df4 dataframe

In [83]:
# First ten DataTransfer events.
df_4_list[0:10]

['081109 203521 147 INFO dfs.DataNode$DataTransfer: 10.250.14.224:50010:Transmitted block blk_-1608999687919862906 to /10.251.215.16:50010\n',
 '081109 203526 152 INFO dfs.DataNode$DataTransfer: 10.251.215.16:50010:Transmitted block blk_-1608999687919862906 to /10.251.74.79:50010\n',
 '081109 203527 148 INFO dfs.DataNode$DataTransfer: 10.251.107.19:50010:Transmitted block blk_-1608999687919862906 to /10.251.31.5:50010\n',
 '081109 203532 161 INFO dfs.DataNode$DataTransfer: 10.251.31.5:50010:Transmitted block blk_-1608999687919862906 to /10.251.90.64:50010\n',
 '081109 204522 528 INFO dfs.DataNode$DataTransfer: 10.251.43.21:50010:Transmitted block blk_-8213344449220111733 to /10.251.30.134:50010\n',
 '081109 204557 537 INFO dfs.DataNode$DataTransfer: 10.251.39.64:50010:Transmitted block blk_-6899869435641005946 to /10.250.7.230:50010\n',
 '081109 204600 560 INFO dfs.DataNode$DataTransfer: 10.251.123.20:50010:Transmitted block blk_-8191677345482862686 to /10.251.202.209:50010\n',
 '08110

In [84]:
# Occurrence count of every whitespace-separated token across the
# DataTransfer events.
word_count_4 = {}
for line in df_4_list:
    for token in line.split():
        word_count_4[token] = word_count_4.get(token, 0) + 1

In [85]:
# Word that follows the "<ip>:<port>:" prefix in each DataTransfer message.
# Raw-string regex ('\.', '\S' and '\d' are regex escapes).
word = []
for line in df_4_list:
    word.append(re.findall(r'dfs\.\S*: [\d(.)]*:\d*:(\S*)', line)[0])

In [86]:
# Distinct words found after the address prefix.
set(word)

{'Failed', 'Transmitted'}

In [87]:
# Creating the column lists

df_4_block_id = []
df_4_first_digits = []
df_4_second_digits = []
df_4_third_digits = []
df_4_caps = []
dfs_4 = []
df_4_trans_fail = []

# One pass over the DataTransfer events, one regex per column; each `[0]`
# raises IndexError if a line lacks the field. Values are appended directly,
# so the notebook-level `caps` and `dfs` lists built earlier are no longer
# rebound to per-line strings. Patterns with backslashes are raw strings.
for line in df_4_list:
    df_4_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_4_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_4_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_4_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_4_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_4.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type
    # word right after the "<ip>:<port>:" prefix of the message
    df_4_trans_fail.append(re.findall(r'dfs\.\S*: [\d(.)]*:\d*:(\S*)', line)[0])

In [88]:
# Column name -> per-event values, then the DataTransfer dataframe.
dict_4 = dict(
    block_id=df_4_block_id,
    first_digits=df_4_first_digits,
    second_digits=df_4_second_digits,
    third_digits=df_4_third_digits,
    message=df_4_caps,
    event_type=dfs_4,
    trans_fail=df_4_trans_fail,
)

df_4 = pd.DataFrame(dict_4)
df_4.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,trans_fail
0,blk_-1608999687919862906,81109,203521,147,INFO,dfs.DataNode$DataTransfer:,Transmitted
1,blk_-1608999687919862906,81109,203526,152,INFO,dfs.DataNode$DataTransfer:,Transmitted
2,blk_-1608999687919862906,81109,203527,148,INFO,dfs.DataNode$DataTransfer:,Transmitted
3,blk_-1608999687919862906,81109,203532,161,INFO,dfs.DataNode$DataTransfer:,Transmitted
4,blk_-8213344449220111733,81109,204522,528,INFO,dfs.DataNode$DataTransfer:,Transmitted


## Create dataframe for df_5

All events have the same format, so we don't need to add a column past the event type.

In [89]:
# Sample DataNode event.
df_5_list[0]

'081109 203521 19 INFO dfs.DataNode: 10.250.14.224:50010 Starting thread to transfer block blk_-1608999687919862906 to 10.251.215.16:50010, 10.251.71.193:50010\n'

In [90]:
# Occurrence count of every whitespace-separated token across the
# DataNode events.
word_count_5 = {}
for line in df_5_list:
    for token in line.split():
        word_count_5[token] = word_count_5.get(token, 0) + 1


In [91]:
# Tokens that occur at least as many times as there are DataNode events.
n_events = len(df_5_list)
for token, occurrences in word_count_5.items():
    if occurrences >= n_events:
        print(token)

INFO
dfs.DataNode:
Starting
thread
to
transfer
block


In [92]:
# Creating the column lists

df_5_block_id = []
df_5_first_digits = []
df_5_second_digits = []
df_5_third_digits = []
df_5_caps = []
dfs_5 = []

# One pass over the DataNode events, one regex per column; each `[0]` raises
# IndexError if a line lacks the field. Values are appended directly, so the
# notebook-level `caps` and `dfs` lists built earlier are no longer rebound to
# per-line strings. Patterns with backslashes are raw strings.
for line in df_5_list:
    df_5_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_5_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_5_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_5_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_5_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_5.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type

In [93]:
# Column name -> per-event values, then the DataNode dataframe.
dict_5 = dict(
    block_id=df_5_block_id,
    first_digits=df_5_first_digits,
    second_digits=df_5_second_digits,
    third_digits=df_5_third_digits,
    message=df_5_caps,
    event_type=dfs_5,
)

df_5 = pd.DataFrame(dict_5)
df_5.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type
0,blk_-1608999687919862906,81109,203521,19,INFO,dfs.DataNode:
1,blk_-1608999687919862906,81109,203526,19,INFO,dfs.DataNode:
2,blk_-1608999687919862906,81109,203527,19,INFO,dfs.DataNode:
3,blk_-1608999687919862906,81109,203531,19,INFO,dfs.DataNode:
4,blk_-8213344449220111733,81109,204521,19,INFO,dfs.DataNode:


## Create dataframe for df_6 events

In [94]:
# Sample DataXceiver event.
df_6_list[0]

'081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010\n'

In [95]:
# Number of DataXceiver events.
len(df_6_list)

2518678

In [96]:
# Split the DataXceiver events by the level text they contain. The two
# filters are independent, so the final comparison checks that every event
# landed in exactly one of them overall.
info = [line for line in df_6_list if 'INFO' in line]
warn = [line for line in df_6_list if 'WARN' in line]
len(info) + len(warn) == len(df_6_list)

True

In [97]:
# Sample INFO-level DataXceiver event.
info[0]

'081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010\n'

In [98]:
# Sample WARN-level DataXceiver event.
warn[0]

'081109 213849 2471 WARN dfs.DataNode$DataXceiver: 10.251.71.68:50010:Got exception while serving blk_-8781759536960110370 to /10.250.17.225:\n'

In [99]:
# Number of INFO events whose text contains "Receiving".
count = sum(1 for line in info if "Receiving" in line)
count

1723232

In [100]:
# Number of WARN events whose text contains "exception".
count = sum(1 for line in warn if "exception" in line)
count

356207

In [101]:
# First alphabetic word of each INFO DataXceiver message, after skipping any
# leading run of digits, dots, colons, parentheses and spaces (an address
# prefix). Raw-string regex ('\.', '\S' and '\d' are regex escapes).
first = []
for line in info:
    first.append(re.findall(r'dfs\.\S*: [\d(.): ]*([A-Za-z]*)', line)[0])
set(first)

{'Received', 'Receiving', 'Served', 'writeBlock'}

In [122]:
# Creating the column lists

df_6_block_id = []
df_6_first_digits = []
df_6_second_digits = []
df_6_third_digits = []
df_6_caps = []
dfs_6 = []
df_6_rrswe = []

# One pass over the DataXceiver events, one regex per column; each `[0]`
# raises IndexError if a line lacks the field. Values are appended directly,
# so the notebook-level `caps`, `dfs` and `first` lists built earlier are no
# longer rebound to per-line strings. Patterns with backslashes are raw strings.
for line in df_6_list:
    df_6_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_6_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_6_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_6_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_6_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_6.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type
    # First word after skipping any leading characters from the class
    # [\d(.): Got] -- i.e. an address prefix and a leading "Got ".
    df_6_rrswe.append(re.findall(r'dfs\.\S*: [\d(.): Got]*([A-Za-z]*)', line)[0])

In [123]:
# Column name -> per-event values, then the DataXceiver dataframe.
dict_6 = dict(
    block_id=df_6_block_id,
    first_digits=df_6_first_digits,
    second_digits=df_6_second_digits,
    third_digits=df_6_third_digits,
    message=df_6_caps,
    event_type=dfs_6,
    rrswe=df_6_rrswe,
)

df_6 = pd.DataFrame(dict_6)
df_6.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,rrswe
0,blk_-1608999687919862906,81109,203518,143,INFO,dfs.DataNode$DataXceiver:,Receiving
1,blk_-1608999687919862906,81109,203519,143,INFO,dfs.DataNode$DataXceiver:,Receiving
2,blk_-1608999687919862906,81109,203519,145,INFO,dfs.DataNode$DataXceiver:,Receiving
3,blk_7503483334202473044,81109,203520,142,INFO,dfs.DataNode$DataXceiver:,Receiving
4,blk_7503483334202473044,81109,203520,145,INFO,dfs.DataNode$DataXceiver:,Receiving


## Create dataframe for df_7 events

In [130]:
# Number of BlockReceiver events.
len(df_7_list)

1718

In [131]:
# Sample BlockReceiver event.
df_7_list[0]

'081109 204530 526 INFO dfs.DataNode$BlockReceiver: Receiving empty packet for block blk_-3842070622043972712\n'

In [132]:
# Occurrence count of every whitespace-separated token across the
# BlockReceiver events.
word_count_7 = {}
for line in df_7_list:
    for token in line.split():
        word_count_7[token] = word_count_7.get(token, 0) + 1

In [134]:
# Tokens whose total count equals the number of BlockReceiver events.
n_events = len(df_7_list)
for token, occurrences in word_count_7.items():
    if occurrences == n_events:
        print(token)

INFO
dfs.DataNode$BlockReceiver:


In [157]:
# Bucket the BlockReceiver events by the first word of their message:
#   first - the first word of every event
#   long  - events whose first word is longer than 15 characters
#   exp / chan / rec - events starting with 'Exception' / 'Changing' / 'Receiving'
# The unused `jav` list and the dead commented-out regex of the original cell
# are dropped; the pattern is a raw string ('\.' and '\S' are regex escapes).
first = []
long = []
exp = []
chan = []
rec = []
for line in df_7_list:
    lead = re.findall(r'dfs\.\S*: (\S*)', line)[0]
    first.append(lead)
    if len(lead) > 15:
        long.append(line)
    if lead == 'Exception':
        exp.append(line)
    if lead == 'Changing':
        chan.append(line)
    if lead == 'Receiving':
        rec.append(line)
#set(first)

In [155]:
# Creating the column lists

df_7_block_id = []
df_7_first_digits = []
df_7_second_digits = []
df_7_third_digits = []
df_7_caps = []
dfs_7 = []
df_7_type = []

# One pass over the BlockReceiver events, one regex per column; each `[0]`
# raises IndexError if a line lacks the field. Values are appended directly,
# so the notebook-level `caps`, `dfs` and `first` lists built earlier are no
# longer rebound to per-line strings. Patterns with backslashes are raw strings.
for line in df_7_list:
    df_7_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_7_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_7_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_7_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_7_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_7.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type

    # Event sub-type, derived from the first word of the message:
    #   - longer than 15 characters -> recorded as 'Exception writing'
    #   - exactly 'Exception'       -> the first "java.<...>" token on the line
    #   - anything else             -> the word itself
    type_7 = re.findall(r'dfs\.\S*: (\S*)', line)[0]
    if len(type_7) > 15:
        df_7_type.append('Exception writing')
    elif type_7 == 'Exception':
        df_7_type.append(re.findall(r'java\.\S*', line)[0])
    else:
        df_7_type.append(type_7)

In [159]:
# Column name -> per-event values, then the BlockReceiver dataframe.
dict_7 = dict(
    block_id=df_7_block_id,
    first_digits=df_7_first_digits,
    second_digits=df_7_second_digits,
    third_digits=df_7_third_digits,
    message=df_7_caps,
    event_type=dfs_7,
    cerj=df_7_type,
)

df_7 = pd.DataFrame(dict_7)
df_7.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,cerj
0,blk_-3842070622043972712,81109,204530,526,INFO,dfs.DataNode$BlockReceiver:,Receiving
1,blk_-3842070622043972712,81109,204530,536,INFO,dfs.DataNode$BlockReceiver:,Receiving
2,blk_-3842070622043972712,81109,204530,543,INFO,dfs.DataNode$BlockReceiver:,Receiving
3,blk_7621437832633701631,81109,204553,525,INFO,dfs.DataNode$BlockReceiver:,Receiving
4,blk_7621437832633701631,81109,204553,532,INFO,dfs.DataNode$BlockReceiver:,Receiving


## Create dataframe for df_8 events

In [168]:
# Sample PacketResponder event.
df_8_list[0]

'081109 203519 145 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block blk_-1608999687919862906 terminating\n'

In [171]:
# Occurrence count of every whitespace-separated token across the
# PacketResponder events.
word_count_8 = {}
for line in df_8_list:
    for token in line.split():
        word_count_8[token] = word_count_8.get(token, 0) + 1

In [172]:
# Tokens whose total count equals the number of PacketResponder events.
n_events = len(df_8_list)
for token, occurrences in word_count_8.items():
    if occurrences == n_events:
        print(token)

INFO
dfs.DataNode$PacketResponder:


In [175]:
# Split the PacketResponder events by the first word of their message:
# `pack` gets the events starting with 'PacketResponder', `rec` gets all the
# others. Raw-string regex ('\.' and '\S' are regex escapes).
pack = []
rec = []
for line in df_8_list:
    lead = re.findall(r'dfs\.\S*: (\S*)', line)[0]
    if lead == 'PacketResponder':
        pack.append(line)
    else:
        rec.append(line)

In [180]:
# Creating the column lists

df_8_block_id = []
df_8_first_digits = []
df_8_second_digits = []
df_8_third_digits = []
df_8_caps = []
dfs_8 = []
df_8_type = []

# One pass over the PacketResponder events, one regex per column; each `[0]`
# raises IndexError if a line lacks the field. Values are appended directly,
# so the notebook-level `caps`, `dfs` and `first` lists built earlier are no
# longer rebound to per-line strings. Patterns with backslashes are raw strings.
for line in df_8_list:
    df_8_block_id.append(re.findall(r'blk_\S*', line)[0])            # first blk_ token
    df_8_first_digits.append(re.findall(r'\S*', line)[0])            # 1st token
    df_8_second_digits.append(re.findall(r'\S* (\S*)', line)[0])     # 2nd token
    df_8_third_digits.append(re.findall(r'\S* \S* (\S*)', line)[0])  # 3rd token
    df_8_caps.append(re.findall('[A-Z]{4}', line)[0])                # first 4 capitals
    dfs_8.append(re.findall(r'dfs\.\S*:', line)[0])                  # event type
    df_8_type.append(re.findall(r'dfs\.\S*: (\S*)', line)[0])        # first message word

In [181]:
# Column name -> per-event values, then the PacketResponder dataframe.
dict_8 = dict(
    block_id=df_8_block_id,
    first_digits=df_8_first_digits,
    second_digits=df_8_second_digits,
    third_digits=df_8_third_digits,
    message=df_8_caps,
    event_type=dfs_8,
    pack_rec=df_8_type,
)

df_8 = pd.DataFrame(dict_8)
df_8.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,pack_rec
0,blk_-1608999687919862906,81109,203519,145,INFO,dfs.DataNode$PacketResponder:,PacketResponder
1,blk_-1608999687919862906,81109,203519,145,INFO,dfs.DataNode$PacketResponder:,PacketResponder
2,blk_-1608999687919862906,81109,203519,145,INFO,dfs.DataNode$PacketResponder:,Received
3,blk_-1608999687919862906,81109,203519,145,INFO,dfs.DataNode$PacketResponder:,Received
4,blk_-1608999687919862906,81109,203519,147,INFO,dfs.DataNode$PacketResponder:,PacketResponder


In [192]:
# Persist the PacketResponder dataframe as CSV.
# The path is written as a raw string: the original literal relied on invalid
# escape sequences such as '\c', '\M' and '\d' being kept verbatim, which warns
# on current Python. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific Windows path; consider a
# path relative to the project directory so the notebook runs elsewhere.
df_8.to_csv(r'C:\Users\conno\MDS-labs\data586\Project\DATA586_project\string_dfs\df_8_string.csv')

In [193]:
# Release the log file handle opened at the top of the notebook.
hdfs.close()