In [21]:
import pandas as pd
import os
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict
import sys
sys.path.append('../')
from log_parser import Drain

# Locations of the raw (unstructured) HDFS log and its anomaly labels.
# log_file_path is handed to the parser as its input directory.
log_file_path = 'data/unstructured/HDFS/'
# CSV read later for the per-block labels (uses its 'BlockId' and 'Label' columns).
label_file_name = 'data/unstructured/HDFS/anomaly_label.csv'
unstructured_log_filename = 'HDFS_2k.log'
# structured_log_file_path is handed to the parser as its output directory; the
# structured CSV below is read back from it (presumably written there by the
# parser -- confirm against Drain).
structured_log_file_path = 'data/structured/HDFS/'
structured_log_filename = 'HDFS_2k.log_structured.csv'


def parseLog(log_file_path, log_file_name, structured_log_file_path, log_type):
    """Run the Drain log parser over one raw log file.

    Args:
        log_file_path: Directory holding the raw log; passed to Drain as ``indir``.
        log_file_name: Name of the raw log file to parse.
        structured_log_file_path: Directory passed to Drain as ``outdir``.
        log_type: Log dialect selecting the header format. Only ``'HDFS'`` is
            supported.

    Raises:
        ValueError: If ``log_type`` is not a supported dialect. (Previously this
            surfaced as an UnboundLocalError on ``log_format``.)
    """
    # Header layout for each supported log dialect.
    log_formats = {
        'HDFS': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
    }
    if log_type not in log_formats:
        raise ValueError(
            "Unsupported log_type %r; supported types: %s"
            % (log_type, ', '.join(sorted(log_formats)))
        )
    log_format = log_formats[log_type]

    # Regular expression list for optional preprocessing (default: [])
    regex      = [
        r'blk_(|-)[0-9]+' , # block id
        r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)', # IP
        r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$', # Numbers
    ]
    st         = 0.5  # Similarity threshold
    depth      = 4  # Depth of all leaf nodes

    parser = Drain.LogParser(log_format, indir=log_file_path, outdir=structured_log_file_path,  depth=depth, st=st, rex=regex)
    parser.parse(log_file_name)

## parse the logs - convert unstructured to structured log
parseLog(log_file_path, unstructured_log_filename, structured_log_file_path, 'HDFS')


## read structured log
# Build the path once with os.path.join so the printed and loaded paths cannot
# diverge, and so a directory constant without a trailing '/' still works.
structured_log_path = os.path.join(structured_log_file_path, structured_log_filename)
print("Loading", structured_log_path)
# na_filter=False keeps empty fields as empty strings rather than NaN.
structured_log = pd.read_csv(structured_log_path, engine='c', na_filter=False, memory_map=True)



Parsing file: data/unstructured/HDFS/HDFS_2k.log
Processed 50.0% of log lines.
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.328635]
Loading data/structured/HDFS/HDFS_2k.log_structured.csv


In [23]:
# Create a map of BlockId vs the EventSequence.
# A log line may mention several block ids; the line's EventId is appended to the
# sequence of every block it mentions, in log order.
data_dict = OrderedDict()

# Walk the two needed columns directly; DataFrame.iterrows() would build a
# Series for every row just to read two fields.
for content, event_id in zip(structured_log['Content'], structured_log['EventId']):
    # dict.fromkeys de-duplicates while keeping first-appearance order. A set
    # would make the row order of data_df depend on string hash randomization
    # whenever one line mentions more than one block.
    for block in dict.fromkeys(re.findall(r'(blk_-?\d+)', content)):
        data_dict.setdefault(block, []).append(event_id)
data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])


# Add anomaly label to the data tagged to block id
label_data = pd.read_csv(label_file_name, engine='c', na_filter=False, memory_map=True)
label_data = label_data.set_index('BlockId')
label_dict = label_data['Label'].to_dict()
# 1 when the block is labelled 'Anomaly', 0 otherwise. A block id missing from
# the label file raises KeyError rather than being silently labelled normal.
data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)

print(data_df)
data_df.to_csv('data_instances.csv', index=False)





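# TODO: split train and test data.
# NOTE: `_split_data` is not defined in this notebook (calling it raises
# NameError), so the call stays commented out until the helper is implemented
# or imported. `train_ratio` and `split_type` must also be defined first.
# (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, 
#     data_df['Label'].values, train_ratio, split_type)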

# print(y_train.sum(), y_test.sum())



# (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(...)




## apply ML to it


                       BlockId EventSequence  Label
0        blk_38865049064139660    [dc2c74b7]      0
1     blk_-6952295868487656571    [dc2c74b7]      0
2      blk_7128370237687728475    [5d5de21c]      0
3      blk_8229193803249955061    [dc2c74b7]      0
4     blk_-6670958622368987959    [dc2c74b7]      0
...                        ...           ...    ...
2195   blk_4198733391373026104    [09a53393]      0
2196  blk_-5815145248455404269    [e3df2680]      0
2197   blk_-295306975763175640    [09a53393]      0
2198   blk_5225719677049010638    [dc2c74b7]      0
2199   blk_4343207286455274569    [09a53393]      0

[2200 rows x 3 columns]


NameError: name '_split_data' is not defined