All datasets were downloaded from [LogHub's GitHub repository](https://github.com/logpai/loghub).

In [2]:
import pandas as pd
import re
import os
import random

In [3]:
def log_to_dataframe(log_file, regex, headers):
    """Parse a log file into a DataFrame with one row per matching line.

    Args:
        log_file: Path of the text log file to read.
        regex: Compiled pattern exposing one named group per entry of
            ``headers``.
        headers: Names of the groups to extract; they become the column
            names, in the given order.

    Returns:
        DataFrame with the captured fields of every line that matched
        ``regex``. Lines that do not match are skipped.

    Raises:
        IndexError: If ``headers`` names a group that ``regex`` does not
            define (a caller error, no longer silently swallowed).
    """
    log_messages = []
    # errors='ignore' drops undecodable bytes instead of aborting the load.
    with open(log_file, 'r', errors='ignore') as fin:
        # Iterate the handle directly so the raw lines of a large log are
        # streamed instead of all being materialised by readlines().
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # The line does not follow the expected format; skip it.
                continue
            log_messages.append([match.group(header) for header in headers])
    return pd.DataFrame(log_messages, columns=headers)

In [4]:
def generate_logformat_regex(logformat):
    """Build the header list and compiled regex for a ``<Field>``-style log format.

    Args:
        logformat: Format string such as ``'<Date> <Time>: <Payload>'``.
            Every ``<Name>`` token becomes a lazy named group; runs of
            spaces in the text between tokens match any whitespace run.
            Other literal text is inserted into the pattern unescaped, so
            regex metacharacters in it keep their regex meaning.

    Returns:
        Tuple ``(headers, regex)``: the field names in order of appearance
        and the compiled pattern anchored with ``^`` and ``$``.
    """
    headers = []
    pattern = ''
    # Splitting on a capturing group alternates literal text (even indexes)
    # with <Field> tokens (odd indexes).
    for k, part in enumerate(re.split(r'(<[^<>]+>)', logformat)):
        if k % 2 == 0:
            # Raw replacement template: r'\\s+' emits a backslash followed by
            # 's+'. The former '\\\s+' produced the same text but relied on
            # the invalid string escape '\s'.
            pattern += re.sub(' +', r'\\s+', part)
        else:
            header = part.strip('<').strip('>')
            pattern += '(?P<%s>.*?)' % header
            headers.append(header)
    return headers, re.compile('^' + pattern + '$')

In [5]:
def load_data(path, logName, log_format):
    """Read the log file ``logName`` under ``path`` into a DataFrame.

    The column names and the line-matching regex are both derived from
    ``log_format`` and handed to ``log_to_dataframe``.
    """
    columns, line_pattern = generate_logformat_regex(log_format)
    log_path = os.path.join(path, logName)
    return log_to_dataframe(log_path, line_pattern, columns)

In [6]:
def sample(df: pd.DataFrame, k: int, seed=None) -> pd.DataFrame:
    """Return a random window of ``k`` consecutive rows from ``df``.

    Args:
        df: Frame to take the window from.
        k: Number of consecutive rows to keep; must satisfy
            ``0 <= k <= len(df)``.
        seed: Optional seed. When given, the window start is drawn from a
            dedicated ``random.Random(seed)`` so the selection is
            reproducible; when omitted, the module-level ``random``
            generator is used as before.

    Returns:
        The selected rows with the index reset to ``0..k-1``.

    Raises:
        ValueError: If ``k`` is negative or larger than the number of rows.
    """
    n_rows = df.shape[0]
    if k < 0 or k > n_rows:
        raise ValueError(
            f"Cannot sample a window of {k} rows from a frame with {n_rows} rows"
        )
    rng = random if seed is None else random.Random(seed)
    # randint is inclusive on both ends, so the last valid start is n_rows - k.
    start = rng.randint(0, n_rows - k)
    return df.iloc[start:(start + k), :].reset_index(drop=True)

In [36]:
# Relative path, so it is resolved against the notebook's working directory.
data_path = '../data/' # Path of the input *.log files and output *.csv files

# BGL

In [37]:
# Field layout of one BGL log line; each <Name> token becomes a DataFrame column.
bgl_format = '<Label> <Id> <Date> <Code1> <timestamp> <Code2> <Component1> <Component2> <Level> <Payload>'
bgl_df = load_data(data_path, 'BGL.log', bgl_format)

# Select sequence of size k randomly
# NOTE(review): sampling is disabled, so every parsed line of BGL.log is kept,
# although the frame is later written to 'BGL_2k.csv' — confirm which is intended.
#bgl_df = sample(bgl_df, 2000)
bgl_df.head()

Unnamed: 0,Label,Id,Date,Code1,timestamp,Code2,Component1,Component2,Level,Payload
0,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
1,-,1117838573,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.53.276129,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
2,-,1117838976,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.49.36.156884,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
3,-,1117838978,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.49.38.026704,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
4,-,1117842440,2005.06.03,R23-M0-NE-C:J05-U01,2005-06-03-16.47.20.730545,R23-M0-NE-C:J05-U01,RAS,KERNEL,INFO,63543 double-hummer alignment exceptions


In [38]:
# Transform timestamps
bgl_ts_format = "%Y-%m-%d-%H.%M.%S.%f"
bgl_df['timestamp'] = pd.to_datetime(bgl_df['timestamp'], format=bgl_ts_format)

# Transform labels to binary: '-' becomes '0', any other raw label becomes '1'.
# An existing '0' is kept as '0' so that re-running this cell does not turn
# every already-converted row into '1'.
bgl_df['Label'] = bgl_df['Label'].apply(lambda x: '0' if x in ('-', '0') else '1')
bgl_df.head()

Unnamed: 0,Label,Id,Date,Code1,timestamp,Code2,Component1,Component2,Level,Payload
0,0,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
1,0,1117838573,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:42:53.276129,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
2,0,1117838976,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:49:36.156884,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
3,0,1117838978,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:49:38.026704,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
4,0,1117842440,2005.06.03,R23-M0-NE-C:J05-U01,2005-06-03 16:47:20.730545,R23-M0-NE-C:J05-U01,RAS,KERNEL,INFO,63543 double-hummer alignment exceptions


In [39]:
# Number of log lines per class; the '0'/'1' index values are renamed for display.
bgl_df.groupby(['Label'])['Label'].count().rename({'1': 'Anomaly', '0': 'Normal'})

Label
Normal     1857
Anomaly     143
Name: Label, dtype: int64

In [40]:
# Write the processed BGL frame into data_path; the row index is not saved.
bgl_df.to_csv(os.path.join(data_path, 'BGL_2k.csv'), index=False)

# HDFS

In [35]:
# Field layout of one HDFS log line; each <Name> token becomes a DataFrame column.
hdfs_format = '<Date> <Time> <Pid> <Level> <Component>: <Payload>'
hdfs_df = load_data(data_path, 'HDFS.log', hdfs_format)

# Date and time are separate fields, so join them before parsing a timestamp.
hdfs_df['timestamp'] = hdfs_df['Date'] + '-' + hdfs_df['Time']
hdfs_ts_format = '%y%m%d-%H%M%S'
hdfs_df['timestamp'] = pd.to_datetime(hdfs_df['timestamp'], format=hdfs_ts_format)
# Select sequence of size k randomly
#hdfs_df = sample(hdfs_df, 2000)
# Write into data_path, like the other dataset exports, instead of the
# current working directory.
hdfs_df.to_csv(os.path.join(data_path, 'HDFS_2k.csv'), index=False)
hdfs_df.head()


Unnamed: 0,Date,Time,Pid,Level,Component,Payload,timestamp
0,81109,203615,148,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_38865049064139...,2008-11-09 20:36:15
1,81109,203807,222,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_-6952295868487...,2008-11-09 20:38:07
2,81109,204005,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.addStoredBlock: blockMap upd...,2008-11-09 20:40:05
3,81109,204015,308,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_82291938032499...,2008-11-09 20:40:15
4,81109,204106,329,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_-6670958622368...,2008-11-09 20:41:06


In [28]:
#hdfs_df = pd.read_csv('../data/HDFS.csv')

# Since date and time are separate, we need to combine them into a timestamp
# NOTE(review): this repeats the conversion already performed in the cell that
# loads hdfs_df — presumably kept for the read_csv path above; confirm.
hdfs_df['timestamp'] = hdfs_df['Date'] + '-' + hdfs_df['Time']
hdfs_ts_format = '%y%m%d-%H%M%S'
hdfs_df['timestamp'] = pd.to_datetime(hdfs_df['timestamp'], format=hdfs_ts_format)

# Label information is also missing, so we need to add it from the label table
# (its 'BlockId' and 'Label' columns are read by anomaly_classification below).
anomaly_labels = pd.read_csv(os.path.join(data_path, 'anomaly_label.csv'))
def anomaly_classification(payload, labels=None):
    """Return '1' if the block referenced by a log payload is labelled 'Anomaly', else '0'.

    Args:
        payload: Log message text that must mention exactly one distinct
            block id of the form ``blk_<digits>`` or ``blk_-<digits>``.
        labels: Optional DataFrame with 'BlockId' and 'Label' columns.
            Defaults to the notebook-level ``anomaly_labels`` frame.

    Returns:
        '1' when the block's label equals 'Anomaly', otherwise '0'.

    Raises:
        ValueError: If the payload mentions zero or several distinct block ids.
        KeyError: If the block id has no row in the label table (previously
            an uninformative IndexError).
    """
    if labels is None:
        labels = anomaly_labels
    # A set collapses repeated mentions of the same block id.
    block_ids = set(re.findall(r'(blk_-?\d+)', payload))
    if len(block_ids) != 1: # This shouldn't happen
        raise ValueError(
            f"Row has {len(block_ids)} blkIds. Cannot determine if anomaly or not: {payload!r}"
        )
    block_id = next(iter(block_ids))
    # NOTE: this scans the whole label table on every call; for row-wise use
    # over a large frame a prebuilt BlockId -> Label mapping would be faster.
    matches = labels.loc[labels['BlockId'] == block_id, 'Label']
    if matches.empty:
        raise KeyError(f"Block id {block_id} not found in the anomaly label table")
    return '1' if matches.iloc[0] == 'Anomaly' else '0'

# NOTE(review): labelling is disabled, so this cell leaves hdfs_df without a
# 'Label' column — confirm whether it should be enabled before the export.
#hdfs_df['Label'] = hdfs_df['Payload'].apply(anomaly_classification)
hdfs_df.head()

Unnamed: 0,Date,Time,Pid,Level,Component,Payload,timestamp
0,81109,203615,148,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_38865049064139...,2008-11-09 20:36:15
1,81109,203807,222,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_-6952295868487...,2008-11-09 20:38:07
2,81109,204005,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.addStoredBlock: blockMap upd...,2008-11-09 20:40:05
3,81109,204015,308,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_82291938032499...,2008-11-09 20:40:15
4,81109,204106,329,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_-6670958622368...,2008-11-09 20:41:06


In [None]:
# Number of log lines per class; the '0'/'1' index values are renamed for display.
# NOTE(review): needs a 'Label' column on hdfs_df, which the visible cells only
# create in a commented-out line — confirm before running.
hdfs_df.groupby(['Label'])['Label'].count().rename({'1': 'Anomaly', '0': 'Normal'})

In [None]:
# Write the processed HDFS frame into data_path; the row index is not saved.
hdfs_df.to_csv(os.path.join(data_path, 'HDFS_2k.csv'), index=False)

# Thunderbird (supercomputer logs)

In [14]:
# Field layout of one Thunderbird log line; each <Name> token becomes a DataFrame column.
tbird_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Payload>'

# The full Thunderbird log file is really huge, so only the 2k-line sample
# published in the GitHub repository is used here.
tbird_df = load_data(data_path, 'Thunderbird_2k.log', tbird_format)

# Select sequence of size k randomly
# NOTE(review): disabled; the requested window of 20000 rows presumably exceeds
# the size of the 2k sample file, which sample() cannot satisfy — confirm k.
#tbird_df = sample(tbird_df, 20000)
tbird_df.head()

Unnamed: 0,Label,Id,Date,Admin,Month,Day,Time,AdminAddr,Payload
0,-,1131566461,2005.11.09,dn228,Nov,9,12:01:01,dn228/dn228,crond(pam_unix)[2915]: session closed for user...
1,-,1131566461,2005.11.09,dn228,Nov,9,12:01:01,dn228/dn228,crond(pam_unix)[2915]: session opened for user...
2,-,1131566461,2005.11.09,dn228,Nov,9,12:01:01,dn228/dn228,crond[2916]: (root) CMD (run-parts /etc/cron.h...
3,-,1131566461,2005.11.09,dn261,Nov,9,12:01:01,dn261/dn261,crond(pam_unix)[2907]: session closed for user...
4,-,1131566461,2005.11.09,dn261,Nov,9,12:01:01,dn261/dn261,crond(pam_unix)[2907]: session opened for user...


In [15]:
# Transform timestamps: the date and time fields are joined, then parsed.
tbird_df['timestamp'] = tbird_df['Date'] + '-' + tbird_df['Time']
tbird_ts_format = '%Y.%m.%d-%H:%M:%S'
tbird_df['timestamp'] = pd.to_datetime(tbird_df['timestamp'], format=tbird_ts_format)

# Transform labels to binary: '-' becomes '0', any other raw label becomes '1'.
# An existing '0' is kept as '0' so that re-running this cell does not turn
# every already-converted row into '1'.
tbird_df['Label'] = tbird_df['Label'].apply(lambda x: '0' if x in ('-', '0') else '1')
tbird_df.head()

Unnamed: 0,Label,Id,Date,Admin,Month,Day,Time,AdminAddr,Payload,timestamp
0,0,1131566461,2005.11.09,dn228,Nov,9,12:01:01,dn228/dn228,crond(pam_unix)[2915]: session closed for user...,2005-11-09 12:01:01
1,0,1131566461,2005.11.09,dn228,Nov,9,12:01:01,dn228/dn228,crond(pam_unix)[2915]: session opened for user...,2005-11-09 12:01:01
2,0,1131566461,2005.11.09,dn228,Nov,9,12:01:01,dn228/dn228,crond[2916]: (root) CMD (run-parts /etc/cron.h...,2005-11-09 12:01:01
3,0,1131566461,2005.11.09,dn261,Nov,9,12:01:01,dn261/dn261,crond(pam_unix)[2907]: session closed for user...,2005-11-09 12:01:01
4,0,1131566461,2005.11.09,dn261,Nov,9,12:01:01,dn261/dn261,crond(pam_unix)[2907]: session opened for user...,2005-11-09 12:01:01


In [16]:
# Number of log lines per class; the '0'/'1' index values are renamed for display.
tbird_df.groupby(['Label'])['Label'].count().rename({'1': 'Anomaly', '0': 'Normal'})

Label
Normal    2000
Name: Label, dtype: int64

In [17]:
# Write the processed Thunderbird frame into data_path; the row index is not saved.
tbird_df.to_csv(os.path.join(data_path, 'Thunderbird_2k.csv'), index=False)