In [1]:
import os
import pandas as pd
import re
# Suppressing the warning messages
import warnings
from datetime import datetime
from dotenv import load_dotenv
from tqdm import tqdm

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Load variables through python-dotenv before BASE_PATH is read below.
load_dotenv()

BASE_PATH = os.getenv('BASE_PATH')
if BASE_PATH is None:
    # Fail here with an actionable message; otherwise os.path.join(None, ...)
    # below raises an opaque TypeError about NoneType.
    raise RuntimeError(
        "Environment variable BASE_PATH is not set; "
        "export it or define it in a .env file before running this notebook."
    )

# The three preprocessed inputs share one directory, so build it once.
PREPROCESSED_DIR = os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed')

traces = pd.read_csv(os.path.join(PREPROCESSED_DIR, 'Event_traces.csv'))
labels = pd.read_csv(os.path.join(PREPROCESSED_DIR, 'anomaly_label.csv'))
log_templates = pd.read_csv(os.path.join(PREPROCESSED_DIR, 'HDFS.log_templates.csv'))

# Compile one pattern per template: escape the literal text, then turn every
# escaped "[*]" placeholder into ".*".
log_templates['Regex'] = log_templates['EventTemplate'].apply(
    lambda t: re.compile(re.escape(t).replace(r'\[\*\]', '.*')))


def map_log_to_event(log_line, templates):
    """Return the EventId of the first template whose pattern matches a log line.

    Args:
        log_line: Text of a single log line.
        templates: DataFrame with an 'EventId' column and a 'Regex' column of
            compiled patterns. Rows are tried in their existing order.

    Returns:
        The 'EventId' value of the first row whose pattern matches at the start
        of ``log_line``, or None when no row matches.
    """
    # Walk the two columns directly. DataFrame.iterrows() would build a fresh
    # Series for every row on every call, which is pure overhead when this
    # function is invoked once per log line.
    for event_id, pattern in zip(templates['EventId'], templates['Regex']):
        if pattern.match(log_line):
            return event_id
    return None


def extract_block_id(log_line):
    """Find the first block identifier mentioned in a log line.

    An identifier is the text ``blk_`` followed by an optional minus sign and
    one or more digits.

    Returns:
        The matched identifier string, or None if the line contains none.
    """
    found = re.search(r'blk_-?\d+', log_line)
    return found.group(0) if found is not None else None


log_file_path = os.path.expanduser(os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'HDFS.log'))

timestamps = []
event_ids = []

# The parsed result is cached on disk so the raw log is only scanned when no
# cache exists yet.
pickle_file = os.path.join("variables", "data_frame.pkl")

if os.path.exists(pickle_file):
    # NOTE: unpickling can execute code embedded in the file; only load caches
    # written by this notebook.
    df = pd.read_pickle(pickle_file)
else:
    # The raw log is only needed when the cache has to be built.
    if not os.path.exists(log_file_path):
        raise FileNotFoundError(f"No such file or directory: '{log_file_path}'")

    # Create the cache directory before scanning: to_pickle() does not create
    # missing parent directories, and failing after the full pass over the log
    # would throw away all of the parsing work.
    os.makedirs(os.path.dirname(pickle_file), exist_ok=True)

    # Matches a leading "<6 digits> <6 digits> <digits>" prefix; compiled once
    # instead of being looked up on every line.
    timestamp_pattern = re.compile(r'(\d{6}) (\d{6}) \d+')

    with open(log_file_path, 'r') as file:
        for line in tqdm(file, desc="Processing logs", unit="log"):
            timestamp_match = timestamp_pattern.match(line)
            if timestamp_match:
                date_str, time_str = timestamp_match.groups()
                timestamp = datetime.strptime(date_str + time_str, "%y%m%d%H%M%S")
            else:
                # Lines without the expected prefix keep a row, with no timestamp.
                timestamp = None
            event_id = map_log_to_event(line, log_templates)
            timestamps.append(timestamp)
            event_ids.append(event_id)

    df = pd.DataFrame(
        {
            'Timestamp': timestamps,
            'EventId': event_ids,
        }
    )
    df.to_pickle(pickle_file)

In [2]:
# Print a summary of the parsed frame: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11175629 entries, 0 to 11175628
Data columns (total 2 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Timestamp  datetime64[ns]
 1   EventId    object        
dtypes: datetime64[ns](1), object(1)
memory usage: 170.5+ MB


In [4]:
# Preview the first rows of the parsed frame (DataFrame.head() defaults to five).
df.head()

Unnamed: 0,Timestamp,EventId
0,2008-11-09 20:35:18,E5
1,2008-11-09 20:35:18,E22
2,2008-11-09 20:35:19,E5
3,2008-11-09 20:35:19,E5
4,2008-11-09 20:35:19,E11


In [None]:
error_events = ['E4', 'E7', '']

df['Label'] = 