# HDFS Anomaly Detection using LSTM

## Importing data 

In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import load_model
from tqdm import tqdm

In [5]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Root directory of the dataset, taken from the BASE_PATH environment variable.
BASE_PATH = os.getenv('BASE_PATH')
if BASE_PATH is None:
    # Fail early with a clear message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise further down.
    raise RuntimeError("BASE_PATH is not set; define it in the environment or in a .env file.")

# The three preprocessed HDFS_v1 CSV files live in the same directory.
preprocessed_dir = os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed')

traces = pd.read_csv(os.path.join(preprocessed_dir, 'Event_traces.csv'))
labels = pd.read_csv(os.path.join(preprocessed_dir, 'anomaly_label.csv'))
log_templates = pd.read_csv(os.path.join(preprocessed_dir, 'HDFS.log_templates.csv'))

# Join the traces with the label file on the block identifier.
data = traces.merge(labels, on='BlockId')

# Keep the event sequence and the label column contributed by `traces`
# ('Label_x' is the left-hand side of the merge). rename() without inplace
# returns a fresh frame, which avoids pandas' SettingWithCopyWarning on the
# assignment below.
data = data[['Features', 'Label_x']].rename(columns={'Label_x': 'Label'})

# Binary target: 1 for 'Fail', 0 for every other value.
data['Label'] = (data['Label'] == 'Fail').astype('int64')

print(data.head())

data.head()


                                            Features  Label
0  [E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...      0
1  [E5,E22,E5,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
2  [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
3  [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
4  [E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...      0


Unnamed: 0,Features,Label
0,"[E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...",0
1,"[E5,E22,E5,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...",0
2,"[E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...",0
3,"[E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...",0
4,"[E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...",0


## Sliding Window Creation Function

In [None]:
def create_sliding_windows(event_sequences, label, window_size_local=10, step_size_local=1):
    """Cut one event sequence into fixed-size windows that all share one label.

    Args:
        event_sequences: Sliceable sequence of events (anything supporting
            ``len()`` and slicing).
        label: Value repeated once per produced window.
        window_size_local: Number of events in each window.
        step_size_local: Distance between the starts of consecutive windows.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays: ``x`` holds the windows and ``y``
        holds ``label`` repeated once per window. Both are empty when the
        sequence is shorter than ``window_size_local``.
    """
    # The last valid start index is len - window_size; the "+ 1" makes the
    # range include it, so the final full window is no longer dropped and a
    # sequence of exactly window_size events yields one window.
    last_start = len(event_sequences) - window_size_local
    starts = range(0, last_start + 1, step_size_local)
    x1 = [event_sequences[start: start + window_size_local] for start in starts]
    y1 = [label] * len(x1)
    return np.array(x1), np.array(y1)


## Encoding the sequences

In [None]:
# Every vectorized trace is padded or truncated to this many tokens.
MAX_LEN = 50

# In 'int' output mode TextVectorization reserves two vocabulary slots
# (index 0 for padding, index 1 for out-of-vocabulary tokens), and max_tokens
# counts them. The cap therefore has to be the number of distinct event IDs
# plus 2; with only "+ 1" one event ID is left out of the vocabulary and
# encoded as OOV.
# NOTE: token ids depend on this vocabulary, so arrays or models produced with
# a different cap should be regenerated.
VOCAB_SIZE = len(log_templates['EventId'].unique()) + 2

vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,  # upper bound on vocabulary size, reserved tokens included
    output_mode='int',
    output_sequence_length=MAX_LEN  # pad/truncate every output to MAX_LEN tokens
)

# Learn the vocabulary from the list of known event IDs.
vectorize_layer.adapt(log_templates['EventId'])

## Applying the window function

In [None]:
# Rebuild the window arrays from `data` unless both cached files are already
# on disk, in which case they are simply loaded.
x_cache_path = os.path.join("variables", "x_train.npy")
y_cache_path = os.path.join("variables", "y_train.npy")

if not (os.path.exists(x_cache_path) and os.path.exists(y_cache_path)):
    window_size = 10
    step_size = 1

    x_all, y_all = [], []

    # Walk the two columns in lockstep; to_numpy() keeps the labels as NumPy
    # scalars, exactly as positional indexing into the column would.
    rows = zip(data['Features'], data['Label'].to_numpy())
    for features, label in tqdm(rows, total=len(data), desc="Processing events", unit="log"):
        # Drop the surrounding brackets and turn the comma-separated event
        # list into whitespace-separated tokens for the vectorizer.
        raw_text = features[1:-1].replace(",", " ")
        x_vectorized = vectorize_layer(raw_text)
        x_windows, y_windows = create_sliding_windows(x_vectorized, label, window_size, step_size)

        x_all.append(x_windows)
        y_all.append(y_windows)

    # Stack the per-trace windows into one array each.
    x_train = np.concatenate(x_all, axis=0)
    y_train = np.concatenate(y_all, axis=0)
else:
    x_train = np.load(x_cache_path)
    y_train = np.load(y_cache_path)

print(f"X_Train {x_train.shape}")
print(f"y_train shape: {y_train.shape}")



In [None]:
# Persist both arrays under ./variables so later runs can load them from disk.
cache_dir = "variables"
os.makedirs(cache_dir, exist_ok=True)
for file_name, array in (("x_train.npy", x_train), ("y_train.npy", y_train)):
    np.save(os.path.join(cache_dir, file_name), array)

In [None]:
# Hold out 20% of the windows, stratified on the labels and with a fixed seed
# so the split is reproducible.
x_train_final, x_test_final, y_train_final, y_test_final = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

## Building the LSTM Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Dimension of the embedding learned for every token id.
embedding_vector_length = 32

# Baseline network: embedding -> LSTM(32) -> single sigmoid unit.
model = Sequential([
    Embedding(vectorize_layer.vocabulary_size(), embedding_vector_length),
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Same architecture with input and recurrent dropout on the LSTM, tracked
# with precision and recall in addition to accuracy.
modelDropout = Sequential([
    Embedding(vectorize_layer.vocabulary_size(), embedding_vector_length),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
modelDropout.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after each table.
model.summary()
modelDropout.summary()

## Training the LSTM Model

In [None]:


# Balanced per-class weights computed from the training labels.
label_values = np.unique(y_train_final)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=label_values,
    y=y_train_final.ravel()
)

# Convert to a dictionary (class value -> weight):
weights = {value: weight for value, weight in zip(label_values, class_weights)}

print(weights)

model_path = os.path.join('models', 'lstm_model_best_dropout.keras')

# Keep only the best epoch on disk, judged by validation accuracy.
checkpoint = ModelCheckpoint(
    'models/lstm_model_best_dropout.keras',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

if not os.path.exists(model_path):
    print('Model not found, training a new one.')
    # Train the dropout LSTM, validating on the held-out split.
    modelDropout.fit(
        x_train_final,
        y_train_final,
        validation_data=(x_test_final, y_test_final),
        epochs=3,
        batch_size=512,
        class_weight=weights,
        callbacks=[checkpoint],
    )
else:
    # A saved model already exists: reuse it instead of training again.
    print('Loading existing model.')
    modelDropout = load_model(model_path)


## Prediction Stage

In [None]:
# Encode the parsed event stream and score each sliding window.
text_input = " ".join(event_sequence)
# NOTE(review): vectorize_layer is configured with
# output_sequence_length=MAX_LEN, so only the first MAX_LEN events of the
# stream are encoded here — confirm that this truncation is intended.
x_vectorized = vectorize_layer(text_input)
x_windows, _ = create_sliding_windows(x_vectorized, None)
# Predict with modelDropout, the network that is trained or loaded from disk
# in the training cell. `model` is compiled but never fitted in this notebook,
# so its outputs would come from freshly initialised weights.
predictions = modelDropout.predict(x_windows)

In [None]:


# Line chart of the model output for each window, in window order.
ax = plt.gca()
ax.plot(predictions)
ax.set_title("Failure Probability over Time")
ax.set_xlabel("Window Index")
ax.set_ylabel("Failure Probability")
plt.show()


In [4]:
# Dataset root for this cell, written relative to the user's home directory.
BASE_PATH = "~"

templates_csv = os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed', 'HDFS.log_templates.csv')
log_templates = pd.read_csv(templates_csv)
print(log_templates.head())

# One compiled pattern per template: the template text is escaped literally,
# then every "[*]" placeholder is turned into ".*" so it matches any run of
# characters.
log_templates['Regex'] = [
    re.compile(re.escape(template).replace(r'\[\*\]', '.*'))
    for template in log_templates['EventTemplate']
]


def map_log_to_event(log_line):
    """Return the EventId of the first template whose regex matches a log line.

    Templates are tried in the row order of the module-level ``log_templates``
    frame, using its ``Regex`` column (compiled patterns) and ``EventId``
    column.

    Args:
        log_line: One line of raw log text.

    Returns:
        The ``EventId`` of the first matching template, or ``None`` when no
        template matches.
    """
    # Iterate the two columns directly instead of DataFrame.iterrows():
    # iterrows() builds a Series for every row, which is slow when this
    # function is called once per log line.
    for event_id, pattern in zip(log_templates['EventId'], log_templates['Regex']):
        if pattern.match(log_line):
            return event_id
    return None


# Absolute location of the raw HDFS log ("~" expanded to the home directory).
raw_log_path = os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'HDFS.log')
log_file_path = os.path.expanduser(raw_log_path)

if not os.path.exists(log_file_path):
    raise FileNotFoundError(f"No such file or directory: '{log_file_path}'")

# Map every log line to its event ID, in file order; lines that match no
# template are skipped. The list is filled incrementally, so whatever was
# parsed so far is still available if the loop is interrupted.
event_sequence = []
with open(log_file_path, 'r') as log_file:
    for raw_line in tqdm(log_file, desc="Processing log lines", unit="line"):
        matched_id = map_log_to_event(raw_line.strip())
        if matched_id is None:
            continue
        event_sequence.append(matched_id)
            

  EventId                           EventTemplate
0      E1  [*]Adding an already existing block[*]
1      E2        [*]Verification succeeded for[*]
2      E3                 [*]Served block[*]to[*]
3      E4  [*]Got exception while serving[*]to[*]
4      E5    [*]Receiving block[*]src:[*]dest:[*]


Processing log lines: 855118line [03:21, 4236.09line/s] 


KeyboardInterrupt: 