# HDFS Anomaly Detection using LSTM

## Importing data 

In [11]:
import os

from dotenv import load_dotenv

load_dotenv()

BASE_PATH = os.getenv('BASE_PATH')

import pandas as pd

traces = pd.read_csv(os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed', 'Event_traces.csv'))
labels = pd.read_csv(os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed', 'anomaly_label.csv'))
log_templates = pd.read_csv(os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed', 'HDFS.log_templates.csv'))

data = traces.merge(labels, on='BlockId')

data = data[['Features', 'Label_x']]
data.rename(columns={'Label_x': 'Label'}, inplace=True)
data['Label'] = data['Label'].apply(lambda x: 1 if x == 'Fail' else 0)

print(data.head())


                                            Features  Label
0  [E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...      0
1  [E5,E22,E5,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
2  [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
3  [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
4  [E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...      0


## Sliding Window Creation Function

In [43]:
import numpy as np

def create_sliding_windows(event_sequences, label, window_size_local=10, step_size_local=1):
    x1, y1 = [], []
    for i in range(0, len(event_sequences) - window_size_local, step_size_local):
        x1.append(event_sequences[i : i + window_size_local])
        y1.append(label)
    return np.array(x1), np.array(y1)


## Encoding the sequences

In [44]:
from tensorflow.keras.layers import TextVectorization

MAX_LEN = 50
VOCAB_SIZE = len(log_templates['EventId'].unique()) + 1

vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,  # Set your desired vocabulary size
    output_mode='int',
    output_sequence_length=MAX_LEN  # Set your desired sequence length
)

vectorize_layer.adapt(log_templates['EventId'])

## Applying the window function

In [47]:
from tqdm import tqdm
x_all = []
y_all = []

window_size = 10
step_size = 1

for i in tqdm(range(len(data)), desc="Processing events", unit="log"):
    raw_text = data['Features'][i][1:-1].replace(",", " ")
    x_vectorized = vectorize_layer(raw_text)
    label = data['Label'][i]  
    x_windows, y_windows = create_sliding_windows(x_vectorized, label, window_size, step_size)
    
    x_all.append(x_windows)
    y_all.append(y_windows)

x_train = np.concatenate(x_all, axis=0)
y_train = np.concatenate(y_all, axis=0)

print(f"X_Train {x_train.shape}")
print(f"y_train shape: {y_train.shape}")

Processing events:   1%|▏         | 7473/575059 [02:49<3:34:58, 44.00log/s]


KeyboardInterrupt: 

## Building the LSTM Model

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(1000, embedding_vector_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

None


## Training the LSTM Model

In [6]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('models/lstm_model_best.keras', monitor='val_accuracy', save_best_only=True, verbose=1)

# Train the LSTM model
model.fit(x_train_vectorized, y_train, epochs=3, batch_size=32, validation_data=(x_test_vectorized, y_test), callbacks=[checkpoint])


Epoch 1/3


2025-04-05 18:31:46.123885: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9706 - loss: 0.1357
Epoch 1: val_accuracy improved from -inf to 0.97137, saving model to models/lstm_model_best.h5




[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 13ms/step - accuracy: 0.9706 - loss: 0.1357 - val_accuracy: 0.9714 - val_loss: 0.1302
Epoch 2/3
[1m14375/14377[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9701 - loss: 0.1345
Epoch 2: val_accuracy did not improve from 0.97137
[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 13ms/step - accuracy: 0.9701 - loss: 0.1345 - val_accuracy: 0.9714 - val_loss: 0.1301
Epoch 3/3
[1m14374/14377[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9707 - loss: 0.1325
Epoch 3: val_accuracy did not improve from 0.97137
[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 13ms/step - accuracy: 0.9707 - loss: 0.1325 - val_accuracy: 0.9714 - val_loss: 0.1300


<keras.src.callbacks.history.History at 0x3213e3890>