In [8]:
import os

from dotenv import load_dotenv

load_dotenv()

BASE_PATH = os.getenv('BASE_PATH')

import pandas as pd

traces = pd.read_csv(os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed', 'Event_traces.csv'))
labels = pd.read_csv(os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed', 'anomaly_label.csv'))

data = traces.merge(labels, on='BlockId')

data = data[['Features', 'Label_x']]
data.rename(columns={'Label_x': 'Label'}, inplace=True)
data['Label'] = data['Label'].apply(lambda x: 1 if x == 'Fail' else 0)

print(data.head())


                                            Features  Label
0  [E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...      0
1  [E5,E22,E5,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
2  [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
3  [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...      0
4  [E5,E22,E5,E5,E11,E11,E9,E9,E11,E9,E26,E26,E26...      0


In [9]:
from sklearn.model_selection import train_test_split

x = data['Features']
y = data['Label']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {len(x_train)}")
print(f"\nTraining sample: \n{x_train.head()}")
print(f"\nTest set size: {len(x_test)}")
print(f"\nTesting sample: \n{x_test.head()}")


Training set size: 460047

Training sample: 
8228      [E22,E5,E5,E5,E26,E26,E26,E11,E9,E11,E9,E11,E9...
503227      [E5,E5,E5,E22,E11,E9,E11,E9,E11,E9,E26,E26,E26]
179673    [E5,E5,E5,E22,E11,E9,E11,E9,E11,E9,E26,E26,E26...
106452    [E22,E5,E5,E5,E26,E26,E26,E11,E9,E11,E9,E11,E9...
231195    [E5,E5,E5,E22,E11,E9,E11,E9,E11,E9,E26,E26,E26...
Name: Features, dtype: object

Test set size: 115012

Testing sample: 
309014    [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...
332401    [E5,E5,E5,E22,E11,E9,E11,E9,E26,E11,E9,E26,E26...
303661    [E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...
350657      [E5,E22,E5,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26]
425054    [E22,E5,E5,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26...
Name: Features, dtype: object


In [10]:
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense

log_templates = pd.read_csv(os.path.join(BASE_PATH, 'Raw_logs', 'HDFS_v1', 'preprocessed', 'HDFS.log_templates.csv'))

MAX_LEN = 50
VOCAB_SIZE = len(log_templates['EventId'].unique()) + 1

vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,  # Set your desired vocabulary size
    output_mode='int',
    output_sequence_length=MAX_LEN  # Set your desired sequence length
)

vectorize_layer.adapt(log_templates['EventId'])

x_train_vectorized = vectorize_layer(x_train)
x_test_vectorized = vectorize_layer(x_test)

print("Sample Encoded Sequence: ", x_train_vectorized[0])

Sample Encoded Sequence:  tf.Tensor(
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(50,), dtype=int64)


In [11]:
from tensorflow.keras.models import Sequential

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(1000, embedding_vector_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

None


In [6]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('models/lstm_model_best.keras', monitor='val_accuracy', save_best_only=True, verbose=1)

# Train the LSTM model
model.fit(x_train_vectorized, y_train, epochs=3, batch_size=32, validation_data=(x_test_vectorized, y_test), callbacks=[checkpoint])



Epoch 1/3


2025-04-05 18:31:46.123885: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9706 - loss: 0.1357
Epoch 1: val_accuracy improved from -inf to 0.97137, saving model to models/lstm_model_best.h5




[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 13ms/step - accuracy: 0.9706 - loss: 0.1357 - val_accuracy: 0.9714 - val_loss: 0.1302
Epoch 2/3
[1m14375/14377[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9701 - loss: 0.1345
Epoch 2: val_accuracy did not improve from 0.97137
[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 13ms/step - accuracy: 0.9701 - loss: 0.1345 - val_accuracy: 0.9714 - val_loss: 0.1301
Epoch 3/3
[1m14374/14377[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9707 - loss: 0.1325
Epoch 3: val_accuracy did not improve from 0.97137
[1m14377/14377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 13ms/step - accuracy: 0.9707 - loss: 0.1325 - val_accuracy: 0.9714 - val_loss: 0.1300


<keras.src.callbacks.history.History at 0x3213e3890>