In [1]:
import os

import numpy as np
import pandas as pd

# --- 1. SET THE PATH TO YOUR FEATURE MATRIX ---
# Location of the HDFS_v1 event-occurrence matrix CSV; this constant is what
# gets passed to pd.read_csv when the dataset is loaded.
# NOTE(review): this is a hardcoded absolute path on a local D: drive, so the
# notebook only runs unchanged on a machine with the same folder layout.
# Adjust it for your own setup.
MATRIX_FILE_PATH = 'D:/IoC-Free IDS using ML and NLP/data/raw/HDFS_Logs_Dataset/HDFS_v1/preprocessed/Event_occurrence_matrix.csv'

In [2]:

# --- 2. LOAD THE PRE-ENGINEERED DATASET ---
# Read the event-occurrence matrix CSV at MATRIX_FILE_PATH into `df_matrix`.
try:
    df_matrix = pd.read_csv(MATRIX_FILE_PATH)
except FileNotFoundError as e:
    print(f"Error: Could not find the file. Please check your path. Details: {e}")
    # Stop here instead of falling through: without the file, `df_matrix` is
    # either undefined (fresh kernel) or stale (left over from an earlier run),
    # and every later cell would fail or silently use the wrong data.
    raise
else:
    # Only report success once the read has actually completed.
    print(f"Successfully loaded the feature matrix from '{MATRIX_FILE_PATH}'")
    print(f"Shape of the matrix: {df_matrix.shape}")

# Quick visual check of the first rows.
print("\nData sample:")
print(df_matrix.head())

Successfully loaded the feature matrix from 'D:/IoC-Free IDS using ML and NLP/data/raw/HDFS_Logs_Dataset/HDFS_v1/preprocessed/Event_occurrence_matrix.csv'
Shape of the matrix: (575061, 32)

Data sample:
                    BlockId    Label  Type  E1  E2   E3  E4  E5  E6  E7  ...  \
0  blk_-1608999687919862906  Success   NaN   0   0  203   0  10   7   0  ...   
1   blk_7503483334202473044  Success   NaN   0   2    1   0   3   0   0  ...   
2  blk_-3544583377289625738     Fail  21.0   0   0  203   0   3   0   0  ...   
3  blk_-9073992586687739851  Success   NaN   0   3    0   0   3   0   0  ...   
4   blk_7854771516489510256  Success   NaN   0   3    1  15   3   0   0  ...   

   E20  E21  E22  E23  E24  E25  E26  E27  E28  E29  
0    0   10    1   10    0    4   10    0    0    0  
1    0    3    1    3    0    0    3    0    0    0  
2    1    3    1    3    0    0    3    0    0    0  
3    0    3    1    3    0    0    3    0    0    0  
4    0    3    1    3    0    0    3    0    0

In [4]:
# --- 3. PREPARE DATA FOR MACHINE LEARNING ---
print("\n--> Preparing data for modeling...")

# a. Separate features (X) from the target label (y)
# The features are the event-count columns named 'E' followed by digits
# (E1, E2, ...). Requiring the digits keeps any other column that merely
# starts with 'E' out of the feature matrix.
feature_columns = [
    col for col in df_matrix.columns
    if col.startswith('E') and col[1:].isdigit()
]
X = df_matrix[feature_columns]

# The label is in the 'Label' column. It says 'Success' or 'Fail'.
y_text = df_matrix['Label']

# b. Convert the text label to a binary format (0 for normal, 1 for anomaly)
# We will assume 'Success' means Normal (0) and 'Fail' means Anomaly (1).
label_to_binary = {'Success': 0, 'Fail': 1}

# Fail loudly on anything outside the known label set (missing values, typos,
# new categories) rather than silently counting it as an anomaly.
unexpected_labels = y_text[~y_text.isin(list(label_to_binary))].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Label' column: {list(unexpected_labels)}"
    )

# Explicit int64 keeps the dtype the same on every platform.
y = y_text.map(label_to_binary).astype('int64')
print("    - Separated features (X) and target (y).")
print("    - Converted text labels to binary (0=Success, 1=Fail).")


--> Preparing data for modeling...
    - Separated features (X) and target (y).
    - Converted text labels to binary (0=Success, 1=Fail).


In [5]:
# --- 4. FINAL VERIFICATION ---
# Print shapes, a feature sample, a label sample and the class balance so the
# prepared data can be checked by eye before it is saved.
print("\n--- HDFS Data Preparation Complete! ---")
print(f"Shape of feature matrix X: {X.shape}")
print(f"Shape of target vector y: {y.shape}")

print("\nSample of final features (X):")
print(X.head())

print("\nSample of final labels (y) and their original text:")
# Both series carry the name 'Label'; rename the encoded one for this preview
# only, so the two columns are distinguishable (y itself is left unchanged).
print(pd.concat([y_text, y.rename('label_binary')], axis=1).head())

print("\nFinal label distribution:")
print(y.value_counts())


--- HDFS Data Preparation Complete! ---
Shape of feature matrix X: (575061, 29)
Shape of target vector y: (575061,)

Sample of final features (X):
   E1  E2   E3  E4  E5  E6  E7  E8  E9  E10  ...  E20  E21  E22  E23  E24  \
0   0   0  203   0  10   7   0   0   3    0  ...    0   10    1   10    0   
1   0   2    1   0   3   0   0   0   3    0  ...    0    3    1    3    0   
2   0   0  203   0   3   0   0   0   3    0  ...    1    3    1    3    0   
3   0   3    0   0   3   0   0   0   3    0  ...    0    3    1    3    0   
4   0   3    1  15   3   0   0   0   3    0  ...    0    3    1    3    0   

   E25  E26  E27  E28  E29  
0    4   10    0    0    0  
1    0    3    0    0    0  
2    0    3    0    0    0  
3    0    3    0    0    0  
4    0    3    0    0    0  

[5 rows x 29 columns]

Sample of final labels (y) and their original text:
     Label  Label
0  Success      0
1  Success      0
2     Fail      1
3  Success      0
4  Success      0

Final label distribution:
Labe

In [6]:
# --- SAVE THE PROCESSED DATA ---
# Combine the features and the binary target into one clean DataFrame; the
# target column is stored under the name 'label'.
final_hdfs_data = pd.concat([X, y.rename('label')], axis=1)

output_filename = 'D:/IoC-Free IDS using ML and NLP/data/processed/hdfs_v1_feature_matrix_labeled.parquet'
# Make sure the destination folder exists first; otherwise the write fails on
# a machine where the 'processed' directory has not been created yet.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
final_hdfs_data.to_parquet(output_filename)
print(f"\n✅ Success! Final HDFS feature matrix has been saved to '{output_filename}'.")


✅ Success! Final HDFS feature matrix has been saved to 'D:/IoC-Free IDS using ML and NLP/data/processed/hdfs_v1_feature_matrix_labeled.parquet'.
