In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [None]:
# Read the cleaned, labelled log records from the CSV export into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/content/cleaned_labeled_logs.csv")

# Report (rows, columns), then show the first five rows as the cell output.
dataset_dims = data.shape
print("Dataset shape:", dataset_dims)
data.head(n=5)


Dataset shape: (1994, 3)


Unnamed: 0,block_id,clean_log,label
0,blk_-1030832046197982436,info dfs.fsnamesystem: block* namesystem.de...,Normal
1,blk_-1046472716157313227,info dfs.fsdataset: deleting block blk_- fi...,Normal
2,blk_-1049340855430710153,info dfs.datanode$packetresponder: packetre...,Normal
3,blk_-1055254430948037872,info dfs.fsnamesystem: block* namesystem.ad...,Normal
4,blk_-1067234447809438340,info dfs.datanode$packetresponder: packetre...,Normal


In [None]:
# Class balance check: count how many rows carry each label value.
label_column = data["label"]
label_column.value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Normal,1926
Anomaly,68


In [None]:
# Bag-of-words encoder: keep at most 500 vocabulary terms and drop
# scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english', max_features=500)

# Target labels, taken straight from the DataFrame.
y = data["label"]

# Learn the vocabulary from the cleaned log text and encode every row as
# token counts.
X = vectorizer.fit_transform(data["clean_log"])


In [None]:
# Sanity check on the encoded features: (number of rows, vocabulary size).
feature_dims = X.shape
print("Feature matrix shape:", feature_dims)


Feature matrix shape: (1994, 51)


In [None]:
# Split the raw log text *before* vectorizing so the vocabulary is learned from
# the training rows only. Splitting the already-encoded full matrix would let
# held-out documents influence the feature space (data leakage).
text_train, text_test, y_train, y_test = train_test_split(
    data["clean_log"], y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y        # keep the label proportions the same in both partitions
)

# Re-fit the vectorizer on the training text, then encode the test text with
# that same vocabulary. `vectorizer` therefore ends up fitted on training data
# only, which is the state that is persisted by the save step.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Both matrices must live in the same feature space.
assert X_train.shape[1] == X_test.shape[1], "train/test feature dimensions differ"


In [None]:
import joblib
import os

# Make sure both output folders are present before anything is written.
for output_dir in ("data/processed", "models"):
    os.makedirs(output_dir, exist_ok=True)

# Each artifact paired with its destination file; dumped in the listed order.
artifacts = [
    (X_train, "data/processed/X_train.pkl"),
    (X_test, "data/processed/X_test.pkl"),
    (y_train, "data/processed/y_train.pkl"),
    (y_test, "data/processed/y_test.pkl"),
    (vectorizer, "models/vectorizer.pkl"),
]
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

print("✅ Feature engineering completed and saved.")

✅ Feature engineering completed and saved.
