In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
import hashlib

# Directories (both are relative to the notebook's working directory)
PROCESSED_DIR = Path("processed")
FEATURES_DIR = Path("features")
# Make sure the output directory exists before anything is written into it.
FEATURES_DIR.mkdir(exist_ok=True, parents=True)

# Input / Output files
INPUT_FILE = PROCESSED_DIR.joinpath("clean_bodmas.pkl")
OUTPUT_FILE_X = FEATURES_DIR.joinpath("X_features.pkl")
OUTPUT_FILE_Y = FEATURES_DIR.joinpath("y_labels.pkl")

# Load processed dataset
# NOTE(review): read_pickle unpickles the file, which can run arbitrary code;
# only point INPUT_FILE at a pickle produced by a trusted step.
df = pd.read_pickle(INPUT_FILE)
print(f"Loaded clean data. Shape: {df.shape}")




Loaded clean data. Shape: (57293, 2386)


In [8]:
# === Feature Engineering ===
def create_novel_features(df):
    """
    Return a copy of ``df`` extended with engineered behavioral features.

    The input frame is never modified; the new columns are computed on the
    side and attached to a fresh frame, so re-running the calling cell always
    starts from the same input.

    Columns added (each only when its source columns are present):

    * ``api_success_ratio`` -- ``feature_10 / (feature_10 + feature_11 + 1e-6)``.
    * ``behavior_hash`` -- SHA-256 hex digest of the raw bytes of the row's
      values in ``feature_20`` .. ``feature_39`` (whichever of those exist).
    * ``behavior_hash_cat`` -- integer category code of ``behavior_hash``.
      The codes are derived from the set of hashes present in *this* frame,
      so the same hash can receive a different code in a different frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``feature_*`` columns.
        NOTE(review): the hash is taken over the in-memory bytes of the
        values, so this presumably expects numeric behavior columns --
        confirm against the upstream cleaning step.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the added ones.
    """
    epsilon = 1e-6  # keeps the denominator non-zero when both counts are 0
    new_columns = {}

    # API success ratio example (feature_10 / (feature_10 + feature_11))
    if 'feature_10' in df.columns and 'feature_11' in df.columns:
        numerator = df['feature_10']
        new_columns['api_success_ratio'] = numerator / (numerator + df['feature_11'] + epsilon)

    # Behavior hash from features 20-39
    behavior_cols = [f'feature_{i}' for i in range(20, 40) if f'feature_{i}' in df.columns]
    if behavior_cols:
        # One 2-D array for all rows: hashing its rows yields the same bytes as a
        # row-wise DataFrame.apply, without building a Series per row.
        behavior_matrix = df[behavior_cols].to_numpy()
        behavior_hash = pd.Series(
            [hashlib.sha256(row.tobytes()).hexdigest() for row in behavior_matrix],
            index=df.index,
            dtype=object,
        )
        new_columns['behavior_hash'] = behavior_hash
        # Convert to categorical numeric code
        new_columns['behavior_hash_cat'] = behavior_hash.astype('category').cat.codes

    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    return df.assign(**new_columns)



In [9]:
# Run the feature-engineering step and keep working with its result
df = create_novel_features(df)
print("Feature engineering completed.")

# === Prepare Feature Matrix and Labels ===
# Every column whose name contains 'feature_' counts as an original feature.
original_features = list(filter(lambda name: 'feature_' in name, df.columns))
# Engineered columns are only selected if the engineering step produced them.
numeric_features = ['api_success_ratio'] if 'api_success_ratio' in df.columns else []
categorical_features = ['behavior_hash_cat'] if 'behavior_hash_cat' in df.columns else []

# Column order: original features, then numeric, then categorical engineered ones.
X = df[[*original_features, *numeric_features, *categorical_features]]



Feature engineering completed.


In [10]:
# Pull out the label column, failing loudly if it is missing
target_col = 'category'
if target_col in df.columns:
    y = df[target_col]
else:
    raise KeyError(f"Target column '{target_col}' not found in DataFrame.")

# Persist the feature matrix and the labels as pickles
X.to_pickle(OUTPUT_FILE_X)
y.to_pickle(OUTPUT_FILE_Y)

# Report where everything went, plus the final shapes
print(
    f"Feature matrix X saved: {OUTPUT_FILE_X}",
    f"Target labels y saved: {OUTPUT_FILE_Y}",
    f"X shape: {X.shape}, y shape: {y.shape}",
    sep="\n",
)

Feature matrix X saved: features\X_features.pkl
Target labels y saved: features\y_labels.pkl
X shape: (57293, 2383), y shape: (57293,)
