In [1]:
# feature_engineering.py
import pandas as pd
import numpy as np
from scipy.stats import entropy

INPUT = "synthetic_auth_logs_stress.csv"
OUT_FEATURES = "features_stress.csv"

def resource_entropy(series):
    """Return the Shannon entropy of the value distribution in *series*.

    The relative frequency of each distinct value (from
    ``value_counts(normalize=True)``) is handed to ``scipy.stats.entropy``
    with its default settings.  When at most one distinct value is counted,
    ``0.0`` is returned directly without calling ``entropy``.
    """
    frequencies = series.value_counts(normalize=True)
    if len(frequencies) > 1:
        return float(entropy(frequencies))
    # Zero or one distinct value: the distribution carries no uncertainty.
    return 0.0

def device_new_rate(group):
    """Return the number of distinct devices per event in *group*.

    Computed as the count of unique values in the ``device`` column divided
    by the number of rows; used as a proxy for device churn.  The divisor is
    clamped to at least 1, so an empty group yields ``0.0`` instead of
    raising a division error.
    """
    distinct_devices = group["device"].nunique()
    event_count = max(1, len(group))
    return distinct_devices / event_count

def failed_login_rate(group):
    """Return one minus the mean of the ``success`` column of *group*.

    NOTE(review): this is a failure fraction only if ``success`` is coded
    as 0/1 (or boolean) -- confirm against the input data.
    """
    success_rate = group["success"].mean()
    return 1.0 - success_rate

def recent_failed_burst(group, last_n=5):
    """Return one minus the mean ``success`` over the latest events in *group*.

    Simple proxy for a burst of failed logins: the rows are ordered by the
    ``timestamp`` column and only the final ``last_n`` rows are considered.

    Args:
        group: DataFrame with ``timestamp`` and ``success`` columns.
        last_n: Size of the trailing window (default 5, the value that was
            previously hard-coded).  If the group has fewer rows, all of
            them are used.

    Returns:
        ``1.0 - mean(success)`` over the window.  The result is NaN when the
        group has no rows, because the mean of an empty column is NaN.

    Raises:
        ValueError: If ``last_n`` is smaller than 1.
    """
    if last_n < 1:
        raise ValueError("last_n must be a positive integer")
    # A stable sort keeps rows with identical timestamps in their input
    # order, so which rows land in the window is reproducible across runs.
    most_recent = group.sort_values("timestamp", kind="mergesort").tail(last_n)
    return 1.0 - most_recent["success"].mean()

# Load the raw authentication log and derive the hour-of-day of each event.
df = pd.read_csv(INPUT)
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["hour"] = df["timestamp"].dt.hour

grouped = df.groupby("user")

# One row per user.  For the helper-based features, only the columns each
# helper actually reads are selected before ``apply``: calling ``apply`` on
# the full grouped frame also hands the grouping column to the helper, which
# newer pandas releases flag as deprecated behaviour.
features = pd.DataFrame({
    "avg_login_hour": grouped["hour"].mean(),
    # std() is NaN for users with a single event; treat that as no spread.
    "std_login_hour": grouped["hour"].std().fillna(0.0),
    "unique_countries": grouped["country"].nunique(),
    "unique_devices": grouped["device"].nunique(),
    "device_new_rate": grouped[["device"]].apply(device_new_rate),
    "failed_login_rate": grouped[["success"]].apply(failed_login_rate),
    "recent_failed_burst": grouped[["timestamp", "success"]].apply(recent_failed_burst),
    "resource_entropy": grouped["resource"].apply(resource_entropy),
    "total_events": grouped.size(),
})

# attach label (for evaluation only): the per-user maximum of is_attack,
# with a missing maximum treated as 0.
labels = grouped["is_attack"].max()
features["is_attack"] = labels.fillna(0).astype(int)

features.to_csv(OUT_FEATURES)
print("Wrote features to", OUT_FEATURES)
print(features.head())


  "device_new_rate": grouped.apply(device_new_rate),
  "failed_login_rate": grouped.apply(failed_login_rate),
  "recent_failed_burst": grouped.apply(recent_failed_burst),


Wrote features to features_stress.csv
                     avg_login_hour  std_login_hour  unique_countries  \
user                                                                    
user100@example.com        6.147929        0.737199                 6   
user101@example.com        5.219512        1.498665                 8   
user102@example.com        5.105882        0.792323                 5   
user103@example.com        5.244318        1.224732                 6   
user104@example.com        8.220779        2.151901                 6   

                     unique_devices  device_new_rate  failed_login_rate  \
user                                                                      
user100@example.com               7         0.041420           0.029586   
user101@example.com               4         0.024390           0.012195   
user102@example.com               4         0.023529           0.023529   
user103@example.com               7         0.039773           0.005682   
