In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import json

# Build model-ready train/validation feature tables from the cleaned data.
#
# Steps: load the cleaned CSV, derive extra features, split into stratified
# train/validation sets, standardise the numeric columns using statistics
# estimated on the training rows only, then write both splits and the
# feature-name list to data/processed/.

# Load cleaned
df = pd.read_csv('data/processed/clean.csv')

# Feature engineering
df['amount_log'] = np.log1p(df['Amount'])
# Treats Time as a number of seconds and folds it onto a 24-hour clock.
df['hour_of_day'] = (df['Time'] / 3600) % 24
# Computed with whole-dataset mean/std. This column is an affine function of
# Amount, and standardisation is invariant to affine rescaling, so the
# train-fitted scaler below removes any dependence on these global statistics.
df['amount_zscore'] = (df['Amount'] - df['Amount'].mean()) / df['Amount'].std()
# Global rolling mean (no user_id): trailing 100-row window over Amount with
# rows visited in Time order. The stable sort keeps rows that share a Time
# value in their original relative order, so the result is reproducible.
rolling_amount_mean = (
    df.sort_values('Time', kind='mergesort')['Amount']
    .rolling(window=100, min_periods=1)
    .mean()
)
# Assign the Series itself (not `.values`): pandas aligns on the row index, so
# each row receives its own rolling mean even when the file is not already
# sorted by Time. Assigning the raw array would write Time-ordered values
# onto file-ordered rows.
df['rolling_amount_mean'] = rolling_amount_mean

# Columns to standardise (Amount, Time, V1-V28, new feats)
scale_cols = ['Time', 'Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
              'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
              'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
              'amount_log', 'hour_of_day', 'amount_zscore', 'rolling_amount_mean']

# Train/valid split (stratify on Class). Done BEFORE scaling so that no
# validation-set statistics reach the training features.
X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames rather than slices of X.
X_train = X_train.copy()
X_val = X_val.copy()

# Fit the scaler on the training rows only, then apply the same fitted
# transform to the validation rows. `df` and `X` are left unscaled.
scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_val[scale_cols] = scaler.transform(X_val[scale_cols])

# Save (features plus the Class label, one file per split)
train_df = X_train.copy()
train_df['Class'] = y_train
val_df = X_val.copy()
val_df['Class'] = y_val
train_df.to_csv('data/processed/train_features.csv', index=False)
val_df.to_csv('data/processed/val_features.csv', index=False)

# Feature list (column order matches the saved CSVs, excluding Class)
feature_columns = X.columns.tolist()
with open('data/processed/feature_columns.json', 'w') as f:
    json.dump(feature_columns, f)