In [1]:
# 02_feature_engineering.ipynb
# Feature Engineering for PaySim Fraud Detection

In [2]:
# --- Setup ---
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Read the raw PaySim CSV into a DataFrame.
# The location is given relative to the notebook's working directory.
path = "../data/raw/paysim.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [4]:
# --- 1. Drop unnecessary columns ---
# `nameOrig` / `nameDest` are removed before any features are built.
# `df = df.drop(columns=...)` replaces the `axis=1, inplace=True` form:
# it names the axis explicitly and avoids the discouraged in-place idiom.
df = df.drop(columns=['nameOrig', 'nameDest'])

In [5]:
# --- 2. Encode 'type' ---
# Cast `type` to a categorical first, then expand it into one indicator
# column per category (one-hot encoding). get_dummies removes the original
# `type` column and names the new ones `type_<category>`.
df = pd.get_dummies(df.astype({'type': 'category'}), columns=['type'])

In [6]:
# --- 3. Log-transform amount ---
# log1p(x) = log(1 + x), so a zero amount maps to 0 instead of -inf.
df['log_amount'] = df['amount'].pipe(np.log1p)

In [9]:
# --- 4. Balance deltas ---
# Both deltas are computed as "balance before - balance after", so a
# positive value means the account balance went down.
df['deltaOrig'] = df['oldbalanceOrg'].sub(df['newbalanceOrig'])
df['deltaDest'] = df['oldbalanceDest'].sub(df['newbalanceDest'])


In [10]:
# --- 5. Balance error (from EDA) ---
# Signed gap between the balance implied by the transaction amount and the
# balance actually recorded afterwards:
#   origin:      old balance - amount - new balance
#   destination: old balance + amount - new balance
df['errorOrig'] = df['oldbalanceOrg'].sub(df['amount']).sub(df['newbalanceOrig'])
df['errorDest'] = df['oldbalanceDest'].add(df['amount']).sub(df['newbalanceDest'])
# Magnitude-only versions of the two errors.
df['abs_errorOrig'] = abs(df['errorOrig'])
df['abs_errorDest'] = abs(df['errorDest'])

In [12]:
# --- 6. Time-based features ---
# Split `step` into a position within a 24-step cycle (`hour`) and a count
# of completed 24-step cycles (`day`).
# NOTE(review): presumably one step is one hour and steps start at 1 (the
# sample rows show step 1) -- if so, step 24 becomes hour 0 of day 1 and
# day 0 only covers steps 1-23. Confirm that alignment is intended.
df['hour'] = df['step'].mod(24)
df['day'] = df['step'].floordiv(24)

In [13]:
# --- 7. High-value flag ---
# 1 when the transaction amount is strictly above 200,000, otherwise 0.
df['is_high_value'] = df['amount'].gt(200_000).astype(int)

In [17]:
# --- 8. Drop highly correlated features (from EDA) ---
# Removes the four raw balance columns, the raw `amount`, and the signed
# error columns; `log_amount`, the deltas and the absolute errors remain.
# Uses `df = df.drop(columns=...)` rather than `axis=1, inplace=True`.
redundant_columns = [
    'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest',
    'amount',
    'errorOrig', 'errorDest',
]
df = df.drop(columns=redundant_columns)

In [20]:
# --- 9. Final Check ---
# Print the resulting column index and the first rows (head() default) as
# plain text, each piece on its own line.
print(
    "Final feature columns:",
    df.columns,
    "\nSample:",
    df.head(),
    sep="\n",
)

Final feature columns:
Index(['step', 'isFraud', 'isFlaggedFraud', 'type_CASH_IN', 'type_CASH_OUT',
       'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER', 'log_amount',
       'deltaOrig', 'deltaDest', 'abs_errorOrig', 'abs_errorDest', 'hour',
       'day', 'is_high_value'],
      dtype='object')

Sample:
   step  isFraud  isFlaggedFraud  type_CASH_IN  type_CASH_OUT  type_DEBIT  \
0     1        0               0         False          False       False   
1     1        0               0         False          False       False   
2     1        1               0         False          False       False   
3     1        1               0         False           True       False   
4     1        0               0         False          False       False   

   type_PAYMENT  type_TRANSFER  log_amount  deltaOrig  deltaDest  \
0          True          False    9.194276    9839.64        0.0   
1          True          False    7.531166    1864.28        0.0   
2         False           T

In [21]:
# --- 10. Save preprocessed version ---
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (matters on a fresh checkout).
features_path = Path("../data/processed/paysim_features.csv")
features_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
df.to_csv(features_path, index=False)

## ✅ Feature Engineering Summary

- Dropped irrelevant identifiers (`nameOrig`, `nameDest`)
- Encoded `type` using one-hot encoding
- Engineered new features:
  - `log_amount`
  - `deltaOrig`, `deltaDest`
  - `abs_errorOrig`, `abs_errorDest`
  - `hour`, `day` (from step)
  - `is_high_value` (amount > 200k)
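- Removed highly correlated raw features (`oldbalanceOrg`, `newbalanceOrig`, `oldbalanceDest`, `newbalanceDest`, `amount`) and the intermediate signed errors (`errorOrig`, `errorDest`)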
- Saved the final dataset to: `data/processed/paysim_features.csv`
