In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Feature-engineering step: read the cleaned transactions, encode the
# categorical 'type' column, derive two balance-based features, drop the
# account-identifier columns, standardize the monetary columns, and write the
# result to 'engineered_transactions.csv'.

# Load the cleaned dataset (path is resolved relative to the working directory).
df = pd.read_csv('cleaned_transactions.csv')

# 1. Handle categorical features
# LabelEncoder replaces each distinct 'type' value with an integer code.
# The encoder object is kept so the codes can be mapped back later with
# label_encoder.inverse_transform(...).
# NOTE: integer codes impose an arbitrary ordering on the categories; for
# models sensitive to that, one-hot encoding is the usual alternative:
#     df = pd.get_dummies(df, columns=['type'], drop_first=True)
label_encoder = LabelEncoder()
df['type'] = label_encoder.fit_transform(df['type'])

# 2. Create new features
# Transaction amount relative to the originating account's old balance.
# A plain division yields +/-inf (or NaN for 0/0) when the old balance is 0,
# and those values would be written to the CSV and break most downstream
# estimators. Rows with a zero balance therefore get a ratio of 0.0; every
# other row keeps exactly the value the plain division produces.
# NOTE(review): the sample output of this cell suggests the input columns may
# already be standardized, in which case this is a ratio of z-scores rather
# than of raw amounts -- confirm against the upstream cleaning step.
has_origin_balance = df['oldbalanceOrg'] != 0
df['amount_percentage'] = (df['amount'] / df['oldbalanceOrg']).where(
    has_origin_balance, 0.0)

# Balance difference: how much the originating balance changed.
df['balance_diff'] = df['oldbalanceOrg'] - df['newbalanceOrig']

# Further features (e.g. transaction frequency derived from 'step') could be
# added here.

# 3. Drop unnecessary columns
# 'nameOrig' and 'nameDest' are account identifiers and are not used as
# model inputs.
df = df.drop(columns=['nameOrig', 'nameDest'])

# 4. Scaling / normalization
# Standardize the monetary columns. This runs after step 2 so the derived
# features are computed from the values as loaded, not from the re-scaled ones.
# The fitted scaler is kept in 'scaler' for reuse on new data.
scaled_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# Save the transformed dataset (without the pandas index column).
df.to_csv('engineered_transactions.csv', index=False)

# Show the first few rows as a quick sanity check.
print(df.head())


   step  type    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     1     3 -0.281560      -0.229810       -0.237622       -0.323814   
1     1     3 -0.294767      -0.281359       -0.285812       -0.323814   
2     1     4 -0.297555      -0.288654       -0.292442       -0.323814   
3     1     1 -0.297555      -0.288654       -0.292442       -0.317582   
4     1     3 -0.278532      -0.274329       -0.282221       -0.323814   

   newbalanceDest  isFraud  isFlaggedFraud  amount_percentage  balance_diff  
0       -0.333411        0               0           1.225185      0.007812  
1       -0.333411        0               0           1.047654      0.004453  
2       -0.333411        1               0           1.030836      0.003788  
3       -0.333411        1               0           1.030836      0.003788  
4       -0.333411        0               0           1.015320      0.007892  
