In [1]:
# notebooks/02_feature_engineering.ipynb

# -------------------------------
# 1. Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by
# the libraries used further down — consider narrowing it.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set_style("whitegrid")

# -------------------------------
# 2. Load Dataset
# -------------------------------
# Path is relative to the notebook's working directory.
DATA_PATH = "../data/fraud_dataset_mod.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first 5 rows. A bare `df.head()` is only rendered when it is the
# last expression of a cell; more statements follow here, so the preview has to
# be requested explicitly. `display` is provided by the IPython kernel.
display(df.head())

# -------------------------------
# 3. Check Missing Values
# -------------------------------
# Per-column count of missing cells in the raw frame, reported before any imputation
missing_before = df.isna().sum()
print("Missing values before preprocessing:\n", missing_before)

# -------------------------------
# 4. Handle Missing Values
# -------------------------------
# Numeric (float64) columns: impute with the column median.
# The result is assigned back rather than calling `fillna(..., inplace=True)`
# on `df[col]`: that mutates an intermediate Series (chained assignment), which
# pandas deprecates and which no longer updates `df` under Copy-on-Write.
# NOTE(review): `Fraud_Label` appears to be among these columns (it is removed
# from `numeric_cols` before scaling), so the target is median-imputed too —
# confirm that is intended rather than dropping unlabeled rows.
numeric_cols = df.select_dtypes(include=['float64']).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical (object) columns: impute with the most frequent value.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    # An all-missing column has no mode; leave it untouched instead of raising.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

print("\nMissing values after preprocessing:\n", df.isnull().sum())

# -------------------------------
# 5. Feature Engineering
# -------------------------------
# 5.1 Derive calendar features from the transaction timestamp
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# New column name -> datetime accessor attribute it is read from
timestamp_parts = {
    'Transaction_Hour': 'hour',
    'Transaction_Day': 'day',
    'Transaction_Month': 'month',
    'Transaction_Weekday': 'weekday',
}
for feature_name, dt_attr in timestamp_parts.items():
    df[feature_name] = getattr(df['Timestamp'].dt, dt_attr)

# The raw Timestamp column is removed once its parts have been extracted
df.drop(columns=['Timestamp'], inplace=True)

# 5.2 Re-identify categorical columns AFTER dropping Timestamp
categorical_cols = df.select_dtypes(include=['object']).columns

# Encode categorical features as integer codes, one LabelEncoder per column.
# Refitting a single shared encoder keeps only the last column's mapping, so
# the earlier columns could never be inverse-transformed or applied to new
# data; every fitted encoder is therefore kept in `label_encoders`.
# `le` still ends up bound to the encoder of the last column processed.
# NOTE(review): every object-typed column is encoded, including any
# identifier-like columns (e.g. Transaction_ID / User_ID if they are strings) —
# confirm those should be features at all.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


# -------------------------------
# 6. Standardize Numeric Features
# -------------------------------
# Scales the float64 columns captured in `numeric_cols` during step 4. That
# list predates step 5, so the derived Transaction_* columns and the encoded
# categoricals are not scaled.
# NOTE(review): the scaler is fit on the full frame; if a train/test split
# happens later, fit on the training portion only to avoid leakage.
scaler = StandardScaler()
# `errors='ignore'` keeps this working when `Fraud_Label` is not part of
# `numeric_cols` (e.g. it was read as an integer column); a plain `drop`
# raises KeyError in that case.
num_cols_for_scaling = numeric_cols.drop('Fraud_Label', errors='ignore')  # Do not scale target column
# StandardScaler rejects a zero-column input, so skip when nothing is left.
if len(num_cols_for_scaling) > 0:
    df[num_cols_for_scaling] = scaler.fit_transform(df[num_cols_for_scaling])

# -------------------------------
# 7. Save Processed Dataset
# -------------------------------
# Persist the engineered frame as CSV; the row index is not written out
PROCESSED_PATH = "../data/fraud_dataset_processed.csv"
df.to_csv(path_or_buf=PROCESSED_PATH, index=False)
print(f"Processed dataset saved: {PROCESSED_PATH}")


Missing values before preprocessing:
 Transaction_ID                  1104
User_ID                          948
Transaction_Amount               982
Transaction_Type                 982
Timestamp                        992
Account_Balance                 1016
Device_Type                     1023
Location                        1000
Merchant_Category                975
IP_Address_Flag                  924
Previous_Fraudulent_Activity    1031
Daily_Transaction_Count         1029
Avg_Transaction_Amount_7d        996
Failed_Transaction_Count_7d      983
Card_Type                       1012
Card_Age                         960
Transaction_Distance            1025
Authentication_Method           1025
Risk_Score                       982
Is_Weekend                       980
Fraud_Label                     1031
dtype: int64

Missing values after preprocessing:
 Transaction_ID                  0
User_ID                         0
Transaction_Amount              0
Transaction_Type                