# Step 3: Feature Engineering

In this step, we will create meaningful features from the cleaned datasets to improve fraud detection.
We will add time-based features, transaction frequency features, and integrate IP geolocation.


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the three input tables in one pass (read order is unchanged):
#   fraud_df  - cleaned transaction data used throughout this notebook
#   ip_df     - IP-range-to-country lookup (the raw file is fine for IP mapping)
#   credit_df - cleaned credit-card data (not referenced again in this section)
fraud_df, ip_df, credit_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/Fraud_Data_cleaned.csv",
        "../data/raw/IpAddress_to_Country.csv",
        "../data/processed/creditcard_cleaned.csv",
    )
)

In [4]:
# Parse both timestamp columns as datetimes. With errors='coerce', values that
# cannot be parsed become NaT instead of raising.
for ts_col in ('signup_time', 'purchase_time'):
    fraud_df[ts_col] = pd.to_datetime(fraud_df[ts_col], errors='coerce')

In [5]:
# Derive the time-based features in a single step:
#   time_since_signup - seconds elapsed between signup and purchase
#   hour_of_day       - hour component of the purchase timestamp
#   day_of_week       - weekday index of the purchase (pandas: Monday=0 ... Sunday=6)
fraud_df = fraud_df.assign(
    time_since_signup=fraud_df['purchase_time'].sub(fraud_df['signup_time']).dt.total_seconds(),
    hour_of_day=fraud_df['purchase_time'].dt.hour,
    day_of_week=fraud_df['purchase_time'].dt.weekday,
)

In [6]:
# Transaction frequency features.
#
# 'size' counts rows per group, whereas 'count' only counts non-null values of
# the selected column. Because `purchase_time` is parsed with errors='coerce',
# a transaction with an unparseable timestamp is NaT and would be silently left
# out of a 'count'-based total; 'size' counts every transaction regardless.

# Count transactions per user
fraud_df['user_tx_count'] = fraud_df.groupby('user_id')['user_id'].transform('size')

# Count transactions per device
fraud_df['device_tx_count'] = fraud_df.groupby('device_id')['device_id'].transform('size')


In [8]:
# IP geolocation: attach the IP-range columns (including `country`) to each
# transaction via a range lookup against the IP-to-country table.

# Load IP-to-country mapping
ip_df = pd.read_csv("../data/raw/IpAddress_to_Country.csv")

# Make the cell safe to re-run: if a previous run already merged the lookup
# columns into fraud_df, merging again would make pandas suffix the duplicates
# (`_x` / `_y`) and the range filter below would fail with a KeyError.
fraud_df = fraud_df.drop(columns=ip_df.columns, errors='ignore')

# Convert IP address to integer.
# Use an explicit 64-bit dtype: plain `int` is platform-dependent and may be
# 32-bit, while IPv4 addresses as integers go up to 2**32 - 1.
fraud_df['ip_address_int'] = fraud_df['ip_address'].astype(float).astype('int64')

# Ensure the range bounds share the exact dtype of the lookup key
# (merge_asof requires both merge keys to have the same type).
ip_df['lower_bound_ip_address'] = ip_df['lower_bound_ip_address'].astype('int64')
ip_df['upper_bound_ip_address'] = ip_df['upper_bound_ip_address'].astype('int64')

# Sort for merge_asof (both sides must be sorted on their key)
fraud_df = fraud_df.sort_values('ip_address_int')
ip_df = ip_df.sort_values('lower_bound_ip_address')

# Range-based merge: direction='backward' picks, for every transaction, the
# range with the largest lower bound that is <= the transaction's IP.
fraud_df = pd.merge_asof(
    fraud_df,
    ip_df,
    left_on='ip_address_int',
    right_on='lower_bound_ip_address',
    direction='backward'
)

# Keep only valid IP ranges: the backward match only guarantees
# lower_bound <= ip, so the upper bound has to be checked explicitly.
# NOTE: transactions with no matching range (NaN bounds) are dropped here.
fraud_df = fraud_df[
    fraud_df['ip_address_int'] <= fraud_df['upper_bound_ip_address']
]


In [None]:
# The range-based IP-to-country merge is performed once, in the previous cell.
# It is intentionally NOT repeated here: merging `ip_df` into `fraud_df` a
# second time collides with the geo columns that are already present (pandas
# suffixes them `_x` / `_y`), so a filter on `lower_bound_ip_address` /
# `upper_bound_ip_address` would raise a KeyError on a fresh Run All.

# Sanity check: every remaining transaction lies inside its matched IP range.
assert (
    (fraud_df['ip_address_int'] >= fraud_df['lower_bound_ip_address']) &
    (fraud_df['ip_address_int'] <= fraud_df['upper_bound_ip_address'])
).all(), "found transactions outside their matched IP range"

# Number of transactions per country after the merge
fraud_df['country'].value_counts()


country
United States                     58049
China                             12038
Japan                              7306
United Kingdom                     4490
Korea Republic of                  4162
                                  ...  
Dominica                              1
Gambia                                1
Vanuatu                               1
British Indian Ocean Territory        1
Nauru                                 1
Name: count, Length: 181, dtype: int64

In [9]:
# Mean of `class` per country (presumably the 0/1 fraud label, so this is the
# fraud rate -- confirm against the data dictionary), highest first.
# The mean is not weighted by volume, so countries with only a handful of
# transactions can show extreme rates.
country_fraud = fraud_df.groupby('country')['class'].mean()
country_fraud = country_fraud.sort_values(ascending=False)

country_fraud.head(10)

country
Turkmenistan             1.000000
Namibia                  0.434783
Sri Lanka                0.419355
Luxembourg               0.388889
Virgin Islands (U.S.)    0.333333
Ecuador                  0.264151
Tunisia                  0.262712
Peru                     0.260504
Bolivia                  0.245283
Kuwait                   0.233333
Name: class, dtype: float64

In [None]:
# One-hot encode the categorical columns. drop_first=True removes the first
# level of each column, leaving k-1 indicator columns for k categories.
categorical_cols = ['source', 'browser', 'sex', 'country']
fraud_df = pd.get_dummies(data=fraud_df, columns=categorical_cols, drop_first=True)


### Feature Engineering Summary

- Added `time_since_signup`, `hour_of_day`, `day_of_week`.
- Added user-level and device-level transaction counts.
- Merged IP geolocation to add the `country` feature; transactions whose IP address does not fall inside any known IP range are dropped.
- Converted categorical variables into one-hot encoded columns for modeling.
- The dataset is now ready for the train-test split and for resampling to address class imbalance.
