# Feature Engineering – Fraud_Data.csv

This notebook prepares the e-commerce fraud dataset for modeling by
creating time-based, geolocation, and behavioral features, followed by
data transformation.


In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler


In [2]:
# Read the two raw inputs: the transaction log and the IP-range -> country lookup
fraud_df = pd.read_csv("../data/raw/Fraud_Data.csv")
ip_df = pd.read_csv("../data/raw/IpAddress_to_Country.csv")

# Parse both timestamp columns so that datetime arithmetic works further down
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])

# Preview the first rows
fraud_df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [3]:
# Cast both IP-range bounds to 64-bit integers.
# The width is spelled out on purpose: plain `int` resolves to a 32-bit
# integer on some platforms (e.g. Windows with NumPy < 2), which cannot hold
# IP values above 2**31 - 1 and would corrupt the range lookup.
ip_df['lower_bound_ip_address'] = ip_df['lower_bound_ip_address'].astype('int64')
ip_df['upper_bound_ip_address'] = ip_df['upper_bound_ip_address'].astype('int64')


In [5]:
# Convert the transaction IP address (loaded as a float, e.g. 732758400.0)
# to an integer key; any fractional part is truncated by the cast.
# int64 is requested explicitly so the result does not depend on the
# platform's default integer width: values such as 3840542000.0 do not fit
# in a 32-bit integer.
fraud_df['ip_int'] = fraud_df['ip_address'].astype('int64')


In [6]:
# pd.merge_asof needs each frame ordered by its own join key
fraud_df = fraud_df.sort_values(by='ip_int')
ip_df = ip_df.sort_values(by='lower_bound_ip_address')


In [7]:
# Attach a candidate IP range (and its country) to every transaction.
# direction='backward' selects the last range whose lower bound is <= ip_int;
# transactions whose IP lies below every lower bound get NaN in the ip_df columns.
fraud_geo_df = pd.merge_asof(
    fraud_df,
    ip_df,
    left_on='ip_int',
    right_on='lower_bound_ip_address',
    direction='backward'
)

# The candidate is a real match only if the IP is also <= the range's upper
# bound. Rather than dropping the non-matching transactions (which silently
# shrinks the dataset and leaves nothing for a later "unknown country" fill
# to act on), keep every row and blank out the country where the IP falls
# outside the candidate range. Comparisons against a NaN bound are False, so
# rows with no candidate range are treated as non-matches as well.
ip_in_range = fraud_geo_df['ip_int'] <= fraud_geo_df['upper_bound_ip_address']
fraud_geo_df.loc[~ip_in_range, 'country'] = np.nan

# The bounds were only needed for the range check; removing them here keeps
# lookup-table values (and their NaNs for unmatched rows) out of later steps.
fraud_geo_df = fraud_geo_df.drop(
    columns=['lower_bound_ip_address', 'upper_bound_ip_address']
)


In [8]:
# Make sure both timestamp columns are datetime-typed on the merged frame
for ts_col in ('signup_time', 'purchase_time'):
    fraud_geo_df[ts_col] = pd.to_datetime(fraud_geo_df[ts_col])


In [9]:
# Time-based features derived from the purchase timestamp
purchase_ts = fraud_geo_df['purchase_time']

# Seconds elapsed between account signup and the purchase
signup_to_purchase = purchase_ts - fraud_geo_df['signup_time']
fraud_geo_df['time_since_signup'] = signup_to_purchase.dt.total_seconds()

# Hour of the day in which the purchase happened
fraud_geo_df['hour_of_day'] = purchase_ts.dt.hour

# Day of the week of the purchase (pandas convention: Monday == 0)
fraud_geo_df['day_of_week'] = purchase_ts.dt.dayofweek


In [10]:
# Order rows chronologically within each user so that diff() compares
# consecutive purchases of the same user
fraud_geo_df = fraud_geo_df.sort_values(by=['user_id', 'purchase_time'])

# One grouped view of the purchase timestamps, shared by the features below
purchases_by_user = fraud_geo_df.groupby('user_id')['purchase_time']
txn_counts = purchases_by_user.transform('count')
seconds_since_prev = purchases_by_user.diff().dt.total_seconds()

# Number of transactions recorded for the row's user
fraud_geo_df['user_transaction_count'] = txn_counts

# Seconds since the same user's previous transaction
# (NaN for a user's first transaction)
fraud_geo_df['time_since_last_txn'] = seconds_since_prev

# 1 when the previous transaction was at most an hour earlier, else 0;
# a NaN gap compares as False and therefore yields 0
within_hour = fraud_geo_df['time_since_last_txn'] <= 3600
fraud_geo_df['txn_within_1hr'] = within_hour.astype(int)


In [11]:
# Replace missing values with explicit sentinels:
#   time_since_last_txn -> -1        (no earlier transaction for that user)
#   country             -> 'Unknown'
missing_fill = {'time_since_last_txn': -1, 'country': 'Unknown'}
for col_name, fill_value in missing_fill.items():
    fraud_geo_df[col_name] = fraud_geo_df[col_name].fillna(fill_value)


In [12]:
# Separate target
y = fraud_geo_df['class']

# Columns that must not reach the model:
#   - the target itself
#   - raw timestamps and raw IP values (already turned into derived features)
#   - identifiers (user_id, device_id): they label rows rather than describe
#     behaviour, and scaling an ID as if it were a quantity is meaningless
#   - the IP-range bounds, which are lookup helpers from the country mapping
non_feature_cols = [
    'class',
    'user_id',
    'signup_time',
    'purchase_time',
    'ip_address',
    'ip_int',
    'device_id',
    'lower_bound_ip_address',
    'upper_bound_ip_address',
]

# errors='ignore' tolerates helper columns that are no longer present at this
# point; a missing target would already have raised on the `y = ...` line.
X = fraud_geo_df.drop(columns=non_feature_cols, errors='ignore')


In [13]:
# One-hot encode the remaining categorical columns; drop_first removes the
# first level of each so the dummies are not perfectly collinear
X = pd.get_dummies(data=X, drop_first=True)


In [14]:
# Standardise every numeric feature.
# Selecting by the generic 'number' kind (instead of only int64/float64) also
# picks up 32-bit integer columns, e.g. the .dt.hour / .dt.dayofweek results,
# which pandas >= 2.0 returns as int32 and which would otherwise stay unscaled.
# One-hot dummies are left untouched: they are bool on current pandas (not
# matched by 'number') and uint8 on older releases (excluded explicitly).
# NOTE(review): the scaler is fitted on the full dataset; if a train/test
# split follows, consider fitting on the training portion only.
num_cols = X.select_dtypes(include='number', exclude='uint8').columns

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])


In [15]:
# Re-attach the target to the feature matrix and persist the result.
# pd.concat(axis=1) aligns on index labels, and X still carries the labels
# left over from the earlier filtering/sorting steps. Resetting only y would
# pair labels with the wrong rows and pad the frame with NaN rows, so both
# sides are reset and matched purely by position (they share the same row order).
processed_df = pd.concat(
    [X.reset_index(drop=True), y.reset_index(drop=True)],
    axis=1
)
processed_df.to_csv("../data/processed/fraud_processed.csv", index=False)
