# Feature Engineering: Fraud_Data

Focus: Transaction velocity and time-based features for the e-commerce dataset.

In [6]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta

RAW_PATH = '../data/raw/'
PROCESSED_PATH = '../data/processed/'
os.makedirs(PROCESSED_PATH, exist_ok=True)

## 1. Load cleaned Fraud_Data

In [7]:
fraud_path = os.path.join(PROCESSED_PATH, 'fraud_processed.csv')
if not os.path.exists(fraud_path):
    raise FileNotFoundError(f'Missing file: {fraud_path}. Run the EDA notebook to generate it.')

fraud_df = pd.read_csv(fraud_path, parse_dates=['signup_time', 'purchase_time'])
print('Loaded fraud_processed:', fraud_df.shape)
fraud_df.head()

Loaded fraud_processed: (151112, 18)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,lower_bound_ip_address,upper_bound_ip_address,country,time_since_signup,hour_of_day,day_of_week,transactions_per_device
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,46,ZCLZTAJPCRAQX,Direct,Safari,M,36,52093,0,,,Unknown,489.726111,10,6,1
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,33,YFGYOALADBHLT,Ads,IE,F,30,93447,0,,,Unknown,301.339722,17,4,1
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,33,QZNVQTUITFTHH,Direct,FireFox,F,32,105818,0,,,Unknown,208.144444,8,1,1
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,33,PIBUQMBIELMMG,Ads,IE,M,40,117566,0,,,Unknown,2065.176111,21,3,1
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,55,WFIIFCPIOGMHT,Ads,Safari,M,38,131423,0,,,Unknown,391.005278,7,6,1


## 2. Time-based features
- hour_of_day
- day_of_week
- time_since_signup (hours)

In [8]:
fraud_df['hour_of_day'] = fraud_df['purchase_time'].dt.hour
fraud_df['day_of_week'] = fraud_df['purchase_time'].dt.dayofweek
fraud_df['time_since_signup'] = (fraud_df['purchase_time'] - fraud_df['signup_time']).dt.total_seconds() / 3600
fraud_df[['hour_of_day', 'day_of_week', 'time_since_signup']].head()

Unnamed: 0,hour_of_day,day_of_week,time_since_signup
0,10,6,489.726111
1,17,4,301.339722
2,8,1,208.144444
3,21,3,2065.176111
4,7,6,391.005278


## 3. Transaction velocity per user
- tx_count_24h: number of transactions by user in the past 24 hours
- tx_count_1h: number of transactions by user in the past 1 hour

In [None]:
fraud_df = fraud_df.sort_values(['user_id', 'purchase_time'])

def count_prev_within(series, window_hours):
    # series is datetime, sorted; returns count of prior events within window_hours
    arr = series.view('int64')  # ns since epoch
    window_ns = int(window_hours * 3600 * 1e9)
    left_idx = np.searchsorted(arr, arr - window_ns, side='left')
    # number of previous events within window = current index - left_idx
    return np.arange(len(arr)) - left_idx

fraud_df['tx_count_24h'] = fraud_df.groupby('user_id')['purchase_time'].transform(
    lambda s: pd.Series(count_prev_within(s.reset_index(drop=True), 24), index=s.index)
)
fraud_df['tx_count_1h'] = fraud_df.groupby('user_id')['purchase_time'].transform(
    lambda s: pd.Series(count_prev_within(s.reset_index(drop=True), 1), index=s.index)
)

fraud_df[['user_id', 'purchase_time', 'tx_count_1h', 'tx_count_24h']].head()

  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = series.view('int64')  # ns since epoch
  arr = serie

## 4. Save engineered dataset

In [5]:
out_path = os.path.join(PROCESSED_PATH, 'fraud_features.csv')
fraud_df.to_csv(out_path, index=False)
print(f'Saved features to {out_path}')

Saved features to ../data/processed/fraud_features.csv
