In [19]:
# Notebook 4 (Option 2 - faster) FULL CODE
# Creates time-based + velocity features and saves ../data/processed/fraud_featured.csv

import pandas as pd
import numpy as np

# -------------------------
# 1) Load raw data
# -------------------------
fraud = pd.read_csv("../data/raw/Fraud_Data.csv")

# -------------------------
# 2) Parse timestamps + basic cleaning
# -------------------------
fraud["signup_time"] = pd.to_datetime(fraud["signup_time"], errors="coerce")
fraud["purchase_time"] = pd.to_datetime(fraud["purchase_time"], errors="coerce")

# Drop rows with broken timestamps (usually very few). Keeps code stable.
fraud = fraud.dropna(subset=["signup_time", "purchase_time"]).copy()

# Remove duplicates if any
fraud = fraud.drop_duplicates().copy()

# Ensure purchase_value is numeric (just in case)
fraud["purchase_value"] = pd.to_numeric(fraud["purchase_value"], errors="coerce")
fraud["purchase_value"] = fraud["purchase_value"].fillna(fraud["purchase_value"].median())

# Sort (required for correct time features)
fraud = fraud.sort_values(["user_id", "purchase_time"]).reset_index(drop=True)

# -------------------------
# 3) Time-based features
# -------------------------
fraud["hour_of_day"] = fraud["purchase_time"].dt.hour
fraud["day_of_week"] = fraud["purchase_time"].dt.dayofweek
fraud["time_since_signup"] = (fraud["purchase_time"] - fraud["signup_time"]).dt.total_seconds()

# -------------------------
# 4) Transaction velocity features (faster, version-safe)
# tx_count_1h  = number of tx per user in the last 1 hour (including current)
# tx_count_24h = number of tx per user in the last 24 hours (including current)
# -------------------------
fraud["tx_count_1h"] = 0
fraud["tx_count_24h"] = 0

# Loop per user (much faster than scanning whole dataset each time)
for user_id, group in fraud.groupby("user_id", sort=False):
    idx = group.index
    times = group["purchase_time"].values  # numpy datetime64 array

    counts_1h = []
    counts_24h = []

    for t in times:
        counts_1h.append(((times >= t - np.timedelta64(1, "h")) & (times <= t)).sum())
        counts_24h.append(((times >= t - np.timedelta64(24, "h")) & (times <= t)).sum())

    fraud.loc[idx, "tx_count_1h"] = counts_1h
    fraud.loc[idx, "tx_count_24h"] = counts_24h

# -------------------------
# 5) Time since previous transaction per user
# -------------------------
fraud["time_since_prev_tx"] = (
    fraud.groupby("user_id")["purchase_time"]
    .diff()
    .dt.total_seconds()
)

# Fill first transaction per user with median gap (or 0 if you prefer)
median_gap = fraud["time_since_prev_tx"].median()
fraud["time_since_prev_tx"] = fraud["time_since_prev_tx"].fillna(median_gap)

# -------------------------
# 6) (Optional) basic sanity checks
# -------------------------
print("Rows:", len(fraud))
print(fraud[["tx_count_1h", "tx_count_24h", "time_since_prev_tx"]].describe())

# -------------------------
# 7) Save processed dataset
# -------------------------
out_path = "../data/processed/fraud_featured.csv"
fraud.to_csv(out_path, index=False)
print("Saved to:", out_path)

# Preview
fraud[["user_id", "purchase_time", "tx_count_1h", "tx_count_24h", "time_since_prev_tx"]].head(10)


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Rows: 151112
       tx_count_1h  tx_count_24h  time_since_prev_tx
count     151112.0      151112.0                 0.0
mean           1.0           1.0                 NaN
std            0.0           0.0                 NaN
min            1.0           1.0                 NaN
25%            1.0           1.0                 NaN
50%            1.0           1.0                 NaN
75%            1.0           1.0                 NaN
max            1.0           1.0                 NaN
Saved to: ../data/processed/fraud_featured.csv


Unnamed: 0,user_id,purchase_time,tx_count_1h,tx_count_24h,time_since_prev_tx
0,2,2015-02-21 10:03:37,1,1,
1,4,2015-09-26 21:32:16,1,1,
2,8,2015-08-13 11:53:07,1,1,
3,9,2015-05-20 23:06:42,1,1,
4,12,2015-03-04 20:56:37,1,1,
5,16,2015-03-12 12:46:23,1,1,
6,18,2015-10-23 00:18:57,1,1,
7,26,2015-03-21 09:04:08,1,1,
8,33,2015-10-28 18:12:41,1,1,
9,39,2015-01-08 18:13:26,1,1,
