In [3]:
# -----------------------------
# 00_create_synthetic_data.ipynb
# -----------------------------

import pandas as pd
import numpy as np
import os

# -----------------------------
# 1️⃣ SETUP
# -----------------------------
# Seed NumPy's global RNG so every run draws the same synthetic values.
np.random.seed(42)

# Dataset size: number of transaction rows and upper bound of the customer-id range.
n_transactions, n_customers = 1000, 50

# Resolve the project root as the parent of the current working directory
# (assumes the notebook is run from notebooks/), then make sure
# <project_root>/data/raw exists before anything is written there.
parent_of_cwd = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(parent_of_cwd)
raw_data_dir = os.path.join(project_root, "data", "raw")
os.makedirs(raw_data_dir, exist_ok=True)

# -----------------------------
# 2️⃣ GENERATE & DOWNCAST DATA
# -----------------------------
# Draw every column up front. The np.random calls stay in the same order
# (customer ids, amounts, merchant category, device, location, timestamps)
# so the seeded global RNG produces the same values.
transaction_ids = np.arange(1, n_transactions + 1)
customer_ids = np.random.randint(1, n_customers + 1, size=n_transactions)
amounts = np.round(np.random.exponential(scale=50, size=n_transactions), 2)
merchant_categories = np.random.choice(
    ["grocery", "electronics", "travel", "clothing", "restaurants"],
    size=n_transactions
)
devices = np.random.choice(["mobile", "desktop", "tablet"], size=n_transactions)
locations = np.random.choice(["NY", "CA", "TX", "FL", "IL"], size=n_transactions)

# Assemble the frame: numeric columns are downcast to the smallest dtype that
# holds their values, and the string columns are stored as pandas categoricals.
df = pd.DataFrame({
    "transaction_id": pd.to_numeric(transaction_ids, downcast='integer'),
    "customer_id": pd.to_numeric(customer_ids, downcast='integer'),
    "amount": pd.to_numeric(amounts, downcast='float'),
    "merchant_category": pd.Series(merchant_categories).astype('category'),
    "device": pd.Series(devices).astype('category'),
    "location": pd.Series(locations).astype('category')
})

# Transaction timestamps: random whole seconds between the two bounds,
# expressed as Unix epoch seconds (Timestamp.value is in nanoseconds).
# NOTE(review): end_ts is midnight at the start of 2025-12-31 and randint's
# upper bound is exclusive, so no timestamp falls on Dec 31 itself.
start_ts = pd.Timestamp("2025-01-01").value // 10**9
end_ts = pd.Timestamp("2025-12-31").value // 10**9
epoch_seconds = np.random.randint(start_ts, end_ts, size=n_transactions)
df['transaction_time'] = pd.to_datetime(epoch_seconds, unit='s')

# -----------------------------
# 3️⃣ FEATURE ENGINEERING
# -----------------------------
# Order rows chronologically within each customer; the per-customer diff
# below is taken between consecutive rows, so it depends on this ordering.
df = df.sort_values(by=['customer_id', 'transaction_time'])

# Seconds elapsed since the same customer's previous transaction.
# A customer's first row has no predecessor: its missing gap is replaced by 0,
# so 0 can mean either "no earlier transaction" or a true zero-second gap.
gap_since_previous = df.groupby('customer_id')['transaction_time'].diff()
df['time_since_last_txn'] = (
    gap_since_previous.dt.total_seconds().fillna(0).astype('float32')
)

# Normalized amount per customer: each amount divided by that customer's
# mean amount, so 1.0 corresponds to the customer's average transaction.
mean_amount_per_customer = df.groupby('customer_id')['amount'].transform('mean')
df['amount_norm'] = df['amount'] / mean_amount_per_customer

# -----------------------------
# 4️⃣ CREATE FRAUD LABELS
# -----------------------------
# Start with every transaction labelled as legitimate.
df['is_fraud'] = 0

# Patterned fraud: high normalized amount + short time since last txn.
# time_since_last_txn is 0 for each customer's first transaction (the missing
# gap was filled with 0), so those rows would trivially pass the "< 5 seconds"
# test even though they have no preceding transaction. Exclude them explicitly.
# df is sorted by (customer_id, transaction_time), so duplicated() is False
# exactly on each customer's earliest row and True on every later one.
# NOTE(review): timestamps are drawn uniformly over most of a year, so gaps
# under 5 seconds should be very rare; confirm the threshold (in seconds) is
# what was intended.
has_prior_txn = df['customer_id'].duplicated()
patterned_fraud = (
    has_prior_txn
    & (df['amount_norm'] > 2)
    & (df['time_since_last_txn'] < 5)
)
df.loc[patterned_fraud, 'is_fraud'] = 1

# Travel transactions flagged as fraud (every row in the 'travel' category).
df.loc[df['merchant_category'] == 'travel', 'is_fraud'] = 1

# Add 1% random fraud: a reproducible 1% sample of rows is flagged regardless
# of the rules above (rows already flagged simply stay at 1).
random_idx = df.sample(frac=0.01, random_state=42).index
df.loc[random_idx, 'is_fraud'] = 1

# -----------------------------
# 5️⃣ ONE-HOT ENCODING
# -----------------------------
# Replace the three categorical columns with indicator columns.
# drop_first=True omits the first category of each column, which then acts
# as the implicit baseline (all of that column's indicators are False).
one_hot_columns = ['merchant_category', 'device', 'location']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

# -----------------------------
# 6️⃣ SAVE DATA
# -----------------------------
# Write the frame to <raw_data_dir>/engineered_transactions.parquet with the
# fastparquet engine; the DataFrame index is not stored in the file.
parquet_name = "engineered_transactions.parquet"
output_path = os.path.join(raw_data_dir, parquet_name)
df.to_parquet(output_path, index=False, engine='fastparquet')

# Report where the file went and the in-memory footprint of the frame
# (deep=True also counts the contents of object-backed columns).
print(f"Synthetic dataset saved successfully in {output_path}!")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Last expression of the cell: show the first rows as the cell output.
df.head()


Synthetic dataset saved successfully in /home/andys/Work/JobSearch/Portfolio/credit-fraud-ml-system/data/raw/engineered_transactions.parquet!
Memory Usage: 0.08 MB


Unnamed: 0,transaction_id,customer_id,amount,transaction_time,time_since_last_txn,amount_norm,is_fraud,merchant_category_electronics,merchant_category_grocery,merchant_category_restaurants,merchant_category_travel,device_mobile,device_tablet,location_FL,location_IL,location_NY,location_TX
665,666,1,28.43,2025-01-07 15:24:41,0.0,0.492564,1,False,False,False,True,False,True,False,False,False,True
185,186,1,10.2,2025-01-09 02:31:10,126389.0,0.17672,1,False,False,False,True,True,False,False,True,False,False
907,908,1,27.58,2025-02-03 17:14:53,2213023.0,0.477838,1,False,False,False,True,False,True,False,True,False,False
280,281,1,34.400002,2025-03-06 04:41:50,2633217.0,0.595998,0,False,False,False,False,True,False,False,True,False,False
815,816,1,105.139999,2025-03-14 16:48:48,734818.0,1.821605,0,False,False,True,False,False,False,False,True,False,False
