In [1]:
import pandas as pd
import numpy as np
import os

# -----------------------------
# 1️⃣ GENERATE & DOWNCAST (Immediate RAM Savings)
# -----------------------------
# Seed NumPy's global RNG so every np.random draw below is reproducible.
# The draws are order-dependent: reordering them changes the generated data.
np.random.seed(42)
n_transactions = 1000
n_customers = 50

df = pd.DataFrame({
    # downcast= asks pandas for the smallest numeric dtype that holds the values.
    "transaction_id": pd.to_numeric(np.arange(1, n_transactions + 1), downcast='integer'),
    "customer_id": pd.to_numeric(np.random.randint(1, n_customers + 1, size=n_transactions), downcast='integer'),
    "amount": pd.to_numeric(np.round(np.random.exponential(scale=50, size=n_transactions), 2), downcast='float'),
    # Low-cardinality string columns are stored as pandas categoricals.
    "merchant_category": pd.Series(np.random.choice(
        ["grocery", "electronics", "travel", "clothing", "restaurants"],
        size=n_transactions
    )).astype('category'),
    "device": pd.Series(np.random.choice(["mobile", "desktop", "tablet"], size=n_transactions)).astype('category'),
    "location": pd.Series(np.random.choice(["NY", "CA", "TX", "FL", "IL"], size=n_transactions)).astype('category')
})

# -----------------------------
# 2️⃣ FEATURE ENGINEERING
# -----------------------------
# One random, second-resolution timestamp per transaction. np.random.randint
# excludes its upper bound, so times fall in [2025-01-01, 2025-12-31).
start_ts = pd.Timestamp("2025-01-01").value // 10**9
end_ts = pd.Timestamp("2025-12-31").value // 10**9
df['transaction_time'] = pd.to_datetime(np.random.randint(start_ts, end_ts, size=n_transactions), unit='s')

# Order each customer's transactions chronologically so the per-customer diff
# below measures the gap to that customer's previous transaction.
df = df.sort_values(by=['customer_id', 'transaction_time'])
seconds_since_previous = (
    df.groupby('customer_id')['transaction_time'].diff().dt.total_seconds()
)
# A customer's first transaction has no predecessor (NaN); it is stored as 0.
df['time_since_last_txn'] = seconds_since_previous.fillna(0).astype('float32')

# -----------------------------
# 3️⃣ CREATE FRAUD LABELS
# -----------------------------
df['is_fraud'] = 0

# Patterned fraud: high normalized amount + short time since last txn.
# amount_norm is the amount relative to that customer's mean amount.
df['amount_norm'] = df['amount'] / df.groupby('customer_id')['amount'].transform('mean')
# NOTE(review): because first transactions carry time_since_last_txn == 0,
# they also satisfy the "< 5" test here — confirm that this is intended.
df.loc[(df['amount_norm'] > 2) & (df['time_since_last_txn'] < 5), 'is_fraud'] = 1

# Mark travel transactions as fraud.
# This must read the raw categorical column: the one-hot
# 'merchant_category_travel' column is only created by get_dummies in step 4,
# so testing for it at this point would never match and no travel row would
# ever be labelled.
df.loc[df['merchant_category'] == 'travel', 'is_fraud'] = 1

# Add random 1% fraud (fixed random_state keeps the selection reproducible).
random_idx = df.sample(frac=0.01, random_state=42).index
df.loc[random_idx, 'is_fraud'] = 1

# -----------------------------
# 4️⃣ ONE-HOT ENCODING
# -----------------------------
# drop_first=True omits the first category of each column from the output.
df = pd.get_dummies(df, columns=['merchant_category','device','location'], drop_first=True)

# -----------------------------
# 5️⃣ SAVE DATA
# -----------------------------
# Ensure the output directory exists, then write the frame without its index.
os.makedirs("data", exist_ok=True)
df.to_parquet(
    "data/engineered_transactions.parquet",
    engine='fastparquet',
    index=False,
)

# Report the in-memory footprint (deep=True also counts object payloads)
# and preview the columns that drive the fraud label.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
preview_columns = ['amount', 'amount_norm', 'time_since_last_txn', 'is_fraud']

print("Synthetic dataset with 'is_fraud' saved successfully!")
print(f"Memory Usage: {memory_mb:.2f} MB")
print(df[preview_columns].head())


Synthetic dataset with 'is_fraud' saved successfully!
Memory Usage: 0.08 MB
         amount  amount_norm  time_since_last_txn  is_fraud
665   28.430000     0.492564                  0.0         0
185   10.200000     0.176720             126389.0         0
907   27.580000     0.477838            2213023.0         0
280   34.400002     0.595998            2633217.0         0
815  105.139999     1.821605             734818.0         0
