In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

def generate_synthetic_dataset(
    n_samples: int = 1_000_000,
    fraud_ratio: float = 0.15,
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Generate a realistic 1M-row transaction dataset with:
      - Many transactions per user (20–80)
      - Repeated devices and IPs
      - Temporal sequences (timestamps in order)
      - Embedded fraud patterns:
          * large_amount
          * suspicious_merchant / high risk category
          * round_amount
          * night_fraud
          * foreign_location
          * new_device_high_amount
    Columns:
      transaction_id,user_id,timestamp,amount,location,merchant,
      merchant_category,device_id,ip_address,is_fraud
    """
    rng = np.random.default_rng(random_state)

    indian_cities = [
        "Mumbai", "Delhi", "Bangalore", "Hyderabad", "Chennai",
        "Kolkata", "Pune", "Ahmedabad", "Jaipur", "Lucknow"
    ]
    foreign_cities = [
        "Dubai", "Singapore", "New York", "London", "Bangkok", "Sydney"
    ]

    trusted_merchants = [
        "Amazon", "Flipkart", "Swiggy", "Zomato",
        "BigBazaar", "Reliance", "PhonePe", "PayTM"
    ]
    risky_merchants = [
        "Gambling_Site", "Crypto_Exchange", "Unknown_Vendor", "Suspicious_Store"
    ]

    categories = ["Shopping", "Food", "Grocery", "Entertainment"]

    start_date = datetime(2025, 1, 1)

    # Decide how many users – around 1M / 40 ≈ 25K users
    n_users = max(8_000, n_samples // 40)
    print(f"Generating profiles for {n_users} users...")

    user_ids = [f"USER_{i:06d}" for i in range(1, n_users + 1)]

    # Per-user base behaviour
    user_base_amount = rng.lognormal(mean=7.2, sigma=0.5, size=n_users)  # ~1500–8000
    user_city_choices = rng.choice(indian_cities, size=(n_users, 3))
    user_pref_merchants = rng.choice(trusted_merchants, size=(n_users, 4))

    # Devices and IPs per user
    user_devices = {}
    user_ips = {}
    for i, uid in enumerate(user_ids):
        n_dev = rng.integers(1, 4)  # 1–3 devices
        devs = [f"DEV_{rng.integers(1000, 30000)}" for _ in range(n_dev)]
        user_devices[uid] = devs

        # 1–3 IPs in same private range 192.168.X.*
        base_a = rng.integers(0, 256)
        ips = [f"192.168.{base_a}.{rng.integers(1, 255)}" for _ in range(rng.integers(1, 4))]
        user_ips[uid] = ips

    data = []
    fraud_quota = int(n_samples * fraud_ratio)
    fraud_count = 0
    tx_global_idx = 0

    print("Generating transactions...")
    for u_idx, uid in enumerate(tqdm(user_ids)):
        # 20–80 transactions per user
        k = int(rng.integers(20, 80))
        # Random starting date for this user
        ts = start_date + timedelta(days=int(rng.integers(0, 180)),
                                    hours=int(rng.integers(6, 22)))

        # Each user occasionally gets targeted by fraud
        user_fraudiness = rng.uniform(0.05, 0.3)  # prob that a tx is fraudulent

        last_city = None

        for t in range(k):
            if tx_global_idx >= n_samples:
                break

            # Base legitimate behaviour
            base_amt = user_base_amount[u_idx]
            amount = float(rng.lognormal(mean=np.log(base_amt), sigma=0.4))
            amount = max(50.0, min(amount, 80_000.0))

            city = str(user_city_choices[u_idx, rng.integers(0, 3)])
            merchant = str(user_pref_merchants[u_idx, rng.integers(0, 4)])
            category = str(rng.choice(categories))
            device_id = rng.choice(user_devices[uid])
            ip_address = rng.choice(user_ips[uid])

            # Time jumps 10 min – 1 day between tx
            ts += timedelta(minutes=int(rng.integers(10, 60*24)))

            is_fraud = 0

            # Decide if we inject fraud on this tx
            if fraud_count < fraud_quota and rng.random() < user_fraudiness:
                is_fraud = 1
                fraud_count += 1

                fraud_type = rng.choice([
                    "large_amount",
                    "suspicious_merchant",
                    "round_amount",
                    "night_fraud",
                    "foreign_location",
                    "new_device_high_amount",
                ])

                if fraud_type == "large_amount":
                    amount = float(rng.uniform(80_000, 200_000))
                    merchant = rng.choice(["Unknown_Vendor", "Suspicious_Store"])
                    category = "High Risk"
                    # Often appears at odd times
                    if rng.random() < 0.5:
                        ts = ts.replace(hour=int(rng.integers(0, 6)))

                elif fraud_type == "suspicious_merchant":
                    merchant = rng.choice(risky_merchants)
                    category = "High Risk"
                    amount = float(rng.uniform(15_000, 90_000))

                elif fraud_type == "round_amount":
                    amount = float(rng.choice([1000, 2000, 5000, 10000, 25000, 50000]))

                elif fraud_type == "night_fraud":
                    ts = ts.replace(hour=int(rng.integers(0, 5)))
                    amount = float(rng.uniform(20_000, 100_000))

                elif fraud_type == "foreign_location":
                    city = rng.choice(foreign_cities)
                    merchant = "Foreign_Merchant"
                    category = "Foreign"
                    amount = float(rng.uniform(20_000, 80_000))
                    # keep Indian IP to look like stolen card abroad
                    ip_address = rng.choice(user_ips[uid])

                elif fraud_type == "new_device_high_amount":
                    device_id = f"DEV_{rng.integers(40_000, 80_000)}"  # device outside normal range
                    amount = float(rng.uniform(25_000, 120_000))

                # Also: many fraud IPs in 10.0.x.x
                if rng.random() < 0.8:
                    ip_address = f"10.0.{rng.integers(1,255)}.{rng.integers(1,255)}"

            # Construct row
            row = {
                "transaction_id": f"TXN_{tx_global_idx:08d}",
                "user_id": uid,
                "timestamp": ts,
                "amount": round(amount, 2),
                "location": city,
                "merchant": merchant,
                "merchant_category": category,
                "device_id": device_id,
                "ip_address": ip_address,
                "is_fraud": int(is_fraud),
            }
            data.append(row)
            tx_global_idx += 1

        if tx_global_idx >= n_samples:
            break

    df = pd.DataFrame(data)

    # Shuffle to mix users & times
    df = df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    print(f"\n✅ Generated dataset:")
    print(f"  Total rows: {len(df):,}")
    print(f"  Fraud rows: {df['is_fraud'].sum():,} "
          f"({df['is_fraud'].mean()*100:.2f}%)")

    return df


In [2]:
df_train = generate_synthetic_dataset(n_samples=1_000_000, fraud_ratio=0.15)
df_train.to_csv("transactions_1M_synthetic.csv", index=False)
print("Saved transactions_1M_synthetic.csv")


Generating profiles for 25000 users...
Generating transactions...


 81%|████████  | 20233/25000 [00:23<00:05, 871.66it/s] 



✅ Generated dataset:
  Total rows: 1,000,000
  Fraud rows: 150,000 (15.00%)
Saved transactions_1M_synthetic.csv


**************************************************** TESTING DATASET ******************************************************

In [5]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

def generate_test_dataset(
    n_samples: int = 5000,
    fraud_ratio: float = 0.12,
    random_state: int = 123
) -> pd.DataFrame:

    rng = np.random.default_rng(random_state)

    # Similar but different from training
    indian_cities = [
        "Surat", "Indore", "Nagpur", "Patna", "Kanpur",
        "Mumbai", "Delhi", "Bangalore"
    ]
    foreign_cities = ["Berlin", "Paris", "Toronto", "Doha"]

    trusted_merchants = [
        "Amazon", "Flipkart", "Swiggy", "Zomato",
        "StarBazaar", "RelianceMart"
    ]
    risky_merchants = [
        "Crypto_Exchange", "Unknown_Vendor", "FraudStore", "Gambling_Site"
    ]

    categories = ["Shopping", "Food", "Grocery", "Entertainment"]

    start_date = datetime(2025, 6, 1)

    # Decide number of users: around 5000 / 20 ≈ 250
    n_users = max(150, n_samples // 20)
    user_ids = [f"U{str(i).zfill(6)}" for i in range(n_users)]

    # User profiles
    user_base_amount = rng.lognormal(mean=7.1, sigma=0.6, size=n_users)

    user_devices = {}
    user_ips = {}

    for uid in user_ids:
        n_dev = rng.integers(1, 3)
        user_devices[uid] = [f"D{rng.integers(1000,9000)}" for _ in range(n_dev)]
        base_b = rng.integers(0, 255)
        user_ips[uid] = [f"192.168.{base_b}.{rng.integers(1,255)}" for _ in range(n_dev)]

    data = []
    fraud_quota = int(n_samples * fraud_ratio)
    fraud_count = 0
    tx_idx = 0

    print("Generating test transactions...")

    for uid_idx, uid in enumerate(tqdm(user_ids)):
        # Each user 10–30 transactions
        k = int(rng.integers(10, 30))
        ts = start_date + timedelta(days=int(rng.integers(0, 30)),
                                    hours=int(rng.integers(8, 20)))

        user_fraudiness = rng.uniform(0.05, 0.2)

        for t in range(k):

            if tx_idx >= n_samples:
                break

            amount = float(rng.lognormal(mean=np.log(user_base_amount[uid_idx]), sigma=0.5))
            amount = np.clip(amount, 20, 60000)

            city = rng.choice(indian_cities)
            merchant = rng.choice(trusted_merchants)
            category = rng.choice(categories)
            device_id = rng.choice(user_devices[uid])
            ip_address = rng.choice(user_ips[uid])

            ts += timedelta(minutes=int(rng.integers(5, 500)))

            is_fraud = 0

            # Inject test fraud patterns (slightly changed)
            if fraud_count < fraud_quota and rng.random() < user_fraudiness:
                is_fraud = 1
                fraud_count += 1

                f_type = rng.choice([
                    "big_tx",
                    "foreign",
                    "risky_merchant",
                    "night",
                    "new_device",
                    "round",
                ])

                if f_type == "big_tx":
                    amount = float(rng.uniform(50_000, 150_000))
                    merchant = rng.choice(risky_merchants)
                    category = "High Risk"

                elif f_type == "foreign":
                    city = rng.choice(foreign_cities)
                    merchant = "ForeignMerchant"
                    category = "Foreign"

                elif f_type == "risky_merchant":
                    merchant = rng.choice(risky_merchants)
                    amount = float(rng.uniform(20_000, 90_000))

                elif f_type == "night":
                    ts = ts.replace(hour=int(rng.integers(0, 5)))
                    amount = float(rng.uniform(10_000, 60_000))

                elif f_type == "new_device":
                    device_id = f"D{rng.integers(9000,15000)}"
                    amount = float(rng.uniform(15000, 100000))

                elif f_type == "round":
                    amount = float(rng.choice([1000, 2000, 5000, 10000, 25000, 30000]))

                # Fraud IPs
                if rng.random() < 0.8:
                    ip_address = f"10.0.{rng.integers(1,255)}.{rng.integers(1,255)}"

            row = {
                "transaction_id": f"TST_{tx_idx:06d}",
                "user_id": uid,
                "timestamp": ts,
                "amount": round(amount, 2),
                "location": city,
                "merchant": merchant,
                "merchant_category": category,
                "device_id": device_id,
                "ip_address": ip_address,
                "is_fraud": is_fraud,
            }

            data.append(row)
            tx_idx += 1

        if tx_idx >= n_samples:
            break

    df = pd.DataFrame(data)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print("\n✔ TEST dataset created")
    print(f"Rows: {len(df)}")
    print(f"Fraud rows: {df['is_fraud'].sum()} ({df['is_fraud'].mean()*100:.2f}%)")

    return df


In [6]:
df_test = generate_test_dataset(n_samples=5000, fraud_ratio=0.12)
df_test.to_csv("test_synthetic_5000.csv", index=False)
print("Saved test_synthetic_5000.csv")


Generating test transactions...


100%|██████████| 250/250 [00:00<00:00, 1400.91it/s]


✔ TEST dataset created
Rows: 4976
Fraud rows: 600 (12.06%)





Saved test_synthetic_5000.csv


In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

def generate_robust_synthetic_dataset(
    n_samples: int = 1_000_000,
    fraud_ratio: float = 0.15,
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Generate a production-grade synthetic fraud dataset with realistic patterns:
      - Velocity Bursts (rapid-fire transactions)
      - Shared Device Attacks (botnets/collusion)
      - Impossible Travel (teleportation across cities)
      - Class Overlap (normal high spenders vs fraud)
      - Temporal Seasonality (weekend night attacks)
    """
    rng = np.random.default_rng(random_state)

    # --- 1. CONFIGURATION & DICTIONARIES ---
    indian_cities = [
        "Mumbai", "Delhi", "Bangalore", "Hyderabad", "Chennai",
        "Kolkata", "Pune", "Ahmedabad", "Jaipur", "Lucknow"
    ]
    foreign_cities = [
        "Dubai", "Singapore", "New York", "London", "Bangkok", "Sydney"
    ]
    
    # Lat/Lon for distance calculations (feature engineering readiness)
    city_coords = {
        "Mumbai": (19.07, 72.87), "Delhi": (28.70, 77.10),
        "Bangalore": (12.97, 77.59), "Hyderabad": (17.38, 78.48),
        "Chennai": (13.08, 80.27), "Kolkata": (22.57, 88.36),
        "Pune": (18.52, 73.85), "Ahmedabad": (23.02, 72.57),
        "Jaipur": (26.91, 75.78), "Lucknow": (26.84, 80.94),
        "Dubai": (25.20, 55.27), "Singapore": (1.35, 103.81),
        "New York": (40.71, -74.00), "London": (51.50, -0.12),
        "Bangkok": (13.75, 100.50), "Sydney": (-33.86, 151.20)
    }

    trusted_merchants = [
        "Amazon", "Flipkart", "Swiggy", "Zomato",
        "BigBazaar", "Reliance", "PhonePe", "PayTM"
    ]
    risky_merchants = [
        "Gambling_Site", "Crypto_Exchange", "Unknown_Vendor", "Suspicious_Store"
    ]
    categories = ["Shopping", "Food", "Grocery", "Entertainment"]

    # Improvement 2: Shared Device Pool (Botnet/Account Takeover)
    # These 50 devices will appear across MANY different users
    fraud_device_pool = [f"DEV_BOTNET_{i:03d}" for i in range(50)]

    # --- 2. USER PROFILING ---
    start_date = datetime(2025, 1, 1)
    n_users = max(8_000, n_samples // 40)
    print(f"Generating profiles for {n_users} users...")

    user_ids = [f"USER_{i:06d}" for i in range(1, n_users + 1)]
    
    # User Preferences
    user_base_amount = rng.lognormal(mean=7.2, sigma=0.5, size=n_users)  # ~1500–8000
    user_city_choices = rng.choice(indian_cities, size=(n_users, 3))
    user_pref_merchants = rng.choice(trusted_merchants, size=(n_users, 4))

    # Devices and IPs per user
    user_devices = {}
    user_ips = {}
    for uid in user_ids:
        n_dev = rng.integers(1, 4)
        user_devices[uid] = [f"DEV_{rng.integers(1000, 50000)}" for _ in range(n_dev)]
        
        base_a = rng.integers(0, 256)
        user_ips[uid] = [f"192.168.{base_a}.{rng.integers(1, 255)}" for _ in range(rng.integers(1, 4))]

    # --- 3. TRANSACTION GENERATION ---
    data = []
    fraud_quota = int(n_samples * fraud_ratio)
    fraud_count = 0
    tx_global_idx = 0

    print("Generating transactions...")
    for u_idx, uid in enumerate(tqdm(user_ids)):
        if tx_global_idx >= n_samples:
            break
            
        k = int(rng.integers(20, 80)) # Transactions per user
        ts = start_date + timedelta(days=int(rng.integers(0, 180)))
        
        user_fraudiness = rng.uniform(0.05, 0.3)
        last_city = user_city_choices[u_idx, 0] # Initialize location

        for _ in range(k):
            # --- Improvement 5: Temporal Seasonality (Attack Windows) ---
            # Weekend nights (Friday/Saturday 2 AM - 4 AM) are high risk
            is_attack_window = (ts.weekday() >= 4) and (2 <= ts.hour < 5)
            
            # Adjust fraud probability based on window
            current_prob = user_fraudiness
            if is_attack_window:
                current_prob *= 3.0 # Spike fraud probability
            
            # Determine Label
            is_fraud = 0
            if fraud_count < fraud_quota and rng.random() < current_prob:
                is_fraud = 1
                fraud_count += 1
            
            # --- Improvement 1: Velocity Logic (Time Gaps) ---
            if is_fraud:
                # Fraudsters hit fast: 1s to 60s gaps
                time_gap = timedelta(seconds=int(rng.integers(1, 120)))
            else:
                # Normal behavior: mostly spread out, but occasionally bursts (shopping spree)
                if rng.random() < 0.15: # 15% chance of normal burst
                    time_gap = timedelta(seconds=int(rng.integers(60, 600)))
                else:
                    time_gap = timedelta(minutes=int(rng.integers(30, 4000)))
            
            ts += time_gap

            # --- Attribute Generation ---
            # Improvement 4: Overlap Normal Amounts (Up to 150k)
            base_amt = user_base_amount[u_idx]
            amount = float(rng.lognormal(mean=np.log(base_amt), sigma=0.6))
            amount = max(50.0, min(amount, 150_000.0)) 

            city = str(user_city_choices[u_idx, rng.integers(0, 3)])
            merchant = str(user_pref_merchants[u_idx, rng.integers(0, 4)])
            category = str(rng.choice(categories))
            device_id = rng.choice(user_devices[uid])
            ip_address = rng.choice(user_ips[uid])

            # --- FRAUD INJECTION LOGIC ---
            if is_fraud:
                fraud_type = rng.choice([
                    "large_amount",
                    "suspicious_merchant",
                    "account_takeover", # New
                    "impossible_travel", # New
                    "round_amount",
                    "night_fraud"
                ])

                if fraud_type == "large_amount":
                    # Improvement 4: Overlap with normal heavy spenders
                    amount = float(rng.uniform(60_000, 300_000)) 
                    merchant = rng.choice(["Unknown_Vendor", "Suspicious_Store"])
                    category = "High Risk"

                elif fraud_type == "suspicious_merchant":
                    merchant = rng.choice(risky_merchants)
                    category = "High Risk"
                    amount = float(rng.uniform(15_000, 90_000))

                elif fraud_type == "account_takeover":
                    # Improvement 2: Shared Device Attack
                    # User looks normal (location/amount), but device is from the fraud pool
                    device_id = rng.choice(fraud_device_pool)
                    amount = float(rng.uniform(500, 5000)) # Small test transactions
                    # Often happens in bursts, time gap is already handled above

                elif fraud_type == "impossible_travel":
                    # Improvement 3: Physical Impossibility
                    # Force a city different from the last one with the short time gap
                    available_cities = [c for c in indian_cities if c != last_city]
                    city = rng.choice(available_cities)
                    merchant = "Travel_Agent"
                    amount = float(rng.uniform(10_000, 50_000))
                    # Note: The time gap is naturally short (seconds/mins) from the Velocity logic above

                elif fraud_type == "round_amount":
                    amount = float(rng.choice([5000, 10000, 20000, 25000, 50000]))

                elif fraud_type == "night_fraud":
                    # Force time to 2 AM - 4 AM if not already
                    ts = ts.replace(hour=int(rng.integers(2, 5)))
                    amount = float(rng.uniform(20_000, 100_000))

                # IP Spoofing (Generic indicator)
                if rng.random() < 0.6:
                    ip_address = f"10.0.{rng.integers(1,255)}.{rng.integers(1,255)}"

            # Update State
            last_city = city
            
            # Construct Row
            row = {
                "transaction_id": f"TXN_{tx_global_idx:08d}",
                "user_id": uid,
                "timestamp": ts,
                "amount": round(amount, 2),
                "location": city,
                "merchant": merchant,
                "merchant_category": category,
                "device_id": device_id,
                "ip_address": ip_address,
                "is_fraud": int(is_fraud),
            }
            data.append(row)
            tx_global_idx += 1

    df = pd.DataFrame(data)
    
    # Shuffle dataset
    df = df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    print(f"\n✅ Robust dataset generated:")
    print(f"  Total rows: {len(df):,}")
    print(f"  Fraud rows: {df['is_fraud'].sum():,} ({df['is_fraud'].mean()*100:.2f}%)")
    print(f"  Unique Devices: {df['device_id'].nunique():,}")
    
    return df

# --- EXECUTION ---
if __name__ == "__main__":
    df = generate_robust_synthetic_dataset(n_samples=2_000_00, fraud_ratio=0.12)
    df.to_csv("fraud_data_robust.csv", index=False)
    print(df.head())

Generating profiles for 8000 users...
Generating transactions...


 51%|█████     | 4068/8000 [00:04<00:04, 866.07it/s]



✅ Robust dataset generated:
  Total rows: 200,017
  Fraud rows: 24,000 (12.00%)
  Unique Devices: 7,549
  transaction_id      user_id           timestamp    amount   location  \
0   TXN_00167332  USER_003391 2025-06-08 02:03:11    295.07     Mumbai   
1   TXN_00024979  USER_000499 2025-01-20 19:22:11    897.21     Mumbai   
2   TXN_00100089  USER_002010 2025-02-17 08:57:28  14051.44      Delhi   
3   TXN_00163300  USER_003304 2025-06-02 11:03:00   4950.82  Ahmedabad   
4   TXN_00043396  USER_000860 2025-03-30 08:43:11   1485.51       Pune   

   merchant merchant_category  device_id       ip_address  is_fraud  
0   PhonePe              Food  DEV_20164   192.168.176.87         0  
1    Swiggy           Grocery  DEV_40218  192.168.155.120         0  
2    Swiggy              Food  DEV_40823    192.168.19.29         0  
3    Amazon          Shopping  DEV_20070  192.168.187.153         0  
4  Flipkart           Grocery  DEV_22863   192.168.89.116         0  


In [5]:
from sklearn.ensemble import IsolationForest

# 1. Select numeric features (Amount)
# Note: In a real project, you would also encode the categorical columns (Merchant, Location)
X = df_unlabeled[['amount']] 

# 2. Train Isolation Forest
# contamination=0.005 tells the model to look for the top 0.5% weirdest data points
model = IsolationForest(contamination=0.005, random_state=42)
model.fit(X)

# 3. Predict Anomalies
# The model creates its own predictions (-1 for anomaly, 1 for normal)
df_unlabeled['anomaly_score'] = model.predict(X)

# Check what the model found
print(df_unlabeled['anomaly_score'].value_counts())

anomaly_score
 1    995016
-1      4984
Name: count, dtype: int64
