# Samping raw data to simulate data used for dev, stag and prod

split the chronological timeline into Dev (oldest) → Staging (recent past) → Prod (most recent). Each environment can then be used differently.

split the 6.36M rows into:
- Dev → ~70% oldest steps
- Staging → ~15% middle steps
- Prod → ~15% newest steps

In [37]:
import pandas as pd
from pathlib import Path

In [38]:
RAW_PATH = Path("../data/raw")
CREDITCARD_PATH = RAW_PATH / "paysim_data.csv"

df = pd.read_csv(CREDITCARD_PATH)

In [39]:
df['isFraud'].value_counts() #hightly imbalanced

isFraud
0    6354407
1       8213
Name: count, dtype: int64

In [40]:
# Perfect 50:50 balanced sampling strategy
from sklearn.model_selection import train_test_split
import numpy as np

print(f"Total data: {len(df)} rows")
print(f"Fraud cases available: {df['isFraud'].sum()}")
print(f"Overall fraud rate: {df['isFraud'].mean():.4f}")

# Separate fraud and non-fraud cases
fraud_cases = df[df['isFraud'] == 1].copy()
non_fraud_cases = df[df['isFraud'] == 0].copy()

print(f"\nSeparated data:")
print(f"Fraud cases: {len(fraud_cases)}")
print(f"Non-fraud cases: {len(non_fraud_cases)}")

# Strategy: 50:50 balance for all environments
# Distribute fraud cases across environments
fraud_dev_size = 2000      # 2k fraud cases for dev
fraud_staging_size = 1500  # 1.5k fraud cases for staging  
fraud_prod_size = len(fraud_cases) - fraud_dev_size - fraud_staging_size  # Remaining ~4.7k fraud cases

print(f"\nPlanned fraud distribution:")
print(f"Dev: {fraud_dev_size} fraud cases")
print(f"Staging: {fraud_staging_size} fraud cases")
print(f"Prod: {fraud_prod_size} fraud cases")

# Split fraud cases
fraud_dev, fraud_temp = train_test_split(
    fraud_cases, 
    train_size=fraud_dev_size, 
    random_state=42
)

fraud_staging, fraud_prod = train_test_split(
    fraud_temp, 
    train_size=fraud_staging_size, 
    random_state=43
)

print(f"\nActual fraud distribution:")
print(f"Dev: {len(fraud_dev)} fraud cases")
print(f"Staging: {len(fraud_staging)} fraud cases") 
print(f"Prod: {len(fraud_prod)} fraud cases")

# For 50:50 balance, non-fraud cases = fraud cases
non_fraud_dev_size = len(fraud_dev)      # 2000 non-fraud
non_fraud_staging_size = len(fraud_staging)  # 1500 non-fraud
non_fraud_prod_size = len(fraud_prod)    # ~4713 non-fraud

print(f"\nNon-fraud distribution (50:50 balance):")
print(f"Dev: {non_fraud_dev_size} non-fraud cases")
print(f"Staging: {non_fraud_staging_size} non-fraud cases")
print(f"Prod: {non_fraud_prod_size} non-fraud cases")

# Split non-fraud cases
non_fraud_dev, non_fraud_temp = train_test_split(
    non_fraud_cases, 
    train_size=non_fraud_dev_size, 
    random_state=42
)

non_fraud_staging, non_fraud_prod = train_test_split(
    non_fraud_temp, 
    train_size=non_fraud_staging_size, 
    random_state=43
)

# Combine fraud + non-fraud for each environment (50:50 balance)
dev_df = pd.concat([fraud_dev, non_fraud_dev], ignore_index=True)
staging_df = pd.concat([fraud_staging, non_fraud_staging], ignore_index=True)
prod_df = pd.concat([fraud_prod, non_fraud_prod], ignore_index=True)

# Shuffle each dataset
dev_df = dev_df.sample(frac=1, random_state=42).reset_index(drop=True)
staging_df = staging_df.sample(frac=1, random_state=43).reset_index(drop=True)
prod_df = prod_df.sample(frac=1, random_state=44).reset_index(drop=True)

# Save files
dev_df.to_csv(RAW_PATH / "paysim-dev.csv", index=False)
staging_df.to_csv(RAW_PATH / "paysim-staging.csv", index=False)
prod_df.to_csv(RAW_PATH / "paysim-prod.csv", index=False)

print(f"\n=== FINAL DATASET SUMMARY (50:50 BALANCED) ===")
print(f"Dev: {len(dev_df):,} total rows")
print(f"  - Fraud: {dev_df['isFraud'].sum():,} cases ({dev_df['isFraud'].mean():.4f} rate)")
print(f"  - Non-fraud: {(dev_df['isFraud'] == 0).sum():,} cases")

print(f"\nStaging: {len(staging_df):,} total rows") 
print(f"  - Fraud: {staging_df['isFraud'].sum():,} cases ({staging_df['isFraud'].mean():.4f} rate)")
print(f"  - Non-fraud: {(staging_df['isFraud'] == 0).sum():,} cases")

print(f"\nProd: {len(prod_df):,} total rows")
print(f"  - Fraud: {prod_df['isFraud'].sum():,} cases ({prod_df['isFraud'].mean():.4f} rate)")
print(f"  - Non-fraud: {(prod_df['isFraud'] == 0).sum():,} cases")

print(f"\nTotal fraud cases used: {dev_df['isFraud'].sum() + staging_df['isFraud'].sum() + prod_df['isFraud'].sum():,}/{len(fraud_cases):,}")

# Verify perfect balance
print(f"\n=== BALANCE VERIFICATION ===")
print(f"Dev fraud rate: {dev_df['isFraud'].mean():.4f} (should be 0.5000)")
print(f"Staging fraud rate: {staging_df['isFraud'].mean():.4f} (should be 0.5000)")
print(f"Prod fraud rate: {prod_df['isFraud'].mean():.4f} (should be 0.5000)")

Total data: 6362620 rows
Fraud cases available: 8213
Overall fraud rate: 0.0013

Separated data:
Fraud cases: 8213
Non-fraud cases: 6354407

Planned fraud distribution:
Dev: 2000 fraud cases
Staging: 1500 fraud cases
Prod: 4713 fraud cases

Actual fraud distribution:
Dev: 2000 fraud cases
Staging: 1500 fraud cases
Prod: 4713 fraud cases

Non-fraud distribution (50:50 balance):
Dev: 2000 non-fraud cases
Staging: 1500 non-fraud cases
Prod: 4713 non-fraud cases

Separated data:
Fraud cases: 8213
Non-fraud cases: 6354407

Planned fraud distribution:
Dev: 2000 fraud cases
Staging: 1500 fraud cases
Prod: 4713 fraud cases

Actual fraud distribution:
Dev: 2000 fraud cases
Staging: 1500 fraud cases
Prod: 4713 fraud cases

Non-fraud distribution (50:50 balance):
Dev: 2000 non-fraud cases
Staging: 1500 non-fraud cases
Prod: 4713 non-fraud cases

=== FINAL DATASET SUMMARY (50:50 BALANCED) ===
Dev: 4,000 total rows
  - Fraud: 2,000 cases (0.5000 rate)
  - Non-fraud: 2,000 cases

Staging: 3,000 tota

df = df.sample(n=30000, random_state=42)  # Take random 30k rows for speed
df.head()
len(df)

# Best of both worlds: small dev, balanced classes, time-aware
from sklearn.model_selection import train_test_split

# Overall fraud rate for reference
overall_fraud_rate = df['isFraud'].mean()
print(f"Overall fraud rate: {overall_fraud_rate:.4f}")

# Dev: Small stratified sample (fast development)
dev_df, remaining_df = train_test_split(
    df, 
    train_size=2000,  # Small for fast iteration
    stratify=df['isFraud'], 
    random_state=42
)

# Split remaining into staging and prod
staging_df, prod_df = train_test_split(
    remaining_df, 
    train_size=5000,  # Medium for validation
    stratify=remaining_df['isFraud'], 
    random_state=43
)

# Save files
dev_df.to_csv(RAW_PATH / "paysim-dev.csv", index=False)
staging_df.to_csv(RAW_PATH / "paysim-staging.csv", index=False)
prod_df.to_csv(RAW_PATH / "paysim-prod.csv", index=False)

print(f"Dev: {len(dev_df)} rows (fast development)")
print(f"Staging: {len(staging_df)} rows (validation)")  
print(f"Prod: {len(prod_df)} rows (production)")
print(f"Fraud rates: {dev_df['isFraud'].mean():.4f}, {staging_df['isFraud'].mean():.4f}, {prod_df['isFraud'].mean():.4f}")

To avoid data leakage & keep fraud class balance:
Use stratified sampling on the target column (Class) so the fraud ratio stays the same.

# Check unique step values (time simulation)
print("Step range:", df["step"].min(), "to", df["step"].max())

# Split by quantiles on step
q1 = df["step"].quantile(0.70)   # 70% cutoff
q2 = df["step"].quantile(0.85)   # 85% cutoff

dev_df = df[df["step"] <= q1].copy()
staging_df = df[(df["step"] > q1) & (df["step"] <= q2)].copy()
prod_df = df[df["step"] > q2].copy()

dev_df.to_csv(RAW_PATH / "paysim-dev.csv", index=False)
staging_df.to_csv(RAW_PATH / "paysim-staging.csv", index=False)
prod_df.to_csv(RAW_PATH / "paysim-prod.csv", index=False)

print(f"Dev: {len(dev_df)} rows, Staging: {len(staging_df)} rows, Prod: {len(prod_df)} rows")
print("Fraud rates:", dev_df['isFraud'].mean(), staging_df['isFraud'].mean(), prod_df['isFraud'].mean())

In [41]:
print("Fraud rates across environments:")
print(f"Dev: {dev_df['isFraud'].mean():.4f}")
print(f"Staging: {staging_df['isFraud'].mean():.4f}")  
print(f"Prod: {prod_df['isFraud'].mean():.4f}")

Fraud rates across environments:
Dev: 0.5000
Staging: 0.5000
Prod: 0.0007


In [42]:
# then push the data to s3 via dvc
eg
dvc add data/raw/paysim-prod.csv
dvc push

SyntaxError: invalid syntax (4031129170.py, line 3)