# Samping raw data to simulate data used for dev, stag and prod

split the chronological timeline into Dev (oldest) → Staging (recent past) → Prod (most recent). Each environment can then be used differently.

split the 6.36M rows into:
- Dev → ~70% oldest steps
- Staging → ~15% middle steps
- Prod → ~15% newest steps

In [12]:
import pandas as pd
from pathlib import Path

In [13]:
RAW_PATH = Path("../data/raw")
CREDITCARD_PATH = RAW_PATH / "paysim_data.csv"

df = pd.read_csv(CREDITCARD_PATH)
df = df.sample(n=30000, random_state=42)  # Take random 30k rows for speed
df.head()
len(df)

30000

In [14]:
# Best of both worlds: small dev, balanced classes, time-aware
from sklearn.model_selection import train_test_split

# Overall fraud rate for reference
overall_fraud_rate = df['isFraud'].mean()
print(f"Overall fraud rate: {overall_fraud_rate:.4f}")

# Dev: Small stratified sample (fast development)
dev_df, remaining_df = train_test_split(
    df, 
    train_size=2000,  # Small for fast iteration
    stratify=df['isFraud'], 
    random_state=42
)

# Split remaining into staging and prod
staging_df, prod_df = train_test_split(
    remaining_df, 
    train_size=5000,  # Medium for validation
    stratify=remaining_df['isFraud'], 
    random_state=43
)

# Save files
dev_df.to_csv(RAW_PATH / "paysim-dev.csv", index=False)
staging_df.to_csv(RAW_PATH / "paysim-staging.csv", index=False)
prod_df.to_csv(RAW_PATH / "paysim-prod.csv", index=False)

print(f"Dev: {len(dev_df)} rows (fast development)")
print(f"Staging: {len(staging_df)} rows (validation)")  
print(f"Prod: {len(prod_df)} rows (production)")
print(f"Fraud rates: {dev_df['isFraud'].mean():.4f}, {staging_df['isFraud'].mean():.4f}, {prod_df['isFraud'].mean():.4f}")

Overall fraud rate: 0.0016
Dev: 2000 rows (fast development)
Staging: 5000 rows (validation)
Prod: 23000 rows (production)
Fraud rates: 0.0015, 0.0016, 0.0016


To avoid data leakage & keep fraud class balance:
Use stratified sampling on the target column (Class) so the fraud ratio stays the same.

# Check unique step values (time simulation)
print("Step range:", df["step"].min(), "to", df["step"].max())

# Split by quantiles on step
q1 = df["step"].quantile(0.70)   # 70% cutoff
q2 = df["step"].quantile(0.85)   # 85% cutoff

dev_df = df[df["step"] <= q1].copy()
staging_df = df[(df["step"] > q1) & (df["step"] <= q2)].copy()
prod_df = df[df["step"] > q2].copy()

dev_df.to_csv(RAW_PATH / "paysim-dev.csv", index=False)
staging_df.to_csv(RAW_PATH / "paysim-staging.csv", index=False)
prod_df.to_csv(RAW_PATH / "paysim-prod.csv", index=False)

print(f"Dev: {len(dev_df)} rows, Staging: {len(staging_df)} rows, Prod: {len(prod_df)} rows")
print("Fraud rates:", dev_df['isFraud'].mean(), staging_df['isFraud'].mean(), prod_df['isFraud'].mean())

In [15]:
print("Fraud rates across environments:")
print(f"Dev: {dev_df['isFraud'].mean():.4f}")
print(f"Staging: {staging_df['isFraud'].mean():.4f}")  
print(f"Prod: {prod_df['isFraud'].mean():.4f}")

Fraud rates across environments:
Dev: 0.0015
Staging: 0.0016
Prod: 0.0016


In [16]:
# then push the data to s3 via dvc
eg
dvc add data/raw/paysim-prod.csv
dvc push

SyntaxError: invalid syntax (4031129170.py, line 3)