In [1]:
# generate_stress_synthetic.py
import csv
from datetime import datetime, timedelta
import random
import uuid
import math

# Destination CSV for the generated authentication events.
OUTFILE = "synthetic_auth_logs_stress.csv"

# ---- CONFIG: tune these for the stress tests ----
# Population and volume
NUM_USERS = 500       # number of simulated identities
DAYS = 30             # length of the simulated window, in days
EVENTS_PER_DAY = 6    # mean of the per-user, per-day event count

# Attack mix
ANOMALY_RATIO = 0.02        # fraction of users marked as compromised (2%)
SUBTLE_ATTACK_PROB = 0.7    # share of attack events that are subtle, not overt

# Benign noise
NORMAL_NOISE_TRAVEL_P = 0.05    # chance a normal event comes from a non-home country
NORMAL_NOISE_DEVICE_P = 0.02    # chance a normal event uses a one-off new device
FAILED_LOGIN_BASE = 0.02        # baseline probability that a login fails

# Seed for the `random` module
SEED = 42
# -------------------------------------------------

random.seed(SEED)

# Value vocabularies. Element order matters: random.choice/random.sample index
# into these lists, so reordering them changes the generated data.
countries = [
    "India",
    "USA",
    "Germany",
    "Brazil",
    "China",
    "Canada",
    "UK",
    "Australia",
]
device_types = [
    "Laptop",
    "Desktop",
    "Mobile",
    "Tablet",
]
resources = [
    "email",
    "hr_portal",
    "code_repo",
    "wiki",
    "finance_dashboard",
    "payroll.csv",
    "admin_console",
]
auth_methods = [
    "password",
    "mfa",
]

# Build one behavioural profile per simulated user, keyed by e-mail address.
# The order of the random draws below is significant for reproducibility.
users = {}
for user_number in range(1, NUM_USERS + 1):
    uid = f"user{user_number}@example.com"
    home_country = random.choice(countries)

    # An exclusive upper bound of 2, 3 or 4 yields 1, 2 or 3 known devices.
    device_stop = random.choice([2, 3, 4])
    known_devices = [f"{uid}_dev{n}" for n in range(1, device_stop)]

    # Between two and four distinct resources the user normally touches.
    favorite_count = random.choice([2, 3, 4])
    favorite_resources = random.sample(resources, k=favorite_count)

    # Typical login hour (08-17) and the spread around it.
    mean_hour = random.choice(range(8, 18))
    hour_std = random.choice([0.5, 1.0, 1.5, 2.0])

    users[uid] = dict(
        home_country=home_country,
        known_devices=known_devices,
        fav_resources=favorite_resources,
        mean_hour=mean_hour,
        hour_std=hour_std,
    )

# Select the compromised accounts: ANOMALY_RATIO of the population,
# but never fewer than one user.
num_compromised = max(int(NUM_USERS * ANOMALY_RATIO), 1)
all_user_ids = list(users)
compromised_users = set(random.sample(all_user_ids, num_compromised))
print(f"Compromised users: {len(compromised_users)} / {NUM_USERS}")

# Generate every authentication event, day by day and user by user.
rows = []

# Anchor the simulated window at midnight so that the sampled `hour` below is
# the real hour of day in the emitted timestamp. Using the raw current time as
# the base would shift every event by the time of day the script was run and
# push daytime logins into the night or the following day.
start = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=DAYS)

for d in range(DAYS):
    for uid, profile in users.items():
        # Per-user invariants, built once instead of once per event.
        foreign_countries = [c for c in countries if c != profile["home_country"]]
        atypical_resources = [r for r in resources if r not in profile["fav_resources"]]

        # events per day vary a little around EVENTS_PER_DAY, with at least one
        num_events = max(1, int(random.gauss(EVENTS_PER_DAY, 1)))
        for _ in range(num_events):
            # sample the login hour from the user's own distribution, clamped to 0-23
            hour = int(random.gauss(profile["mean_hour"], profile["hour_std"]))
            hour = max(0, min(23, hour))
            minute = random.randint(0, 59)
            second = random.randint(0, 59)
            ts = (start + timedelta(days=d, hours=hour, minutes=minute, seconds=second)).isoformat()

            # base fields: the user's normal behaviour
            country = profile["home_country"]
            device = random.choice(profile["known_devices"])
            resource = random.choice(profile["fav_resources"])
            success = 1 if random.random() > FAILED_LOGIN_BASE else 0
            auth_method = random.choice(auth_methods)
            is_attack = 0

            if uid not in compromised_users:
                # Benign noise: occasional travel or a one-off device.
                if random.random() < NORMAL_NOISE_TRAVEL_P:
                    # occasional legitimate travel to another country
                    country = random.choice(foreign_countries)
                if random.random() < NORMAL_NOISE_DEVICE_P:
                    # legitimate new device used occasionally
                    device = f"{uid}_tempdevice_{random.randint(1000,9999)}"
            else:
                # Compromised user: only some events are attacker-driven.
                # Both draws are always consumed, in this order, so the random
                # stream does not depend on whether the event is an attack.
                attack_roll = random.random()
                is_attack_event = (random.random() < 0.3)
                if is_attack_event:
                    is_attack = 1
                    if attack_roll < SUBTLE_ATTACK_PROB:
                        # Subtle attack: small deviations from the profile.
                        # - login hour moved 2 or 3 hours earlier or later
                        shift = random.choice([-3, -2, 2, 3])
                        shifted_hour = max(0, min(23, hour + shift))
                        ts = (start + timedelta(days=d, hours=shifted_hour, minutes=minute, seconds=second)).isoformat()
                        # - 60% chance of touching a resource outside the favourites
                        if random.random() < 0.6:
                            resource = random.choice(atypical_resources)
                        # - 20% chance of forcing a failed login
                        if random.random() < 0.2:
                            success = 0
                    else:
                        # Overt attack: foreign country, unknown device,
                        # sensitive resource, password-only authentication.
                        country = random.choice(foreign_countries)
                        device = f"ATTACKER_DEVICE_{random.randint(1000,9999)}"
                        resource = "payroll.csv"
                        # Approximates credential stuffing: mostly successful,
                        # with roughly 10% failures.
                        success = 1 if random.random() > 0.1 else 0
                        auth_method = "password"

            rows.append({
                # NOTE: uuid4() is not driven by random.seed(), so event ids
                # differ between runs even with a fixed SEED.
                "event_id": str(uuid.uuid4()),
                "timestamp": ts,
                "user": uid,
                "country": country,
                "device": device,
                "resource": resource,
                "success": success,
                "auth_method": auth_method,
                "is_attack": is_attack,
            })

# Write all generated events to OUTFILE as CSV, header row first.
fieldnames = ["event_id","timestamp","user","country","device","resource","success","auth_method","is_attack"]
# newline="" leaves line-ending handling to the csv module; the explicit
# UTF-8 encoding keeps the output independent of the platform default.
with open(OUTFILE, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    w.writerows(rows)

print("Wrote", OUTFILE, "with", len(rows), "rows")


Compromised users: 10 / 500
Wrote synthetic_auth_logs_stress.csv with 82556 rows
