# AURA — Synthetic Dataset Generator
This notebook generates balanced synthetic datasets (equal rows per comfort label and location type) covering August and September 2025:

- 60,000 rows for training (Aug 1–Sep 20)
- 12,000 rows for validation (Sep 21–30)

Each dataset includes realistic environmental patterns, daylight variation, and comfort labels.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
# Shared random generator with a fixed seed: daylight_factor() and generate_pool()
# both draw from it, so a fresh "Restart & Run All" reproduces the same pools.
SEED = 20251010
rng = np.random.default_rng(SEED)

### Context definitions

In [3]:

# Context names, grouped by location type. Every name here must also be a key
# of ctx_params, because generate_pool() looks the chosen context up there.
indoor_contexts = [
    "home",
    "office",
    "school",
    "library",
    "hospital",
    "bar",
    "supermarket",
    "church",
    "gym",
    "museum",
    "pharmacy",
    "classroom",
]
outdoor_contexts = [
    "park",
    "playground",
    "forest",
    "beach",
    "garden",
    "square",
    "street",
    "bus_stop",
    "train_station",
    "metro_station",
    "parking",
    "concert_area",
]


### Environmental baseline parameters for each context

In [4]:
# Per-context baselines used by generate_pool():
#   noise_mu / noise_sd : mean and standard deviation of the noise draw (dB)
#   light_base          : reference light level the daylight factor is applied to (lux)
#   crowd_mu / crowd_sd : mean and standard deviation of the crowd-count draw
_CTX_FIELDS = ("noise_mu", "noise_sd", "light_base", "crowd_mu", "crowd_sd")

_INDOOR_BASELINES = {
    #               noise_mu, noise_sd, light_base, crowd_mu, crowd_sd
    "home":        (50, 8, 350, 2, 2),
    "office":      (55, 8, 500, 8, 3),
    "school":      (60, 9, 550, 15, 4),
    "library":     (38, 5, 300, 2, 1),
    "hospital":    (45, 6, 450, 5, 2),
    "bar":         (80, 6, 600, 15, 4),
    "supermarket": (70, 7, 700, 12, 4),
    "church":      (42, 5, 350, 6, 3),
    "gym":         (78, 7, 650, 14, 4),
    "museum":      (48, 6, 400, 8, 3),
    "pharmacy":    (50, 6, 500, 6, 3),
    "classroom":   (62, 8, 550, 18, 4),
}

_OUTDOOR_BASELINES = {
    #                 noise_mu, noise_sd, light_base, crowd_mu, crowd_sd
    "park":          (55, 8, 800, 6, 3),
    "playground":    (65, 8, 900, 10, 4),
    "forest":        (45, 6, 700, 2, 2),
    "beach":         (60, 8, 1000, 8, 4),
    "garden":        (50, 6, 850, 3, 2),
    "square":        (70, 9, 950, 14, 4),
    "street":        (78, 10, 950, 18, 4),
    "bus_stop":      (76, 9, 800, 15, 4),
    "train_station": (82, 9, 900, 20, 4),
    "metro_station": (85, 8, 600, 18, 4),
    "parking":       (65, 8, 850, 5, 3),
    "concert_area":  (98, 6, 1100, 20, 3),
}

# Expand the compact tables into one dict per context, tagging each with its
# location type ("loc") so callers can branch on indoor vs outdoor.
ctx_params = {
    name: {"loc": loc, **dict(zip(_CTX_FIELDS, values))}
    for loc, table in (("indoor", _INDOOR_BASELINES), ("outdoor", _OUTDOOR_BASELINES))
    for name, values in table.items()
}


### Daylight factor

In [5]:
def daylight_factor(dt):
    """Return a noisy daylight multiplier for the timestamp ``dt``.

    The hour of day selects a base level (00-05: 0.1, 06-08: 0.45, 09-17: 1.0,
    18-20: 0.55, otherwise 0.25). August timestamps are scaled by 1.05 and every
    other month by 0.95. Gaussian jitter (sd 0.03) drawn from the module-level
    ``rng`` is then added and the result is clipped to [0.05, 1.1], so the
    returned value can slightly exceed 1.

    Parameters
    ----------
    dt : datetime
        Only ``dt.hour`` and ``dt.month`` are read.

    Returns
    -------
    float
        Daylight factor in the closed range [0.05, 1.1].
    """
    # (first hour, last hour, base level) -- both bounds inclusive.
    hour_bands = (
        (0, 5, 0.1),
        (6, 8, 0.45),
        (9, 17, 1.0),
        (18, 20, 0.55),
    )
    hour = dt.hour
    level = next((lvl for lo, hi, lvl in hour_bands if lo <= hour <= hi), 0.25)

    # Monthly adjustment: brighter in August, dimmer otherwise (September here).
    level *= 1.05 if dt.month == 8 else 0.95

    jittered = level + rng.normal(0, 0.03)
    return float(np.clip(jittered, 0.05, 1.1))


### Synthetic data generation

In [6]:
# Column order of every frame returned by generate_pool (also used for empty pools).
_POOL_COLUMNS = [
    "timestamp", "location_type", "context", "noise_db",
    "light_lux", "crowd_count", "discomfort_level", "comfort_label",
]


def _comfort_label(discomfort: float) -> str:
    """Map a discomfort score to its comfort label (upper bounds are exclusive)."""
    if discomfort < 0.2:
        return "very_comfortable"
    if discomfort < 0.4:
        return "comfortable"
    if discomfort < 0.6:
        return "neutral"
    if discomfort < 0.8:
        return "uncomfortable"
    return "stressed"


def generate_pool(start_date: datetime, days: int, n_rows: int, indoor_share: float = 0.5) -> pd.DataFrame:
    """Generate ``n_rows`` random synthetic sensor readings with comfort labels.

    Each row gets a random timestamp within ``days`` days of ``start_date``, a
    random indoor/outdoor context, and noise, crowd and light values drawn
    around that context's baselines in ``ctx_params``. A discomfort score and
    comfort label are then derived from the *stored* (rounded) measurements, so
    the label is always an exact function of the columns written to the frame.

    Parameters
    ----------
    start_date : datetime
        First instant of the sampling window.
    days : int
        Window length in days; day offsets are drawn from ``0 .. days - 1``.
    n_rows : int
        Number of rows to generate. ``0`` yields an empty frame that still has
        all columns.
    indoor_share : float, default 0.5
        Probability that a row uses an indoor context.

    Returns
    -------
    pd.DataFrame
        Columns: timestamp (ISO string, minute precision), location_type,
        context, noise_db, light_lux, crowd_count, discomfort_level,
        comfort_label.

    Notes
    -----
    Draws from the module-level ``rng`` (directly and via ``daylight_factor``).
    The order of draws per row is part of the reproducibility contract: do not
    reorder them or a given seed will produce a different pool.
    """
    rows = []
    for _ in range(n_rows):
        day_offset = int(rng.integers(0, days))
        hour = int(rng.integers(0, 24))
        minute = int(rng.integers(0, 60))
        ts = start_date + timedelta(days=day_offset, hours=hour, minutes=minute)

        # rng.random() is evaluated before rng.choice(); cast to plain str so the
        # column does not hold numpy string scalars.
        ctx = str(rng.choice(indoor_contexts if rng.random() < indoor_share else outdoor_contexts))
        p = ctx_params[ctx]
        loc = p["loc"]

        # Noise (25–110 dB), stored with one decimal.
        noise_db = round(float(np.clip(rng.normal(p["noise_mu"], p["noise_sd"]), 25, 110)), 1)
        # Crowd (0–20 people).
        crowd = int(np.clip(round(rng.normal(p["crowd_mu"], p["crowd_sd"])), 0, 20))

        # Light: indoor lighting is only mildly modulated by daylight (0.8–1.2x of
        # the baseline, clipped to 100–900); outdoor light scales directly with it.
        f = daylight_factor(ts)
        if loc == "indoor":
            base = p["light_base"] * (0.8 + 0.4*f)
            light = float(np.clip(rng.normal(base, p["light_base"]*0.15), 100, 900))
        else:
            # NOTE(review): the floor is higher at night (120) than by day (60);
            # presumably this models artificial lighting -- confirm it is intended.
            night_floor = 120 if (ts.hour <= 5 or ts.hour >= 21) else 60
            base = p["light_base"]
            out_val = f*base + rng.normal(0, base*0.1)
            light = float(np.clip(out_val, night_floor, 1200))
        light_lux = int(round(light))

        # Discomfort is scored on the rounded values that are actually stored.
        # Scoring the unrounded draws lets rows near a threshold carry a penalty
        # or label that the stored features contradict.
        d_base = 0.5*(noise_db/110.0) + 0.3*(light_lux/1200.0) + 0.2*(crowd/20.0)
        penalty = (0.08*(noise_db > 85)) + (0.05*(light_lux > 900)) + (0.06*(crowd > 12))
        discomfort = round(float(np.clip(d_base + penalty, 0, 1)), 3)

        rows.append({
            "timestamp": ts.isoformat(timespec="minutes"),
            "location_type": loc,
            "context": ctx,
            "noise_db": noise_db,
            "light_lux": light_lux,
            "crowd_count": crowd,
            "discomfort_level": discomfort,
            "comfort_label": _comfort_label(discomfort),
        })
    # Explicit columns keep the schema intact even when no rows were generated.
    return pd.DataFrame(rows, columns=_POOL_COLUMNS)

### Balancing function

In [7]:

def balanced_sample(df, per_combo, random_state=42):
    """Draw the same number of rows for every (comfort label, location type) pair.

    Parameters
    ----------
    df : pd.DataFrame
        Pool containing ``comfort_label`` and ``location_type`` columns.
    per_combo : int
        Rows to draw for each of the 5 labels x 2 location types.
    random_state : int, default 42
        Seed used for every per-group draw and for the final shuffle.

    Returns
    -------
    pd.DataFrame
        ``10 * per_combo`` rows in shuffled order with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If a combination has no rows in ``df`` while ``per_combo`` is positive.

    Notes
    -----
    A group with fewer than ``per_combo`` rows is sampled *with replacement*,
    so rare combinations appear as duplicated rows in the result.
    """
    labels = ["very_comfortable","comfortable","neutral","uncomfortable","stressed"]
    locs = ["indoor","outdoor"]
    parts = []
    for lab in labels:
        for loc in locs:
            grp = df[(df["comfort_label"]==lab) & (df["location_type"]==loc)]
            if grp.empty and per_combo > 0:
                # Sampling from an empty group would otherwise fail with an
                # opaque numpy error that does not say which combination is missing.
                raise ValueError(
                    f"No rows with comfort_label={lab!r} and location_type={loc!r}; "
                    f"cannot draw {per_combo} samples. Generate a larger pool."
                )
            # Oversample (with replacement) only when the group is too small.
            replace = len(grp) < per_combo
            parts.append(grp.sample(n=per_combo, replace=replace, random_state=random_state))
    return pd.concat(parts, axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)


### Generate train + validation datasets

In [8]:

# Sampling windows: training covers 51 days from Aug 1 (Aug 1–Sep 20),
# validation covers 10 days from Sep 21 (Sep 21–30).
start_aug = datetime(2025, 8, 1)
val_start = datetime(2025, 9, 21)

# Oversized raw pools; the training pool is generated first so the shared
# random generator is consumed in a fixed order.
train_pool = generate_pool(start_aug, 51, 200_000, indoor_share=0.5)
val_pool = generate_pool(val_start, 10, 40_000, indoor_share=0.5)

# Equal rows per (comfort label, location type): 5 labels × 2 locs × per_combo.
train_df = balanced_sample(train_pool, 6000)   # 60,000 rows
val_df = balanced_sample(val_pool, 1200)       # 12,000 rows

for split_df, csv_name in (
    (train_df, "AURA_aug_sep_60k.csv"),
    (val_df, "AURA_validation_sep_12k.csv"),
):
    split_df.to_csv(csv_name, index=False)

print("Datasets created")
print(train_df.shape, val_df.shape)
print(train_df["comfort_label"].value_counts())

Datasets created
(60000, 8) (12000, 8)
comfort_label
comfortable         12000
uncomfortable       12000
very_comfortable    12000
stressed            12000
neutral             12000
Name: count, dtype: int64
