In [None]:
import numpy as np
import pandas as pd

# ======================================================
# Configuration (Simulation + Dataset Split)
# ======================================================
# np.random.seed(42)  # Uncomment for reproducibility

# Rectangular region agents may occupy; leaving it removes the agent.
X_MIN = -3.0
X_MAX = 3.0
Y_MIN = 0.1
Y_MAX = 3.6

# Frame rate of the generated tracks and the matching time step.
FPS = 4
DT = 1.0 / FPS  # 0.25 s per frame

# Length of each split, in seconds; validation follows training in time.
TRAIN_DURATION_S = 2 * 60 * 60   # 2 h
VAL_DURATION_S = 30 * 60         # 30 min
TOTAL_SECONDS = TRAIN_DURATION_S + VAL_DURATION_S
N_FRAMES = int(TOTAL_SECONDS * FPS)

# Motion model
SPEED_MIN, SPEED_MAX = 0.0, 5.0   # allowed speed range (0 .. 5 m/s)
SPEED_SIGMA = 0.02                # std of the per-frame speed jitter
HEADING_SIGMA = 0.02              # std of the per-frame heading jitter
P_TURN = 0.01                     # per-frame probability of a sharp turn
MAX_TURN = np.pi / 4              # sharp turns are drawn from [-MAX_TURN, MAX_TURN]
P_PAUSE = 0.005                   # per-frame probability of starting a pause
PAUSE_MIN, PAUSE_MAX = 2, 20      # pause length bounds, in seconds

# Spawn cadence: the old baseline interval bounds divided by the boost
# factor (spawn rate raised 7.5x vs the old 30 Hz setup).
SPAWN_MIN_BASE, SPAWN_MAX_BASE = 2.0, 8.0   # seconds (old baseline)
SPAWN_RATE_BOOST = 7.5
SPAWN_MIN = SPAWN_MIN_BASE / SPAWN_RATE_BOOST
SPAWN_MAX = SPAWN_MAX_BASE / SPAWN_RATE_BOOST

# Prediction horizon: targets look exactly one second ahead.
HORIZON_1_S = 1.0
H1_FRAMES = int(round(HORIZON_1_S * FPS))  # 4 frames at 4 Hz

# ======================================================
# Simulation State
# ======================================================
records, agents = [], {}   # logged rows; live agents keyed by pedestrian id
next_spawn_time = 0.0      # simulation time at which the next agent may spawn
next_id = 1                # id handed to the next spawned agent

def spawn_pedestrian(t):
    """Create one agent on a random edge of the domain, heading inward.

    Allocates the next pedestrian id, schedules the following spawn a
    uniform(SPAWN_MIN, SPAWN_MAX) interval after ``t``, and registers the new
    agent in the module-level ``agents`` dict.

    Args:
        t: Current simulation time in seconds.
    """
    global next_id, next_spawn_time
    pid, next_id = next_id, next_id + 1
    next_spawn_time = t + np.random.uniform(SPAWN_MIN, SPAWN_MAX)

    # Heading interval per edge; each one points into the domain.
    inward_heading = {
        'left': (-np.pi/2, np.pi/2),
        'right': (np.pi/2, 3*np.pi/2),
        'bottom': (0, np.pi),
        'top': (-np.pi, 0),
    }

    side = str(np.random.choice(['left', 'right', 'bottom', 'top']))
    if side in ('left', 'right'):
        # Vertical edges: x is pinned to the edge, y is random along it.
        x = X_MIN if side == 'left' else X_MAX
        y = np.random.uniform(Y_MIN, Y_MAX)
    else:
        # Horizontal edges: x is random along the edge, y is pinned to it.
        x = np.random.uniform(X_MIN, X_MAX)
        y = Y_MIN if side == 'bottom' else Y_MAX
    heading_lo, heading_hi = inward_heading[side]
    heading = np.random.uniform(heading_lo, heading_hi)

    speed = np.random.uniform(SPEED_MIN, SPEED_MAX)
    agents[pid] = dict(
        x=x,
        y=y,
        heading=heading,
        speed=speed,
        first_frame=True,       # the main loop skips the agent's first frame
        pause_active=False,
        pause_remaining=0,
        vx=0.0,
        vy=0.0,
    )

# ======================================================
# Run Simulation
# ======================================================
# Per frame: spawn at most one pedestrian when the spawn timer has elapsed,
# then advance every live agent by one step and log its state.
#
# Logged velocity convention: the (vx, vy) stored in a row is the velocity
# that carried the agent from its previous position to the logged one.
for frame in range(N_FRAMES):
    t = frame * DT
    if t >= next_spawn_time:
        spawn_pedestrian(t)

    # Iterate over a snapshot of the ids: agents may be deleted in the loop.
    for pid in list(agents):
        a = agents[pid]

        # Skip first frame (no prior displacement for velocity)
        if a['first_frame']:
            a['first_frame'] = False
            continue

        if a['pause_active']:
            # Ongoing pause: count it down and stay put.
            a['pause_remaining'] -= 1
            if a['pause_remaining'] <= 0:
                a['pause_active'] = False
            a['vx'] = 0.0
            a['vy'] = 0.0

        elif np.random.rand() < P_PAUSE:
            # Start a pause; its length is drawn in frames.
            a['pause_active'] = True
            a['pause_remaining'] = np.random.randint(PAUSE_MIN * FPS, PAUSE_MAX * FPS)
            a['vx'] = 0.0
            a['vy'] = 0.0

        else:
            # Heading diffusion
            a['heading'] += np.random.normal(0, HEADING_SIGMA)
            # Occasional sharp turn
            if np.random.rand() < P_TURN:
                a['heading'] += np.random.uniform(-MAX_TURN, MAX_TURN)
            # Speed jitter
            a['speed'] += np.random.normal(0, SPEED_SIGMA)
            a['speed'] = np.clip(a['speed'], SPEED_MIN, SPEED_MAX)
            # Velocity components
            a['vx'] = a['speed'] * np.cos(a['heading'])
            a['vy'] = a['speed'] * np.sin(a['heading'])
            # Integrate
            a['x'] += a['vx'] * DT
            a['y'] += a['vy'] * DT

        # NOTE: while paused only the emitted velocity (vx, vy) is zeroed.
        # a['speed'] keeps the agent's cruising speed so that walking resumes
        # at that speed once the pause ends. Resetting it to 0 would leave the
        # agent creeping at ~SPEED_SIGMA per frame for the rest of its life.

        # Boundary removal
        if not (X_MIN <= a['x'] <= X_MAX and Y_MIN <= a['y'] <= Y_MAX):
            del agents[pid]
            continue

        records.append({
            'time_s': t,
            'ped_id': pid,
            'x': a['x'],
            'y': a['y'],
            'vx': a['vx'],
            'vy': a['vy']
        })

full_df = pd.DataFrame(records)

# Add scalar speed (magnitude of the logged velocity, so 0 while paused)
full_df['v'] = np.hypot(full_df['vx'], full_df['vy'])

# ======================================================
# Concurrency Diagnostics (optional)
# ======================================================
# Distinct pedestrians per timestamp, for timestamps that have at least one row.
observed = full_df.groupby('time_s')['ped_id'].nunique()

# Frames in which nobody was logged do not appear in `observed`. Put them back
# with a count of 0 so the statistics below describe every simulated frame
# rather than only the occupied ones (which would bias them upward).
all_frame_times = np.arange(N_FRAMES) * DT
concurrency = observed.reindex(all_frame_times, fill_value=0)

print("=== Concurrency Summary ===")
print("Frames logged:", len(observed))
print(f"Mean concurrent pedestrians: {concurrency.mean():.3f}")
print(f"Std concurrent pedestrians:  {concurrency.std():.3f}")
print("Max concurrent pedestrians:", concurrency.max())
print(f"Frames with >=2 pedestrians: {(concurrency >= 2).mean():.1%}")
print(f"Frames with >=3 pedestrians: {(concurrency >= 3).mean():.1%}")
multi_frames = concurrency[concurrency >= 2].index
if len(multi_frames):
    # Show the earliest frame that holds more than one pedestrian.
    t_example = multi_frames[0]
    print(f"\nExample multi-ped frame t={t_example:.2f}s:")
    print(full_df[full_df.time_s == t_example][['ped_id','x','y','vx','vy','v']].head())
else:
    print("\nNo multi-ped frames found (consider adjusting spawn intervals).")

# Sanity check: a pedestrian must have at most one row per timestamp.
assert full_df.duplicated(subset=['time_s','ped_id']).sum() == 0, "Duplicate (time_s,ped_id) rows!"

# ======================================================
# Split Logic
# ======================================================
# Time boundaries of the two splits, in simulation seconds.
TRAIN_END = TRAIN_DURATION_S
VAL_START = TRAIN_END
VAL_END   = TOTAL_SECONDS

# First and last logged timestamp of every pedestrian.
ped_groups = full_df.groupby('ped_id')['time_s']
ped_start = ped_groups.min()
ped_end   = ped_groups.max()

# A pedestrian belongs to a split only if its whole track lies inside it;
# tracks that straddle the train/validation boundary go to neither.
straddles_boundary = (ped_start < TRAIN_END) & (ped_end >= TRAIN_END)
inside_train = (ped_start >= 0) & (ped_end < TRAIN_END)
inside_val = (ped_start >= VAL_START) & (ped_end < VAL_END)

cross_boundary = ped_start.loc[straddles_boundary].index
train_peds = ped_start.loc[inside_train].index.difference(cross_boundary)
val_peds   = ped_start.loc[inside_val].index

train_rows = full_df['ped_id'].isin(train_peds) & (full_df['time_s'] < TRAIN_END)
val_rows = full_df['ped_id'].isin(val_peds) & (full_df['time_s'] >= VAL_START)
train_df = full_df.loc[train_rows].copy()
val_df   = full_df.loc[val_rows].copy()

# Reset validation time to start at 0
val_df['time_s'] -= VAL_START

# Relabel validation IDs 1, 2, ... in order of first appearance
val_first = val_df.groupby('ped_id')['time_s'].min().sort_values()
val_id_map = dict(zip(val_first.index, range(1, len(val_first) + 1)))
val_df['ped_id'] = val_df['ped_id'].map(val_id_map)

# Order both splits by pedestrian, then by time, with a fresh 0..n-1 index
sort_keys = ['ped_id', 'time_s']
train_df, val_df = (
    split.sort_values(sort_keys).reset_index(drop=True)
    for split in (train_df, val_df)
)

# ======================================================
# Add 1-second Future Targets (no 2s horizon)
# ======================================================
def add_future_1s(df, h1_frames):
    """Return a copy of ``df`` with the position ``h1_frames`` rows ahead.

    For every row, ``next1_x`` / ``next1_y`` hold the ``x`` / ``y`` of the same
    pedestrian ``h1_frames`` rows later in time. Rows with no such future row
    (the tail of each track) get NaN and are kept.

    The look-ahead is taken in (ped_id, time_s) order regardless of how the
    rows of ``df`` are arranged, and the result is written back in the
    caller's row order. For input already sorted that way the output is the
    same as a plain per-pedestrian shift.

    Args:
        df: Frame with at least ``ped_id``, ``time_s``, ``x`` and ``y``.
        h1_frames: Look-ahead distance, counted in rows per pedestrian.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    out = df.copy()

    # np.lexsort treats its LAST key as primary: by ped_id, then by time_s.
    order = np.lexsort((out['time_s'].to_numpy(), out['ped_id'].to_numpy()))
    ahead = out.iloc[order].groupby('ped_id')[['x', 'y']].shift(-h1_frames)

    for source, target in (('x', 'next1_x'), ('y', 'next1_y')):
        shifted = ahead[source].to_numpy()
        restored = np.empty_like(shifted)
        # Scatter back positionally so duplicate index labels cannot matter.
        restored[order] = shifted
        out[target] = restored

    # Keep rows even if next1_* is NaN (edge of sequence)
    return out

# Attach the 1-second-ahead targets to both splits.
train_df, val_df = (
    add_future_1s(split, H1_FRAMES) for split in (train_df, val_df)
)

# ======================================================
# Column order
# ======================================================
cols = ['time_s','ped_id','x','y','vx','vy','v','next1_x','next1_y']
train_df, val_df = train_df[cols], val_df[cols]

# ======================================================
# Save
# ======================================================
for split_df, csv_name in ((train_df, 'TrainingSet.csv'), (val_df, 'ValidationSet.csv')):
    split_df.to_csv(csv_name, index=False)

# ======================================================
# Summary
# ======================================================
n_train_peds = train_df['ped_id'].nunique()
n_val_peds = val_df['ped_id'].nunique()

print("\n=== Split + 1s Target Summary (4 Hz) ===")
print(f"Cross-boundary pedestrians removed: {len(cross_boundary)}")
print(f"Training pedestrians: {n_train_peds} | Rows kept: {len(train_df)}")
print(f"Validation pedestrians: {n_val_peds} | Rows kept: {len(val_df)}")
print("Columns:", cols)
print("\nTrain sample:")
print(train_df.head())
print("\nValidation sample:")
print(val_df.head())

# Optional: velocity finite-difference verification (spot check on one pedestrian)
def check_vel(df, n=5):
    """Print stored velocities next to finite-difference estimates.

    The simulation logs, in each row, the velocity that moved the pedestrian
    from its previous position to the position in that row. The matching
    estimate for row ``i`` is therefore the backward difference
    ``(pos[i] - pos[i-1]) / (t[i] - t[i-1])``. Row 0 has no predecessor and is
    not reported.

    Only the first pedestrian with at least three rows is inspected.

    Args:
        df: Frame with ``ped_id``, ``time_s``, ``x``, ``y``, ``vx``, ``vy``.
        n: Maximum number of rows to report for that pedestrian.

    Returns:
        None. The comparison is printed.
    """
    print("\nVelocity finite-diff spot check:")
    for pid, track in df.groupby('ped_id'):
        if len(track) < 3:
            continue
        track = track.sort_values('time_s')
        t = track['time_s'].to_numpy()
        x = track['x'].to_numpy()
        y = track['y'].to_numpy()
        vx = track['vx'].to_numpy()
        vy = track['vy'].to_numpy()

        # Rows 1 .. min(n, len-1): same number of lines as before, but each
        # stored velocity is paired with the step that actually produced it.
        for i in range(1, min(n, len(track) - 1) + 1):
            dt = t[i] - t[i - 1]
            if dt == 0:
                # Duplicate timestamps: no finite difference can be formed.
                continue
            vx_fd = (x[i] - x[i - 1]) / dt
            vy_fd = (y[i] - y[i - 1]) / dt
            print(f"pid {pid} row {i}: stored vx={vx[i]:.4f} fd={vx_fd:.4f} | stored vy={vy[i]:.4f} fd={vy_fd:.4f}")

        # One pedestrian is enough for a spot check.
        break

# Run the velocity spot check on the training split (output is printed).
check_vel(train_df)


=== Concurrency Summary ===
Frames logged: 35982
Mean concurrent pedestrians: 6.663
Std concurrent pedestrians:  2.437
Max concurrent pedestrians: 16
Frames with >=2 pedestrians: 99.3%
Frames with >=3 pedestrians: 96.8%

Example multi-ped frame t=1.00s:
   ped_id         x         y        vx        vy         v
3       1  0.731877  2.890720 -2.248425  0.157230  2.253916
4       2 -1.955805  0.908754 -2.033464  3.235015  3.821034

=== Split + 1s Target Summary (4 Hz) ===
Cross-boundary pedestrians removed: 7
Training pedestrians: 8155 | Rows kept: 188205
Validation pedestrians: 2039 | Rows kept: 49959
Columns: ['time_s', 'ped_id', 'x', 'y', 'vx', 'vy', 'v', 'next1_x', 'next1_y']

Train sample:
   time_s  ped_id         x         y        vx        vy         v   next1_x  \
0    0.25       1  2.426735  2.733850 -2.293059  0.272741  2.309222  0.176051   
1    0.50       1  1.859211  2.801209 -2.270099  0.269436  2.286033 -0.380650   
2    0.75       1  1.293984  2.851413 -2.260907  0.200