# In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class CarEnv(gym.Env):
    """Simple car speed-control environment with quadratic drag.

    Action: a single acceleration command, clipped to [-5, 5].
    Observation: ``[current_speed, error_to_setpoint]`` as float32, where
    ``error_to_setpoint = setpoint - current_speed``.
    Reward: ``-abs(error_to_setpoint)``, plus a bonus of 10 whenever the
    absolute error is below 2.

    Episodes never terminate on their own; they are *truncated* after
    ``max_steps`` steps (a time limit, not a terminal state).

    NOTE(review): the speed is integrated as ``v += accel * dt``, so speed and
    acceleration must share a unit system, yet the setpoint was labelled km/h
    and the action m/s². With the default constants (drag 0.01, rolling
    resistance 0.005, max acceleration 5) full throttle balances drag at a
    speed of roughly 22.1, so the default setpoint of 60 is unreachable and
    the near-setpoint bonus can never be earned. Defaults are kept unchanged
    for compatibility; confirm the intended units/constants.

    Args:
        setpoint: Target speed the agent should hold.
        dt: Integration timestep in seconds.
        max_steps: Number of steps after which the episode is truncated.
        drag_coeff: Coefficient of the quadratic (v**2) drag term.
        rolling_resistance: Coefficient of the linear (v) resistance term.
    """

    MAX_ACCEL = 5.0      # symmetric bound of the action space
    BONUS_BAND = 2.0     # |error| below this earns the bonus
    BONUS_REWARD = 10.0  # strong incentive to stay near the setpoint

    def __init__(
        self,
        setpoint: float = 60.0,
        dt: float = 0.1,
        max_steps: int = 500,
        drag_coeff: float = 0.01,
        rolling_resistance: float = 0.005,
    ):
        super().__init__()
        self.setpoint = float(setpoint)
        self.dt = float(dt)
        self.max_steps = int(max_steps)
        self.drag_coeff = float(drag_coeff)
        self.rolling_resistance = float(rolling_resistance)

        self.action_space = spaces.Box(low=-5.0, high=5.0, shape=(1,), dtype=np.float32)
        # Bounds are intentionally left as-is: changing them would make the
        # space compare unequal to the one stored in previously saved models.
        obs_low = np.array([0.0, -100.0], dtype=np.float32)
        obs_high = np.array([200.0, 100.0], dtype=np.float32)
        self.observation_space = spaces.Box(low=obs_low, high=obs_high, dtype=np.float32)

        # Initialise self.v / self.step_count so the env is usable right away.
        self.reset()

    def _get_obs(self) -> np.ndarray:
        """Return the current ``[speed, setpoint - speed]`` observation."""
        return np.array([self.v, self.setpoint - self.v], dtype=np.float32)

    def step(self, action):
        """Advance the simulation by one timestep.

        Args:
            action: Acceleration command; an array-like whose first element
                is used (a bare scalar is accepted too). It is clipped to
                [-MAX_ACCEL, MAX_ACCEL].

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``terminated`` is
            always False; ``truncated`` becomes True once ``max_steps`` steps
            have been taken.
        """
        # reshape(-1) lets 0-d arrays and plain scalars through as well.
        commanded = float(np.asarray(action, dtype=np.float64).reshape(-1)[0])
        accel = float(np.clip(commanded, -self.MAX_ACCEL, self.MAX_ACCEL))

        drag_force = self.drag_coeff * self.v ** 2
        rolling_force = self.rolling_resistance * self.v
        # Explicit Euler step; the car cannot roll backwards.
        self.v = max(0.0, self.v + (accel - drag_force - rolling_force) * self.dt)

        error = self.setpoint - self.v
        reward = -abs(error)
        if abs(error) < self.BONUS_BAND:
            reward += self.BONUS_REWARD

        self.step_count += 1
        # Running out of time is a truncation, not a terminal state, so that
        # learners can still bootstrap from the final observation.
        terminated = False
        truncated = self.step_count >= self.max_steps

        return self._get_obs(), float(reward), terminated, truncated, {}

    def reset(self, *, seed=None, options=None):
        """Start a new episode with a random initial speed in [0, 10).

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``; makes the initial speed reproducible.
            options: Unused.

        Returns:
            ``(obs, info)`` for the initial state.
        """
        super().reset(seed=seed)
        # Use the env's own seeded generator rather than the global np.random,
        # otherwise the `seed` argument has no effect on the initial state.
        self.v = float(self.np_random.uniform(0.0, 10.0))
        self.step_count = 0
        return self._get_obs(), {}


# In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

# Wrap a single CarEnv instance in a vectorized env, then add running
# normalization of observations and rewards (clip_obs=10.0).
env = VecNormalize(
    make_vec_env(CarEnv, n_envs=1),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.0,
)

# SAC agent with the default MLP policy; training metrics go to TensorBoard.
model = SAC(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log="./sac_car_tb/",
)

# Run the training loop.
model.learn(total_timesteps=200_000)

# Persist the policy together with the normalization statistics: the saved
# model expects observations normalized the same way at inference time.
model.save("sac_car")
env.save("sac_car_env.pkl")
env.close()