In [None]:
# ===============================
# RL Cheat Sheet - Stable-Baselines3
# ===============================

# 1. Import Libraries
import gymnasium as gym                  # Gym environments
from stable_baselines3 import PPO, DQN, SAC
from stable_baselines3.common.evaluation import evaluate_policy

# ===============================
# 2. Create Environment
# ===============================
# Standard environment
env = gym.make("CartPole-v1")           # Discrete action environment
# Continuous action environment (for SAC)
# env = gym.make("Pendulum-v1")

# Custom Environment Example
# Kept inside a string so it is never executed; n_features / n_actions are
# placeholders to fill in. The skeleton follows the Gymnasium API:
#   - reset() takes keyword-only `seed` and `options` and returns (obs, info)
#   - step() returns (obs, reward, terminated, truncated, info)
"""
class MyEnv(gym.Env):
    def __init__(self):
        super().__init__()
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(n_features,))
        self.action_space = gym.spaces.Discrete(n_actions) # or Box for continuous
    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)  # seeds self.np_random for reproducibility
        return self.observation_space.sample(), {}
    def step(self, action):
        obs = self.observation_space.sample()
        reward = ...      # define reward
        terminated = ...  # define episode end (task success / failure)
        truncated = False # set True for time limits / external cut-offs
        return obs, reward, terminated, truncated, {}
"""

# ===============================
# 3. Initialize Model
# ===============================
# --- PPO: on-policy, usable with discrete or continuous actions ---
ppo_model = PPO(
    policy="MlpPolicy",
    env=env,
    # optimisation
    learning_rate=0.0003,            # learning rate for the policy network
    batch_size=64,                   # mini-batch size
    # rollout collection
    n_steps=128,                     # steps per rollout
    gamma=0.99,                      # discount factor
    # logging
    tensorboard_log="./ppo_logs/",
    verbose=1,
)

# --- DQN: off-policy, discrete actions only ---
dqn_model = DQN(
    policy="MlpPolicy",
    env=env,
    # optimisation
    learning_rate=0.001,             # Q-network learning rate
    batch_size=32,                   # mini-batch size
    gamma=0.99,                      # discount factor
    # replay buffer / target network
    buffer_size=50_000,              # replay buffer size
    learning_starts=1_000,           # steps before training starts
    target_update_interval=500,      # update target network
    # epsilon-greedy schedule
    exploration_fraction=0.1,        # fraction of training for epsilon decay
    exploration_final_eps=0.05,      # final epsilon value
    # logging
    verbose=1,
)

# --- SAC: off-policy, continuous actions only ---
# SAC needs a Box (continuous) action space, so switch `env` to e.g.
# Pendulum-v1 before enabling this template.
# sac_model = SAC(
#     "MlpPolicy", env,
#     learning_rate=3e-4,
#     buffer_size=50000,
#     batch_size=64,
#     gamma=0.99,
#     tau=0.005,  # soft update for target network
#     verbose=1
# )

# ===============================
# 4. Train Model
# ===============================
# total_timesteps = number of environment steps
ppo_model.learn(total_timesteps=50000)
dqn_model.learn(total_timesteps=50000)
# sac_model.learn(total_timesteps=50000)

# ===============================
# 5. Evaluate Model
# ===============================
# Evaluate over n episodes.
# Evaluate on the model's own environment (ppo_model.get_env()) instead of the
# raw `env`: passing an env without a Monitor wrapper makes evaluate_policy
# emit a warning, because reported episode rewards/lengths may then be
# affected by other wrappers. The model's env is the same CartPole instance,
# already wrapped by SB3 when the model was constructed.
mean_reward, std_reward = evaluate_policy(
    ppo_model, ppo_model.get_env(), n_eval_episodes=10
)
# same for dqn_model, sac_model
print(f"PPO mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# ===============================
# 6. Test / Run Agent
# ===============================
# Drive the trained PPO policy for a fixed budget of 100 environment steps,
# starting a new episode whenever the current one ends.
obs, _reset_info = env.reset()
for _step in range(100):
    # deterministic=True: take the policy's best action instead of sampling
    action, _state = ppo_model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    print(f"Action: {action}, Reward: {reward}")
    # Either flag ends the episode, so reset before the next step.
    if terminated or truncated:
        obs, _reset_info = env.reset()

# ===============================
# 7. Save and Load Model
# ===============================
# Write the trained PPO agent to disk, then read it back attached to `env`.
ppo_model.save(path="ppo_model")
loaded_model = PPO.load(path="ppo_model", env=env)

# ===============================
# 8. Important Parameters Summary
# ===============================
"""
Common RL Parameters:

- learning_rate: how fast the agent updates NN weights
- gamma: discount factor for future rewards
- n_steps: PPO rollout steps
- batch_size: mini-batch size for training
- buffer_size: replay buffer size (DQN/SAC)
- learning_starts: steps before training begins (DQN/SAC)
- target_update_interval: how often to update target network (DQN)
- tau: soft update rate (SAC)
- exploration_fraction: fraction of training for epsilon decay (DQN)
- exploration_final_eps: final epsilon (DQN)
"""

# ===============================
# 9. Exploration / Strategy
# ===============================
"""
- PPO: stochastic policy, exploration intrinsic
- DQN: epsilon-greedy
- SAC: stochastic policy with entropy bonus (encourages exploration)
"""

# ===============================
# 10. Notes
# ===============================
"""
- PPO: stable, works for discrete & continuous actions
- DQN: discrete actions only, uses replay buffer & target network
- SAC: continuous actions, off-policy, sample efficient, uses soft target updates
- Always normalize / preprocess observations for stable learning
- Use small total_timesteps initially for testing
"""


In [None]:
                          +-----------------+
                          |   Environment   |
                          |   (Gym/Custom)  |
                          +--------+--------+
                                   |
          ------------------------------------------------
          |                       |                     |
       [PPO]                   [DQN]                  [SAC]
          |                       |                     |
  +-------+--------+      +-------+--------+    +-------+--------+
  | Policy Network |      | Q-Network      |    | Actor Network  |
  | (MLP/CNN)      |      | (MLP/CNN)      |    | (MLP/CNN)      |
  +-------+--------+      +-------+--------+    +-------+--------+
          |                       |                     |
          |                       |                     |
      Action Sampling         ε-greedy / Policy       Stochastic Policy
      (stochastic)            from Q-values           + Entropy Bonus
          |                       |                     |
          +-----------+-----------+-----------+---------+
                      |
                   Step in Environment
                      |
              +-------+--------+
              | Observe Next   |
              | State & Reward |
              +-------+--------+
                      |
      ------------------------------------------
      |                       |                |
    PPO: Rollout           DQN: Store in     SAC: Store in
    n_steps trajectories   Replay Buffer      Replay Buffer
      |                       |                |
   Compute Advantage     Sample Mini-Batch   Sample Mini-Batch
      |                       |                |
   Policy Gradient       Update Q-network   Update Actor + Critic
   Loss & Backprop       Loss & Backprop   Loss & Backprop
      |                       |                |
      +-----------+-----------+----------------+
                  |
           Update Networks
       (Target network for DQN/SAC)
                  |
           Repeat Until Done
                  |
           Evaluate / Save / Test
