In [8]:
import gym
from gym import spaces
import numpy as np
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from imitation.data import rollout
from imitation.algorithms import bc
from imitation.util import logger

class CustomEnv(gym.Env):
    def __init__(self, S, A):
        self.S = torch.tensor(S, dtype=torch.float32)
        self.A = torch.tensor(A, dtype=torch.float32)
        self.num_samples = self.S.shape[0]
        self.current_step = 0

        # Action space
        action_dim = self.A.shape[1]
        self.action_space = spaces.Box(low=-1e6, high=1e6, shape=(action_dim,), dtype=np.float32)

        # Observation space
        obs_dim_S = self.S.shape[1]
        observation_space = {}
        for i in range(obs_dim_S):
            observation_space[f'S_{i+1}'] = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32)

        self.observation_space = spaces.Dict(observation_space)

    def reset(self):
        self.current_step = 0
        return self._get_observation()

    def step(self, action):
        next_state = self._get_observation()
        reward = self._calculate_reward(action)
        done = (self.current_step == self.num_samples - 1)
        self.current_step = (self.current_step + 1) % self.num_samples
        return next_state, reward, done, {}

    def _get_observation(self):
        obs = {}
        for i in range(self.S.shape[1]):
            obs[f'S_{i+1}'] = np.array([self.S[self.current_step, i]], dtype=np.float32)
        return obs

    def _calculate_reward(self, action):
        reward = torch.sum(self.A[self.current_step] * action)
        return reward.item()

# # 假設 S 和 A 是你的狀態和動作矩陣
# S = np.random.rand(1000, 9)
# A = np.random.rand(1000, 45)

# # 創建自訂環境
# env = CustomEnv(S, A)
# vec_env = DummyVecEnv([lambda: env])
# # 使用 PPO 訓練一個專家模型
# expert_model = PPO("MultiInputPolicy", vec_env, verbose=1)
# expert_model.learn(total_timesteps=10000)



In [6]:
from stable_baselines3.common.evaluation import evaluate_policy
evaluate_policy(expert_model, vec_env, 10,return_episode_rewards=True)

([2553.184532403946,
  2553.184532403946,
  2553.184532403946,
  2553.184532403946,
  2553.184532403946,
  2553.184532403946,
  2553.184532403946,
  2553.184532403946,
  2553.184532403946,
  2553.184532403946],
 [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])

In [67]:
from typing import Dict, Optional
from typing import Any
import numpy as np
import gymnasium as gym

from gymnasium.spaces import Box

num_observation = S.shape[1]
num_action = A.shape[1]

# class ObservationMatchingEnv(gym.Env):
#     def __init__(self, num_observation=num_observation,num_action=num_action):
#         self.state = None
#         self.num_observation = num_observation
#         self.num_action = num_action
#         self.observation_space = spaces.Box(low=0, high=1, shape=(num_observation,), dtype=np.float32)
#         self.action_space = spaces.Box(low=0, high=1, shape=(num_action,), dtype=np.float32)

#     def reset(self, seed: int = None, options: Optional[Dict[str, Any]] = None):
#         super().reset(seed=seed, options=options)
#         self.state = self.observation_space.sample()
#         return self.state, {}

#     def step(self, action):
#         reward = -np.abs(self.state - action).mean()
#         self.state = self.observation_space.sample()
#         return self.state, reward, False, False, {}
    


class ObservationMatchingEnv(gym.Env):
    def __init__(self, num_options: int = 2):
        self.state = None
        self.num_options = num_options
        self.observation_space = Box(0, 1, shape=(num_options,))
        self.action_space = Box(0, 1, shape=(num_options,))

    def reset(self, seed: int = None, options: Optional[Dict[str, Any]] = None):
        super().reset(seed=seed, options=options)
        self.state = self.observation_space.sample()
        return self.state, {}

    def step(self, action):
        reward = -np.abs(self.state - action).mean()
        self.state = self.observation_space.sample()
        return self.state, reward, False, False, {}



In [1]:
from typing import Dict, Optional
from typing import Any
import numpy as np
import gymnasium as gym

from gymnasium.spaces import Box
from gymnasium.wrappers import TimeLimit
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.util.util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv


class ObservationMatchingEnv(gym.Env):
    def __init__(self, num_observation: int = 23, num_action: int = 42 ):
        self.state = None
        self.num_observation = num_observation
        self.num_action = num_action
        self.observation_space = Box(-np.inf, np.inf, shape=(num_observation,))
        self.action_space = Box(-np.inf, np.inf, shape=(num_action,))

    def reset(self, seed: int = None, options: Optional[Dict[str, Any]] = None):
        super().reset(seed=seed, options=options)
        self.state = self.observation_space.sample()
        return self.state, {}

    def step(self, action):
        reward = -np.abs(self.state - action).mean()
        self.state = self.observation_space.sample()
        return self.state, reward, False, False, {}





In [3]:

gym.register(
    id="custom/ObservationMatchingEnv-v0",
    entry_point=ObservationMatchingEnv,  
    max_episode_steps=500,
)

# Create a single environment for training an expert with SB3
env = gym.make("custom/ObservationMatchingEnv-v0")


In [4]:



# Create a vectorized environment for training with `imitation`

# Option A: use the `make_vec_env` helper function - make sure to pass `post_wrappers=[lambda env, _: RolloutInfoWrapper(env)]`
venv = make_vec_env(
    "custom/ObservationMatchingEnv-v0",
    rng=np.random.default_rng(),
    n_envs=4,
    post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],
)

In [18]:


class ObservationMatchingEnv(gym.Env):
    def __init__(self, observation_dim, action_dim):

        self.current_step = 0

        # Action space
        self.action_dim = action_dim
        self.observation_dim = observation_dim
        self.action_space = Box(low=-1e6, high=1e6, shape=(self.action_dim,), dtype=np.float32)
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(self.observation_dim,), dtype=np.float32)


    def reset(self, seed: int = None, options: Optional[Dict[str, Any]] = None):
        self.state = self.observation_space.sample()
        return self.state, {}

    def step(self, action):
        print(self.state)
        print(action)
        reward = -np.abs(self.state - action).mean()
        self.state = self.observation_space.sample()
        return self.state, reward, False, False, {}
# Register the environment with Gym
# gym.register(
#     id="ObservationMatchingEnv-v0",
#     entry_point=ObservationMatchingEnv(S,A),  # Replace 'your_module_path' with the actual module path
# )

# # Create an instance of the environment
# env = gym.make("ObservationMatchingEnv-v0")


In [10]:
from typing import Dict, Optional, Any
import numpy as np
import gym
from gym import spaces
import torch

from gymnasium.spaces.box import Box
class CustomEnv(gym.Env):
    def __init__(self, S, A, R):
        self.S = torch.tensor(S, dtype=torch.float32)  # Convert S to torch tensor
        self.A = torch.tensor(A, dtype=torch.float32)  # Convert A to torch tensor
        self.R = torch.tensor(R, dtype=torch.float32) if R is not None else None  # Convert R to torch tensor if provided
        self.num_samples = self.S.shape[0]
        self.current_step = 0
        
        # Action space: assume discrete actions for simplicity
        action_dim = self.A.shape[1]
        self.observation_dim = self.S.shape[1]
        self.action_space = Box(low=-1e6, high=1e6, shape=(action_dim,), dtype=np.float32)
        
        # Observation space: separate S and Y with specific labels
        # obs_dim_S = self.S.shape[1]
        # observation_space = {}
        # for i in range(obs_dim_S):
        #     observation_space[f'S_{i+1}'] = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32)
        
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(self.observation_dim,), dtype=np.float32)
        
    # def reset(self, seed: int = None):
    #     self.current_step = 0
    #     return self._get_observation()


    def reset(self, seed: int = None, options: Optional[Dict[str, Any]] = None):
        super().reset(seed=seed, options=options)
        self.state = self.observation_space.sample()
        return self.state, {}
    


    def step(self, action):
        print(self.state)
        print(action)
        reward = -np.abs(self.state - action).mean()
        self.state = self.observation_space.sample()
        return self.state, reward, False, False, {}
    

    def step(self, action):
        # Apply action and get next state and reward
        #next_state = self._get_observation()
        self.state = self.observation_space.sample()
        reward = self._calculate_reward(action)
        #done = (self.current_step == self.num_samples - 1)
        #self.current_step = (self.current_step + 1) % self.num_samples
        return self.state, reward, False, False, {}
    
    def _get_observation(self):
        obs = {}
        for i in range(self.S.shape[1]):
            obs[f'S_{i+1}'] = np.array([self.S[self.current_step, i]])
        return obs
    
    def _calculate_reward(self, action):
        # Example: using a reward matrix R based on expected Y values
        if self.R is not None:
            ideal_y = self.R[action]  # Ideal Y value for the chosen action
            actual_y = self.S[self.current_step]  # Actual Y value
            reward = -torch.norm(actual_y - ideal_y)  # Negative L2 norm difference as reward
        else:
            # Default reward function (replace with your own logic)
            reward = torch.sum(self.A[self.current_step] * action)
        
        return reward

In [11]:
from gymnasium.wrappers import TimeLimit
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.vec_env import DummyVecEnv
import numpy as np

# 假設 S 和 A 是你的狀態和動作矩陣
S = np.random.rand(10000, 9)
A = np.random.rand(1000, 45)
env = CustomEnv(S, A,R=None)

env = TimeLimit(env, max_episode_steps=500)

# Create a vectorized environment for training with `imitation`


# Option A: use a helper function to create multiple environments
def _make_env(s=S,a=A,R=None):
    """Helper function to create a single environment. Put any logic here, but make sure to return a RolloutInfoWrapper."""
    _env = CustomEnv(s,a,R)
    _env = TimeLimit(_env, max_episode_steps=500)
    _env = RolloutInfoWrapper(_env)
    return _env




In [12]:
venv = DummyVecEnv([_make_env for _ in range(4)])

In [13]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from gymnasium.wrappers import TimeLimit

expert = PPO(
    policy=MlpPolicy,
    env=env,
    seed=0,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
    n_steps=64,
)

reward, _ = evaluate_policy(expert, env, 10)
print(f"Reward before training: {reward}")


# Note: if you followed step 2a, i.e. registered the environment, you can use the environment name directly

# expert = PPO(
#     policy=MlpPolicy,
#     env="custom/ObservationMatching-v0",
#     seed=0,
#     batch_size=64,
#     ent_coef=0.0,
#     learning_rate=0.0003,
#     n_epochs=10,
#     n_steps=64,
# )
expert.learn(10_000)  # Note: set to 100000 to train a proficient expert
reward, _ = evaluate_policy(expert, expert.get_env(), 10)
print(f"Expert reward: {reward}")

rng = np.random.default_rng()
rollouts = rollout.rollout(
    expert,
    venv,
    rollout.make_sample_until(min_timesteps=None, min_episodes=50),
    rng=rng,
)
transitions = rollout.flatten_trajectories(rollouts)

from imitation.algorithms import bc

bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    demonstrations=transitions,
    rng=rng,
)

reward_before_training, _ = evaluate_policy(bc_trainer.policy, env, 10)
print(f"Reward before training: {reward_before_training}")

bc_trainer.train(n_epochs=1)
reward_after_training, _ = evaluate_policy( .policy, env, 10)
print(f"Reward after training: {reward_after_training}")

Reward before training: 0.16735349398804827
Reward before training: 0.16735349398804827
Expert reward: 1910.5735375
Reward before training: 0.04479575853911229


0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.0639  |
|    entropy        | 63.9     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 82.5     |
|    loss           | 64       |
|    neglogp        | 64       |
|    prob_true_act  | 1.65e-26 |
|    samples_so_far | 32       |
--------------------------------


480batch [00:01, 291.52batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 500      |
|    ent_loss       | -0.0638  |
|    entropy        | 63.8     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 84.3     |
|    loss           | 63.6     |
|    neglogp        | 63.6     |
|    prob_true_act  | 1.01e-25 |
|    samples_so_far | 16032    |
--------------------------------


812batch [00:02, 272.70batch/s]


Reward after training: 1898.6023386627435


In [1]:
evaluate_policy(bc_trainer.policy, env, 10)

NameError: name 'evaluate_policy' is not defined

In [56]:
expert.policy.load_state_dict(bc_trainer.policy.state_dict())

RuntimeError: Error(s) in loading state_dict for ActorCriticPolicy:
	size mismatch for mlp_extractor.policy_net.0.weight: copying a param with shape torch.Size([32, 9]) from checkpoint, the shape in current model is torch.Size([64, 9]).
	size mismatch for mlp_extractor.policy_net.0.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for mlp_extractor.policy_net.2.weight: copying a param with shape torch.Size([32, 32]) from checkpoint, the shape in current model is torch.Size([64, 64]).
	size mismatch for mlp_extractor.policy_net.2.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for mlp_extractor.value_net.0.weight: copying a param with shape torch.Size([32, 9]) from checkpoint, the shape in current model is torch.Size([64, 9]).
	size mismatch for mlp_extractor.value_net.0.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for mlp_extractor.value_net.2.weight: copying a param with shape torch.Size([32, 32]) from checkpoint, the shape in current model is torch.Size([64, 64]).
	size mismatch for mlp_extractor.value_net.2.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for action_net.weight: copying a param with shape torch.Size([45, 32]) from checkpoint, the shape in current model is torch.Size([45, 64]).
	size mismatch for value_net.weight: copying a param with shape torch.Size([1, 32]) from checkpoint, the shape in current model is torch.Size([1, 64]).

Reward before training: 0.07935524935601279


0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.0639  |
|    entropy        | 63.9     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 82.5     |
|    loss           | 64.8     |
|    neglogp        | 64.9     |
|    prob_true_act  | 2.64e-26 |
|    samples_so_far | 32       |
--------------------------------


475batch [00:01, 273.62batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 500      |
|    ent_loss       | -0.0638  |
|    entropy        | 63.8     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 84.5     |
|    loss           | 62.6     |
|    neglogp        | 62.7     |
|    prob_true_act  | 1.72e-25 |
|    samples_so_far | 16032    |
--------------------------------


812batch [00:02, 271.28batch/s]


Reward after training: 1785.1691998738795


In [51]:
bc_trainer

FeedForward32Policy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=9, out_features=32, bias=True)
      (1): Tanh()
      (2): Linear(in_features=32, out_features=32, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=9, out_features=32, bias=True)
      (1): Tanh()
      (2): Linear(in_features=32, out_features=32, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=32, out_features=45, bias=True)
  (value_net): Linear(in_features=32, out_features=1, bias=True)
)

In [45]:
from gymnasium.wrappers import TimeLimit
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.util.util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

# Create a single environment for training an expert with SB3
#env = gym.make("custom/ObservationMatching-v0")


# Create a vectorized environment for training with `imitation`

# Option A: use the `make_vec_env` helper function - make sure to pass `post_wrappers=[lambda env, _: RolloutInfoWrapper(env)]`
venv = make_vec_env(
    env,
    rng=np.random.default_rng(),
    n_envs=4,
    post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],
)

AssertionError: 