In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pickle

import gym
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.policies import ActorCriticPolicy, ActorCriticCnnPolicy
from stable_baselines3.ppo import MlpPolicy

  from urllib3.contrib.pyopenssl import orig_util_SSLContext as SSLContext


### Using SAC-Trained Expert Models
* Pretrained SAC checkpoints were downloaded from https://huggingface.co/sb3

In [3]:
def collect_trajectory(env, policy_net, n_trajectory=20, epsilon=0.0, log_every=1):
    """Roll out ``policy_net`` in ``env`` and record every episode.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns a 5-tuple
            ``(observation, reward, flag_a, flag_b, info)``; the episode ends
            as soon as either flag is truthy (presumably the
            terminated/truncated pair of the newer Gym step API).
        policy_net: Object exposing ``predict(observation, deterministic=True)``
            that returns a pair whose first item is the action.
        n_trajectory: Number of episodes to roll out.
        epsilon: When non-zero, every step draws ``np.random.random()`` and,
            if the draw exceeds ``1 - epsilon``, replaces the policy action
            with ``env.action_space.sample()``. When zero, no random number
            is drawn and the policy is always used.
        log_every: Print the episode score every ``log_every`` episodes
            (default ``1`` = every episode). ``0`` or ``None`` disables the
            per-episode output.

    Returns:
        ``(scores, trajectories)`` where ``scores`` holds the summed reward of
        each episode and ``trajectories`` holds one
        ``(states, actions, rewards)`` tuple per episode, each item being the
        per-step values stacked row-wise with ``np.vstack``. The stored state
        at step ``t`` is the observation the action was chosen from.

    Note:
        ``env.close()`` is called before returning, so the caller's
        environment is closed once this function finishes.
    """
    scores = []
    trajectories = []
    for episode in range(n_trajectory):
        state, _ = env.reset()
        score = 0
        states, actions, rewards = [], [], []
        while True:
            # Short-circuit keeps the epsilon == 0 path free of RNG draws,
            # so purely deterministic rollouts do not advance numpy's RNG.
            explore = epsilon != 0 and np.random.random() > (1 - epsilon)
            if explore:
                action = env.action_space.sample()
            else:
                action, _ = policy_net.predict(state, deterministic=True)

            next_state, reward, terminated, truncated, _ = env.step(action)
            score += reward

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state
            if terminated or truncated:
                break

        scores.append(score)
        trajectories.append(
            (np.vstack(states), np.vstack(actions), np.vstack(rewards))
        )
        if log_every and episode % log_every == 0:
            print('{} episode score is {:.2f}'.format(episode, score))
    env.close()
    return scores, trajectories

### Ant

In [4]:
# Load the SAC checkpoint for Ant-v3 from the local model folder.
savepath = "trained_models/sb3_sac_Ant-v3.zip"
model = SAC.load(
    savepath,
    print_system_info=False,
)

In [5]:
# Build the environment the loaded Ant model is rolled out in; `env_name` is
# reused further down when naming the saved trajectory file.
env_name = 'Ant-v3'
env_t = gym.make(env_name)

In [6]:
# Roll out the loaded model for a fixed number of episodes and report the
# average episode score.
n_trajectory = 10

print('collecting trajectories ...')
scores, trajectories = collect_trajectory(
    env_t,
    model,
    n_trajectory,
)

mean = np.mean(scores)
print('mean score:', mean)

collecting trajectories ...
0 episode score is 5246.79
1 episode score is 3756.22
2 episode score is 4987.11
3 episode score is 5392.41
4 episode score is 5097.51
5 episode score is 5185.53
6 episode score is 5309.34
7 episode score is 5258.45
8 episode score is 5468.29
9 episode score is 4914.24
mean score: 5061.589123630422


In [7]:
# Persist the collected rollouts. The file name encodes the environment, the
# number of episodes and the mean score truncated to an integer.
filename = f"expert_data/{env_name}_{n_trajectory}_{int(mean)}.pkl"
print('saving ',filename)

# Create the output folder on demand so a fresh checkout does not fail here
# after the (slow) rollout has already finished.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'wb') as f:
    pickle.dump(trajectories, f)
print('trajectories saved.')

saving  expert_data/Ant-v3_10_5061.pkl
trajectories saved.


### HalfCheetah

In [7]:
# Switch the experiment to HalfCheetah: name the environment and checkpoint,
# then create the environment and load the matching SAC model.
env_name = 'HalfCheetah-v3'
savepath = "trained_models/sb3_sac_HalfCheetah-v3.zip"

env_t = gym.make(env_name)
model = SAC.load(
    savepath,
    print_system_info=False,
)

In [8]:
# Roll out the loaded model for a fixed number of episodes and report the
# average episode score.
n_trajectory = 50

print('collecting trajectories ...')
scores, trajectories = collect_trajectory(
    env_t,
    model,
    n_trajectory,
)

mean = np.mean(scores)
print('mean score:', mean)

collecting trajectories ...


0 episode score is 9529.82
1 episode score is 9443.23
2 episode score is 9528.88
3 episode score is 9465.52
4 episode score is 9413.04
5 episode score is 9401.77
6 episode score is 9578.25
7 episode score is 9475.94
8 episode score is 9473.76
9 episode score is 9459.59
10 episode score is 9657.36
11 episode score is 9571.92
12 episode score is 9525.01
13 episode score is 9504.80
14 episode score is 9738.49
15 episode score is 9489.50
16 episode score is 9603.78
17 episode score is 9487.93
18 episode score is 9312.35
19 episode score is 9559.39
20 episode score is 9391.63
21 episode score is 9451.26
22 episode score is 9502.53
23 episode score is 9419.64
24 episode score is 9478.82
25 episode score is 9433.41
26 episode score is 9500.71
27 episode score is 9497.89
28 episode score is 9557.21
29 episode score is 9482.79
30 episode score is 9500.97
31 episode score is 9599.10
32 episode score is 9564.33
33 episode score is 9541.36
34 episode score is 9460.36
35 episode score is 9524.60
36

In [9]:
# Persist the collected rollouts. The file name encodes the environment, the
# number of episodes and the mean score truncated to an integer.
filename = f"expert_data/{env_name}_{n_trajectory}_{int(mean)}.pkl"
print('saving ',filename)

# Create the output folder on demand so a fresh checkout does not fail here
# after the (slow) rollout has already finished.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'wb') as f:
    pickle.dump(trajectories, f)
print('trajectories saved.')

saving  expert_data/HalfCheetah-v3_50_9520.pkl
trajectories saved.


### Humanoid

In [4]:
# Switch the experiment to Humanoid: name the environment and checkpoint,
# then create the environment and load the matching SAC model.
env_name = 'Humanoid-v3'
savepath = "trained_models/sb3_sac_Humanoid-v3.zip"

env_t = gym.make(env_name)
model = SAC.load(
    savepath,
    print_system_info=False,
)

In [5]:
# Roll out the loaded model for a fixed number of episodes and report the
# average episode score.
n_trajectory = 50

print('collecting trajectories ...')
scores, trajectories = collect_trajectory(
    env_t,
    model,
    n_trajectory,
)

mean = np.mean(scores)
print('mean score:', mean)

collecting trajectories ...
0 episode score is 6260.53
1 episode score is 6256.61
2 episode score is 6253.19
3 episode score is 6228.46
4 episode score is 6252.08
5 episode score is 6233.50
6 episode score is 6282.12
7 episode score is 6257.20
8 episode score is 6294.81
9 episode score is 6282.26
10 episode score is 6292.18
11 episode score is 6212.32
12 episode score is 6312.45
13 episode score is 6237.54
14 episode score is 6282.81
15 episode score is 6272.98
16 episode score is 6277.93
17 episode score is 6240.83
18 episode score is 6255.95
19 episode score is 6274.40
20 episode score is 6246.75
21 episode score is 6226.93
22 episode score is 6235.50
23 episode score is 6265.73
24 episode score is 6267.14
25 episode score is 6255.98
26 episode score is 6256.06
27 episode score is 6229.69
28 episode score is 6296.29
29 episode score is 6235.81
30 episode score is 6266.86
31 episode score is 6267.82
32 episode score is 6225.71
33 episode score is 6273.90
34 episode score is 6281.40
35

In [6]:
# Persist the collected rollouts. The file name encodes the environment, the
# number of episodes and the mean score truncated to an integer.
filename = f"expert_data/{env_name}_{n_trajectory}_{int(mean)}.pkl"
print('saving ',filename)

# Create the output folder on demand so a fresh checkout does not fail here
# after the (slow) rollout has already finished.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'wb') as f:
    pickle.dump(trajectories, f)
print('trajectories saved.')

saving  expert_data/Humanoid-v3_50_6261.pkl
trajectories saved.
