In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import gym  
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
import pickle

In [2]:
# Seed NumPy's global RNG so code that draws from it repeats across fresh runs.
seed = 42
np.random.seed(seed)

### Prepare data

In [3]:
# Pickled expert demonstrations, relative to the notebook's working directory.
# NOTE(review): the file name presumably encodes env id / trajectory count / mean return — confirm.
data_path = "expert_data/Pendulum-v1_10_-130.pkl"

In [4]:
# NOTE(security): pickle.load can execute arbitrary code during deserialisation;
# only open expert-data files that come from a trusted source.
with open(data_path, "rb") as f:
    data_good = pickle.load(f)
print('expert data loaded')

# Use at most the first MAX_EXPERT_TRAJ trajectories from the file.
MAX_EXPERT_TRAJ=20
data_good=data_good[:MAX_EXPERT_TRAJ]
if len(data_good) == 0:
    # Fail here with a clear message instead of an opaque error inside np.vstack.
    raise ValueError(f'no expert trajectories found in {data_path}')

# Every trajectory must unpack into exactly three parts; only the first two are
# used for behaviour cloning (the third is presumably the rewards — TODO confirm).
good_obs=[]
good_acts=[]
for traj_states, traj_actions, _ in data_good:
    good_obs.append(traj_states)
    good_acts.append(traj_actions)

# Stack the per-trajectory arrays row-wise into one feature matrix and one target matrix.
states=np.vstack(good_obs)
actions=np.vstack(good_acts)
assert len(states) == len(actions), 'each state row needs a matching expert action row'
print('X:',states.shape,' y:', actions.shape)

expert data loaded
X: (2000, 3)  y: (2000, 1)


In [5]:
# Feature and target widths, read from the second axis of the stacked arrays.
state_dim, action_dim = states.shape[1], actions.shape[1]
print(state_dim, action_dim)

3 1


### Train Model

In [6]:
# A linear model fits the expert actions poorly (training R^2 of about 0.23, see the score below), so RandomForestRegressor is used instead.
# model=LinearRegression()
# model.fit(states, actions)
# score=model.score(states, actions)
# print('score:', score) #score: 0.2268964998032269

In [7]:
# Behaviour-cloning policy: a random forest regressing expert actions on states.
# random_state=seed makes re-running this cell rebuild the same forest.
# oob_score=True also computes an out-of-bag R^2 during fit, which is a less
# optimistic estimate than scoring on the very rows the forest was trained on.
model=RandomForestRegressor(random_state=seed, oob_score=True)
# For a single-output target scikit-learn expects a 1-D y; passing the (n, 1) column
# only works via a DataConversionWarning, which the global warnings filter hides.
targets=actions.ravel() if actions.shape[1] == 1 else actions
model.fit(states, targets)
score=model.score(states, targets)
print('score:', score)
print('oob score:', model.oob_score_)
# The training-set score is much better than the linear baseline, so this model is
# used as the BC policy. A high training R^2 does not by itself guarantee good
# closed-loop behaviour; judge the policy by the rollout rewards below.

score: 0.9800361995773669


### Inference

In [8]:
# Build the environment once so its action space can be inspected as the cell output.
env_name = 'Pendulum-v1'
env = gym.make(env_name)
env.action_space

Box(-2.0, 2.0, (1,), float32)

In [9]:
def play_an_episode(env_name, model, render=False, max_step=500, seed=None):
    """Run `model` as a policy for one episode and return the summed reward.

    Parameters
    ----------
    env_name : str
        Environment id handed to ``gym.make``.
    model : object
        Fitted regressor exposing ``predict(X)``; it is called with a batch
        containing the single current state.
    render : bool, default False
        When True the environment is created with ``render_mode='human'``.
    max_step : int, default 500
        Upper bound on the number of environment steps; the episode also ends
        as soon as the environment reports termination or truncation.
    seed : int or None, default None
        Forwarded to ``env.reset(seed=...)``. Pass an integer to make the
        initial state reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    float
        Sum of the rewards collected during the episode.
    """
    make_kwargs = {'render_mode': 'human'} if render else {}
    env = gym.make(env_name, **make_kwargs)
    total_reward = 0.0
    # try/finally guarantees the environment (and any render window) is released
    # even if the model or the environment raises mid-episode.
    try:
        state, info = env.reset(seed=seed)
        for _ in range(max_step):
            if render:
                env.render()
            prediction = model.predict([state])[0]
            # predict() yields a scalar for 1-D targets but a length-k row for 2-D
            # targets; coerce both to the action space's declared shape and dtype
            # instead of blindly wrapping the value in a list.
            action = np.asarray(prediction, dtype=env.action_space.dtype)
            action = action.reshape(env.action_space.shape)
            state, reward, done, trunc, info = env.step(action)
            total_reward += reward
            if done or trunc:
                break
    finally:
        env.close()
    return total_reward

### Single rollout with visualization

In [10]:
# One rendered episode as a visual sanity check of the cloned policy.
r = play_an_episode(env_name, model, render=True)
print('reward:', r)

reward: -801.2741738335666


### Mean reward over rollouts

In [11]:
# Roll the policy out n_trajectory times (no rendering) and summarise the returns.
n_trajectory = 20
scores = []
for i in range(n_trajectory):
    rewards = play_an_episode(env_name, model)
    scores.append(rewards)
    print(f'episode #{i} reward: {rewards:0.2f}')

# Mean and standard deviation of the per-episode returns.
print(f'\n score: {np.mean(scores):0.2f} +- {np.std(scores):0.2f}')

episode #0 reward: -871.44
episode #1 reward: -823.56
episode #2 reward: -1052.60
episode #3 reward: -1112.49
episode #4 reward: -1032.93
episode #5 reward: -808.81
episode #6 reward: -754.43
episode #7 reward: -766.68
episode #8 reward: -879.57
episode #9 reward: -868.32
episode #10 reward: -853.92
episode #11 reward: -1034.77
episode #12 reward: -120.58
episode #13 reward: -1039.09
episode #14 reward: -1053.02
episode #15 reward: -1124.66
episode #16 reward: -288.57
episode #17 reward: -121.67
episode #18 reward: -520.36
episode #19 reward: -894.79

 score: -801.11 +- 299.22
