In [1]:
import numpy as np  
from matplotlib import pyplot as plt
import gym
import pickle 
 
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier

### Prepare data

In [2]:
# Expert demonstrations to imitate; swap in one of the alternatives to change the demo set.
# data_path = "expert_data/human_demos_4_-111.0.pkl"
# data_path = "expert_data/human_demos_20_-113.0.pkl"
data_path = "expert_data/human_demos_10_-115.0.pkl"

# NOTE: pickle.load can execute arbitrary code, so only load demo files you trust.
with open(data_path, mode="rb") as f:
    trajs = pickle.load(f)

print(f"Number of trajectories: {len(trajs)}") 

Number of trajectories: 10


In [3]:
# Flatten all trajectories into two parallel arrays: one row per (state, action) step.
states = np.array([state for traj in trajs for state, _ in traj])
actions = np.array([action for traj in trajs for _, action in traj])

# Show both shapes as the cell output.
states.shape, actions.shape

((1138, 2), (1138,))

In [4]:
# Action labels start at 0, so the largest label plus one is used as the class count.
num_classes = actions.max() + 1
num_classes

3

### Training model 

In [5]:
# Logistic regression is a linear model, so it fits this non-linear state-to-action mapping poorly.
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
# model.fit(states, actions)
# model.score(states, actions)  #0.7504393673110721

In [6]:
# Use a random forest instead of the linear model.
# random_state is fixed so that re-running the notebook fits the same forest.
# NOTE: the score shown below is accuracy on the training data itself; it shows how
# closely the forest reproduces the demonstrations, not how well it generalises.
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(states, actions)
model.score(states, actions)

1.0

### Inference

In [7]:
# Create the environment once to look at its action space.
env_name = "MountainCar-v0"
env = gym.make(env_name)
env.action_space

Discrete(3)

In [8]:
def play_an_episode(env_name, model, render=False, max_step=500, seed=None):
    """Run one episode of `env_name`, picking actions with `model`, and return the total reward.

    Args:
        env_name: Environment id passed to `gym.make`.
        model: Object whose `predict([state])` returns a sequence whose first
            element is the action to take.
        render: If True, the environment is created with `render_mode='human'`.
        max_step: Upper bound on the number of environment steps taken.
        seed: Optional seed forwarded to `env.reset` so a rollout can be
            repeated; None (the default) leaves the reset unseeded.

    Returns:
        The sum of the rewards received until the episode terminated, was
        truncated, or `max_step` steps had been taken.
    """
    if render:
        env = gym.make(env_name, render_mode='human')
    else:
        env = gym.make(env_name)

    total_reward = 0
    # try/finally: the environment is closed even when the model or the
    # environment raises in the middle of an episode.
    try:
        state, _ = env.reset(seed=seed)
        for _ in range(max_step):
            if render:
                # NOTE(review): with render_mode='human' recent gym versions appear to
                # draw on every step already, so this call may be redundant — confirm.
                env.render()
            action = model.predict([state])[0]
            state, reward, done, trunc, _ = env.step(action)
            total_reward += reward
            if done or trunc:
                break
    finally:
        env.close()
    return total_reward

### Single rollout (pop-up window)

In [9]:
# Play one rendered episode; its total reward is shown as the cell output.
r = play_an_episode(env_name, model, render=True)
r

-159.0

### Mean reward over rollouts

In [10]:
# Play n unrendered episodes, logging each total reward, then summarise them.
n = 20
rewards = []
for i in range(n):
    r = play_an_episode(env_name, model)
    print(f'episode {i}, reward: {r}')
    rewards.append(r)

# Mean and (population) standard deviation of the episode rewards.
r_mean, r_std = np.mean(rewards), np.std(rewards)

print(f'\nreward: {r_mean:0.2f} +- {r_std:0.2f}')

episode 0, reward: -90.0
episode 1, reward: -161.0
episode 2, reward: -83.0
episode 3, reward: -166.0
episode 4, reward: -143.0
episode 5, reward: -145.0
episode 6, reward: -177.0
episode 7, reward: -183.0
episode 8, reward: -146.0
episode 9, reward: -166.0
episode 10, reward: -84.0
episode 11, reward: -85.0
episode 12, reward: -182.0
episode 13, reward: -159.0
episode 14, reward: -91.0
episode 15, reward: -166.0
episode 16, reward: -167.0
episode 17, reward: -97.0
episode 18, reward: -92.0
episode 19, reward: -156.0

reward: -136.95 +- 36.84
