# Import dependencies

In [51]:
import gym
from tensorforce.agents import Agent
from tensorforce.execution import Runner
from tensorforce.environments import Environment

# Test out Environment

In [7]:
# Environment id plus a step budget; max_timesteps is not referenced by any
# other cell visible in this notebook.
env_name = 'BipedalWalker-v3'
max_timesteps = 1000
# Create the environment with on-screen rendering enabled.
env = gym.make(
    env_name,
    hardcore=False, 
    render_mode='human'
) # hardcore=True selects the harder terrain variant

In [None]:
# Sanity-check the environment: play a few episodes with uniformly sampled
# random actions and print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    env.reset()  # the initial observation is not needed for random play
    done = False
    score = 0

    while not done:
        # No explicit env.render() call: the environment was created with
        # render_mode='human', which already draws a frame inside step(), so an
        # extra call would render (and frame-rate throttle) every step twice.
        action = env.action_space.sample()
        _obs, reward, terminated, truncated, _info = env.step(action)
        # An episode ends either on a real terminal state or on truncation.
        done = terminated or truncated
        score += reward
    print(f"Episode: {episode}, Score: {score}")
# env.close()  # kept disabled: the following cells still inspect env's spaces

In [11]:
# Lengths of the observation and action vectors, read from the first
# dimension of each space's shape.
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
num_states, num_actions  # shown as the cell output

(24, 4)

In [5]:
# Display the observation space (bounds, shape and dtype).
env.observation_space

Box([-3.1415927 -5.        -5.        -5.        -3.1415927 -5.
 -3.1415927 -5.        -0.        -3.1415927 -5.        -3.1415927
 -5.        -0.        -1.        -1.        -1.        -1.
 -1.        -1.        -1.        -1.        -1.        -1.       ], [3.1415927 5.        5.        5.        3.1415927 5.        3.1415927
 5.        5.        3.1415927 5.        3.1415927 5.        5.
 1.        1.        1.        1.        1.        1.        1.
 1.        1.        1.       ], (24,), float32)

In [6]:
# Draw one random point from the observation space to see what a state looks like.
env.observation_space.sample()

array([-3.042484  , -3.0323431 ,  1.0615405 , -1.5031806 ,  0.7808833 ,
        0.31911507, -1.0491439 ,  1.6969076 ,  4.001736  ,  0.07369177,
        4.5867047 ,  0.86530906, -3.936124  ,  4.3732085 , -0.7828703 ,
       -0.6893783 ,  0.552655  , -0.48335713,  0.50244004, -0.61910814,
        0.00683067, -0.14623961,  0.9170417 ,  0.33065104], dtype=float32)

In [8]:
# Display the action space (bounds, shape and dtype).
env.action_space

Box(-1.0, 1.0, (4,), float32)

In [9]:
# Draw one random action to see what an action vector looks like.
env.action_space.sample()

array([ 0.91150147,  0.12956218, -0.0660586 , -0.42389414], dtype=float32)

# Build the Agent

## Proximal Policy Optimization

In [4]:
from tensorforce.agents import PPOAgent

In [14]:
# PPO agent acting directly on the continuous action vector.
# States are declared as floats bounded to [-5, 5], actions to [-1, 1].
agent = PPOAgent(
    states={'type': 'float', 'shape': (num_states,), 'min_value': -5.0, 'max_value': 5.0},
    actions={'type': 'float', 'shape': (num_actions,), 'min_value': -1.0, 'max_value': 1.0},
    # Two identical 64-unit dense layers with tanh activation.
    network=[{'type': 'dense', 'size': 64, 'activation': 'tanh'} for _ in range(2)],
    max_episode_timesteps=2000,
    batch_size=32,
)
# The agent is built via its constructor, so it is initialized explicitly.
agent.initialize()

In [59]:
# Filename prefix used by the training cell when it writes checkpoints.
agent_name = "ppo_bipedal_walker"

## Deep Deterministic Policy Gradient

In [5]:
from tensorforce.agents import DPGAgent

In [20]:
# Deterministic policy gradient agent on the continuous action vector.
# Uses the same state/action bounds and network layout as the other agents
# in this notebook, plus a memory of 2000.
agent = DPGAgent(
    states={'type': 'float', 'shape': (num_states,), 'min_value': -5.0, 'max_value': 5.0},
    actions={'type': 'float', 'shape': (num_actions,), 'min_value': -1.0, 'max_value': 1.0},
    # Two identical 64-unit dense layers with tanh activation.
    network=[{'type': 'dense', 'size': 64, 'activation': 'tanh'} for _ in range(2)],
    memory=2000,
    batch_size=32,
)
# The agent is built via its constructor, so it is initialized explicitly.
agent.initialize()

In [56]:
# Filename prefix used by the training cell when it writes checkpoints.
agent_name = "dpg_bipedal_walker"

## Trust Region Policy Optimization

In [37]:
from tensorforce.agents import TRPOAgent

In [44]:
# TRPO agent on the continuous action vector, with a memory of 70000 and
# episodes capped at 2000 timesteps.
agent = TRPOAgent(
    states={'type': 'float', 'shape': (num_states,), 'min_value': -5.0, 'max_value': 5.0},
    actions={'type': 'float', 'shape': (num_actions,), 'min_value': -1.0, 'max_value': 1.0},
    # Two identical 64-unit dense layers with tanh activation.
    network=[{'type': 'dense', 'size': 64, 'activation': 'tanh'} for _ in range(2)],
    max_episode_timesteps=2000,
    memory=70000,
    batch_size=32,
)
# The agent is built via its constructor, so it is initialized explicitly.
agent.initialize()

In [45]:
# Filename prefix used by the training cell when it writes checkpoints.
agent_name = 'trpo_bipedal_walker'

# Train Model

In [46]:
# Training environment without a render_mode, i.e. no on-screen window.
# NOTE(review): this rebinds env without closing the environment created earlier.
env = gym.make(env_name, hardcore=False) 

In [47]:
# Alternative training environment with on-screen rendering.
# NOTE(review): running this cell rebinds env, so training would then render
# every step; run only one of the two env-creation cells.
env = gym.make(env_name, hardcore=False, render_mode='human')

In [16]:
#runner = Runner(agent=agent, environment=env, max_episode_timesteps=2000)
#runner.run(num_episodes=100)

In [48]:
# Train the current agent with a manual act/observe loop, printing the total
# reward of every episode and saving a checkpoint every 50 episodes.
train_episodes = 500
for episode in range(1, train_episodes + 1):
    # reset() returns (observation, info); only the observation feeds the agent.
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # Tensorforce distinguishes a real terminal state (1) from an episode
        # that was merely aborted (2), e.g. cut off by the time limit. Passing a
        # plain bool would report truncation as a true terminal state.
        terminal = 1 if terminated else (2 if truncated else 0)
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward

    print(f'Episode {episode}: Total reward {episode_reward}')
    # Periodic checkpoint named after the agent and the episode count.
    if episode % 50 == 0:
        agent.save(directory='saved_models', filename=f"{agent_name}_{episode}", format='hdf5')

Episode 1: Total reward -119.48114738231541
Episode 2: Total reward -115.8765998029386
Episode 3: Total reward -95.50255156400267
Episode 4: Total reward -25.831186957031612
Episode 5: Total reward -25.285421124144143
Episode 6: Total reward -110.86824906176139
Episode 7: Total reward -94.9889597266938
Episode 8: Total reward -95.22831599149264
Episode 9: Total reward -119.53177379675613
Episode 10: Total reward -111.28975444382274
Episode 11: Total reward -93.86788546850102
Episode 12: Total reward -28.486527646287346
Episode 13: Total reward -95.17814144362588
Episode 14: Total reward -101.4793435432689
Episode 15: Total reward -96.64204278482683
Episode 16: Total reward -96.8455178333111
Episode 17: Total reward -96.82380969696966
Episode 18: Total reward -19.993985942689264
Episode 19: Total reward -22.836997583341834
Episode 20: Total reward -96.46227701591685
Episode 21: Total reward -95.29236262957511
Episode 22: Total reward -24.477241538281092
Episode 23: Total reward -96.1020

# Save and load the model

In [9]:
# Save the trained agent into saved_models under a fixed name.
# NOTE(review): the filename is hard-coded to the PPO prefix rather than built
# from agent_name, and no format is passed (the training loop uses 'hdf5') —
# confirm this is intended when a different agent is active.
agent.save(directory='saved_models', filename='ppo_bipedal_walker_500')

'saved_models\\ppo_bipedal_walker_500-2'

In [48]:
# Close the current agent before a saved one is loaded in the next cell.
agent.close()

In [60]:
# Restore the checkpoint written at episode 250 for the current agent_name.
agent = Agent.load(directory='saved_models', filename=f'{agent_name}_250')

# Test the model

In [24]:
# Evaluation environment with on-screen rendering so the policy can be watched.
env = gym.make(env_name, hardcore=False, render_mode='human')

In [None]:
# Evaluate the loaded agent for a few episodes and print each episode's total
# reward. act(..., independent=True) is used, so no observe() call follows.
test_episodes = 3
for episode in range(1, test_episodes + 1):
    obs = env.reset()[0]  # reset() returns (observation, info)
    episode_reward = 0
    done = False
    while not done:
        action = agent.act(obs, independent=True)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        done = terminated or truncated

    print(f'Episode {episode}: Total reward {episode_reward}')

In [33]:
# Close the evaluation environment now that testing is done.
env.close()

# Trying out new action method

In [63]:
import numpy as np

In [64]:
# Discretise the 4-dimensional continuous action: enumerate every combination
# of {-1, 0, 1} across the four components (3**4 = 81 actions). The first
# component varies slowest and the last one fastest.
levels = (-1, 0, 1)
action_lst = [
    np.array([a, b, c, d])
    for a in levels
    for b in levels
    for c in levels
    for d in levels
]


In [66]:
# Size of the discrete action set; passed to the agents as num_values.
num_discrete_actions = len(action_lst)

## Building the agent

### DQN Agent

In [69]:
from tensorforce.agents import DQNAgent

In [72]:
# DQN agent choosing an integer index into the discrete action list instead of
# emitting a continuous action vector.
agent = DQNAgent(
    states={'type': 'float', 'shape': (num_states,), 'min_value': -5.0, 'max_value': 5.0},
    actions={'type': 'int', 'num_values': num_discrete_actions},
    # Two identical 64-unit dense layers with tanh activation.
    network=[{'type': 'dense', 'size': 64, 'activation': 'tanh'} for _ in range(2)],
    memory=2000,
    max_episode_timesteps=2000,
    batch_size=32,
)
# The agent is built via its constructor, so it is initialized explicitly.
agent.initialize()

In [75]:
# Filename prefix used by the training cells when they write checkpoints.
agent_name = "dqn_bipedal_walker"

### PPO Agent

In [91]:
from tensorforce.agents import PPOAgent

In [92]:
# PPO agent choosing an integer index into the discrete action list, with a
# larger network than the continuous-action agents above.
agent = PPOAgent(
    states={'type': 'float', 'shape': (num_states,), 'min_value': -5.0, 'max_value': 5.0},
    actions={'type': 'int', 'num_values': num_discrete_actions},
    # Three identical 128-unit dense layers with tanh activation.
    network=[{'type': 'dense', 'size': 128, 'activation': 'tanh'} for _ in range(3)],
    max_episode_timesteps=2000,
    batch_size=32,
)
# The agent is built via its constructor, so it is initialized explicitly.
agent.initialize()

In [93]:
# Filename prefix used by the training cells when they write checkpoints.
agent_name = "ppo_bipedal_walker_disc"

## Train and Test the new agent

In [94]:
# Environment shared by the training and test cells of the discrete-action agent.
# NOTE(review): render_mode='human' means training below also renders each
# step — confirm that is wanted for long runs.
env = gym.make(env_name, hardcore=False, render_mode='human')

In [95]:
# Train the discrete-action agent: the agent picks an index, which is mapped
# to a 4-component action vector through action_lst. A checkpoint is saved
# every 50 episodes.
train_episodes = 500
for episode in range(1, train_episodes + 1):
    # reset() returns (observation, info); only the observation feeds the agent.
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action_index = agent.act(obs)
        action = action_lst[action_index]
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # Tensorforce distinguishes a real terminal state (1) from an episode
        # that was merely aborted (2), e.g. cut off by the time limit. Passing a
        # plain bool would report truncation as a true terminal state.
        terminal = 1 if terminated else (2 if truncated else 0)
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward

    print(f'Episode {episode}: Total reward {episode_reward}')
    # Periodic checkpoint named after the agent and the episode count.
    if episode % 50 == 0:
        agent.save(directory='saved_models', filename=f"{agent_name}_{episode}", format='hdf5')

Episode 1: Total reward -121.03840205319162
Episode 2: Total reward -122.31448413087115
Episode 3: Total reward -135.8044498032927
Episode 4: Total reward -139.69465966808232
Episode 5: Total reward -120.43042660486265
Episode 6: Total reward -114.37610593854903
Episode 7: Total reward -124.99334760192446
Episode 8: Total reward -99.86300241772085
Episode 9: Total reward -106.08530386997387
Episode 10: Total reward -133.2517218920747
Episode 11: Total reward -103.68425241738682
Episode 12: Total reward -119.50014139907931
Episode 13: Total reward -218.9942951153041
Episode 14: Total reward -106.78710216686315
Episode 15: Total reward -129.1142457258695
Episode 16: Total reward -114.8061080098236
Episode 17: Total reward -102.9378518262996
Episode 18: Total reward -108.19006905036109
Episode 19: Total reward -106.6068578686174
Episode 20: Total reward -117.15735161819494
Episode 21: Total reward -102.82360372487331
Episode 22: Total reward -106.26656314820546
Episode 23: Total reward -1

In [96]:
# Continue training the discrete-action agent for another 4500 episodes.
# Episode numbering resumes at 501 so checkpoint names follow on from the
# first 500-episode run.
train_episodes = 4500
for episode in range(501, train_episodes + 501):
    # reset() returns (observation, info); only the observation feeds the agent.
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action_index = agent.act(obs)
        action = action_lst[action_index]
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # Tensorforce distinguishes a real terminal state (1) from an episode
        # that was merely aborted (2), e.g. cut off by the time limit. Passing a
        # plain bool would report truncation as a true terminal state.
        terminal = 1 if terminated else (2 if truncated else 0)
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward

    print(f'Episode {episode}: Total reward {episode_reward}')
    # Periodic checkpoint named after the agent and the episode count.
    if episode % 50 == 0:
        agent.save(directory='saved_models', filename=f"{agent_name}_{episode}", format='hdf5')

Episode 501: Total reward -84.80512623193088
Episode 502: Total reward -111.80144708919711
Episode 503: Total reward -81.8885755212322
Episode 504: Total reward -89.14106943044834
Episode 505: Total reward -112.72179488877146
Episode 506: Total reward -93.20774698873869
Episode 507: Total reward -89.21478597669224
Episode 508: Total reward -86.73062937059419
Episode 509: Total reward -86.87118265781889
Episode 510: Total reward -95.25077673147256
Episode 511: Total reward -87.09966521734276
Episode 512: Total reward -89.43126222459875
Episode 513: Total reward -86.50050831139848
Episode 514: Total reward -93.2055811555357
Episode 515: Total reward -113.81984605588268
Episode 516: Total reward -87.85920351430863
Episode 517: Total reward -80.63695733760247
Episode 518: Total reward -89.37743591230512
Episode 519: Total reward -84.37425925599004
Episode 520: Total reward -80.65231508113388
Episode 521: Total reward -85.16664226355418
Episode 522: Total reward -83.29646091639599
Episode 5

KeyboardInterrupt: 

In [90]:
# Evaluate the discrete-action agent for a few episodes and print each
# episode's total reward. act(..., independent=True) is used, so no observe()
# call follows.
test_episodes = 3
for episode in range(1, test_episodes + 1):
    obs = env.reset()[0]  # reset() returns (observation, info)
    episode_reward = 0
    done = False

    while not done:
        # The agent returns an index; look up the matching action vector.
        action_index = agent.act(obs, independent=True)
        obs, reward, terminated, truncated, info = env.step(action_lst[action_index])
        episode_reward += reward
        done = terminated or truncated

    print(f'Episode {episode}: Total reward {episode_reward}')

Episode 1: Total reward -118.57975477016217
Episode 2: Total reward -118.97372378463606
Episode 3: Total reward -118.79407228523118


In [89]:
# Restore the checkpoint written at episode 300 for the current agent_name.
agent = Agent.load(directory='saved_models', filename=f'{agent_name}_300')