# Import Dependencies

In [3]:
import tensorflow as tf
from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
import gym

'2.8.0'

# Test environment

In [57]:
# Gym environment id used by every gym.make() call in this notebook.
env_name = 'CartPole-v1'
# Default environment instance (no render_mode requested).
env = gym.make(env_name)

In [20]:
# Smoke-test the environment with a uniformly random policy before any training.
episodes = 5
for episode in range(1, episodes + 1):
    # The random policy never looks at observations, so the (obs, info) tuple
    # returned by reset() is not stored.
    env.reset()
    done = False
    score = 0

    while not done:
        # No env.render() call here: `env` was created without a render_mode, so
        # render() has nothing to display (recent Gym versions only log a warning).
        # Rendering is enabled by passing render_mode='human' to gym.make().
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, info = env.step(action)
        # An episode ends on failure (terminated) or on the time limit (truncated).
        done = terminated or truncated
        score += reward
    print(f"Episode: {episode}, Score: {score}")
# `env` is left open on purpose: the training cells below reuse it.

Episode: 1, Score: 27.0
Episode: 2, Score: 17.0
Episode: 3, Score: 17.0
Episode: 4, Score: 25.0
Episode: 5, Score: 23.0


In [21]:
# Display the environment's observation space as the cell output.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [22]:
# Draw one random point from the observation space to see what a state looks like.
env.observation_space.sample()

array([-2.1200185e+00, -1.3063080e+38,  1.7907065e-01, -3.3503796e+38],
      dtype=float32)

In [23]:
# Display the environment's action space as the cell output.
env.action_space

Discrete(2)

In [24]:
# Draw one random action from the action space.
env.action_space.sample()

1

# Train the model

In [51]:
# PPO agent for the environment created above.
# References:
#   https://tensorforce.readthedocs.io/en/latest/agents/ppo.html
#   https://tensorforce.readthedocs.io/en/0.2.0/agents_models.html

# State and action specifications are read from the environment's spaces.
state_spec = {'type': 'float', 'shape': env.observation_space.shape}
action_spec = {'type': 'int', 'num_values': env.action_space.n}

# Policy network: two dense layers of 64 units each.
hidden_layers = [
    {'type': 'dense', 'size': 64},
    {'type': 'dense', 'size': 64},
]

agent = PPOAgent(
    states=state_spec,
    actions=action_spec,
    network=hidden_layers,
    batch_size=32,
    max_episode_timesteps=500,
)

# The agent is constructed directly, so it is initialized explicitly before use.
agent.initialize()



In [58]:
# Train the agent with a manual act/observe loop and print each episode's return.
train_episodes = 100
for episode in range(1, train_episodes + 1):
    # reset() returns (observation, info); only the observation is fed to the agent.
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Tensorforce's observe() takes terminal as 0, 1 or 2: 1 is a genuine
        # terminal state, 2 is an aborted episode. A time-limit truncation is an
        # abort, not a failure, so it must not be reported as a true terminal.
        if terminated:
            terminal = 1
        elif truncated:
            terminal = 2
        else:
            terminal = 0
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward

    print(f'Episode {episode}: Total reward {episode_reward}')

Episode 1: Total reward 89.0
Episode 2: Total reward 112.0
Episode 3: Total reward 226.0
Episode 4: Total reward 176.0
Episode 5: Total reward 140.0
Episode 6: Total reward 176.0
Episode 7: Total reward 162.0
Episode 8: Total reward 155.0
Episode 9: Total reward 227.0
Episode 10: Total reward 181.0
Episode 11: Total reward 180.0
Episode 12: Total reward 126.0
Episode 13: Total reward 228.0
Episode 14: Total reward 117.0
Episode 15: Total reward 128.0
Episode 16: Total reward 109.0
Episode 17: Total reward 91.0
Episode 18: Total reward 71.0
Episode 19: Total reward 129.0
Episode 20: Total reward 123.0
Episode 21: Total reward 100.0
Episode 22: Total reward 106.0
Episode 23: Total reward 108.0
Episode 24: Total reward 71.0
Episode 25: Total reward 254.0
Episode 26: Total reward 218.0
Episode 27: Total reward 208.0
Episode 28: Total reward 144.0
Episode 29: Total reward 350.0
Episode 30: Total reward 157.0
Episode 31: Total reward 190.0
Episode 32: Total reward 106.0
Episode 33: Total rew

# Save and Reload the model

In [86]:
# Save the trained agent under saved_models/ as ppo_cartpole_2 in HDF5 format.
agent.save(directory='saved_models', filename='ppo_cartpole_2', format='hdf5')

'saved_models\\ppo_cartpole_2.hdf5'

In [97]:
# Close the current agent; the next cell rebinds `agent` to a copy loaded from disk.
agent.close()

In [101]:
# Rebind `agent` to the model stored as saved_models/ppo_cartpole_2 (HDF5 format).
agent = PPOAgent.load(directory='saved_models', filename='ppo_cartpole_2', format='hdf5')



# Evaluate and test the model

In [95]:
# Re-create the environment with render_mode='human' for the evaluation episodes below.
env = gym.make(env_name, render_mode='human')

In [102]:
# Evaluate the current agent for a few episodes without feeding anything back to it.
test_episodes = 3
for episode in range(1, test_episodes + 1):
    # reset() returns (observation, info); keep only the observation.
    obs = env.reset()[0]
    episode_reward = 0

    while True:
        # independent=True: this act() call is not paired with an observe() call.
        action = agent.act(obs, independent=True)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        # Stop on failure (terminated) or on the time limit (truncated).
        if terminated or truncated:
            break

    print(f'Episode {episode}: Total reward {episode_reward}')

Episode 1: Total reward 500.0
Episode 2: Total reward 500.0
Episode 3: Total reward 500.0


In [92]:
# Close the environment that was used for the evaluation episodes.
env.close()

# Continue the training

In [81]:
# Fresh environment without a render mode for the additional training below.
env = gym.make(env_name)

In [82]:
# Continue training the current agent for another batch of episodes.
train_episodes = 100
for episode in range(1, train_episodes + 1):
    # reset() returns (observation, info); only the observation is fed to the agent.
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Tensorforce's observe() takes terminal as 0, 1 or 2: 1 is a genuine
        # terminal state, 2 is an aborted episode. A time-limit truncation is an
        # abort, not a failure, so it must not be reported as a true terminal.
        if terminated:
            terminal = 1
        elif truncated:
            terminal = 2
        else:
            terminal = 0
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward

    print(f'Episode {episode}: Total reward {episode_reward}')

Episode 1: Total reward 143.0
Episode 2: Total reward 270.0
Episode 3: Total reward 321.0
Episode 4: Total reward 243.0
Episode 5: Total reward 179.0
Episode 6: Total reward 131.0
Episode 7: Total reward 274.0
Episode 8: Total reward 378.0
Episode 9: Total reward 337.0
Episode 10: Total reward 189.0
Episode 11: Total reward 329.0
Episode 12: Total reward 447.0
Episode 13: Total reward 300.0
Episode 14: Total reward 159.0
Episode 15: Total reward 365.0
Episode 16: Total reward 204.0
Episode 17: Total reward 194.0
Episode 18: Total reward 500.0
Episode 19: Total reward 133.0
Episode 20: Total reward 158.0
Episode 21: Total reward 189.0
Episode 22: Total reward 292.0
Episode 23: Total reward 409.0
Episode 24: Total reward 435.0
Episode 25: Total reward 182.0
Episode 26: Total reward 288.0
Episode 27: Total reward 136.0
Episode 28: Total reward 319.0
Episode 29: Total reward 172.0
Episode 30: Total reward 163.0
Episode 31: Total reward 169.0
Episode 32: Total reward 207.0
Episode 33: Total

# Use DQNAgent

In [105]:
from tensorforce.agents import DQNAgent

In [113]:
# DQN agent for the same environment.
# References:
#   https://tensorforce.readthedocs.io/en/latest/agents/dqn.html
#   https://tensorforce.readthedocs.io/en/0.2.0/agents_models.html

# State and action specifications are read from the environment's spaces.
state_spec = {'type': 'float', 'shape': env.observation_space.shape}
action_spec = {'type': 'int', 'num_values': env.action_space.n}

# Network: two dense layers of 64 units each.
hidden_layers = [
    {'type': 'dense', 'size': 64},
    {'type': 'dense', 'size': 64},
]

agent = DQNAgent(
    states=state_spec,
    actions=action_spec,
    network=hidden_layers,
    batch_size=32,
    max_episode_timesteps=500,
    memory=1000,
)

# The agent is constructed directly, so it is initialized explicitly before use.
agent.initialize()



In [114]:
# Fresh environment without a render mode for training the DQN agent.
env = gym.make(env_name)

In [None]:
# Train the DQN agent and write a checkpoint every 100 episodes.
train_episodes = 1000
for episode in range(1, train_episodes + 1):
    # reset() returns (observation, info); only the observation is fed to the agent.
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Tensorforce's observe() takes terminal as 0, 1 or 2: 1 is a genuine
        # terminal state, 2 is an aborted episode. A time-limit truncation is an
        # abort, not a failure, so it must not be reported as a true terminal.
        if terminated:
            terminal = 1
        elif truncated:
            terminal = 2
        else:
            terminal = 0
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward

    print(f'Episode {episode}: Total reward {episode_reward}')
    # Periodic checkpoints allow a good intermediate model to be picked afterwards.
    if episode % 100 == 0:
        agent.save(directory='saved_models', filename=f'dqn_cartpole_{episode}', format='hdf5')

In [145]:
# Load the DQN checkpoint named dqn_cartpole_600 (the training loop saves one every 100 episodes).
agent = DQNAgent.load(directory='saved_models', filename='dqn_cartpole_600', format='hdf5')



In [135]:
# Re-create the environment with render_mode='human' for evaluating the DQN agent.
env = gym.make(env_name, render_mode='human')

In [146]:
# Evaluate the loaded agent for a few episodes without feeding anything back to it.
test_episodes = 3
for episode in range(1, test_episodes + 1):
    # reset() returns (observation, info); keep only the observation.
    obs = env.reset()[0]
    episode_reward = 0

    while True:
        # independent=True: this act() call is not paired with an observe() call.
        action = agent.act(obs, independent=True)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        # Stop on failure (terminated) or on the time limit (truncated).
        if terminated or truncated:
            break

    print(f'Episode {episode}: Total reward {episode_reward}')

Episode 1: Total reward 190.0
Episode 2: Total reward 185.0
Episode 3: Total reward 188.0


In [147]:
# Close the environment that was used for the DQN evaluation episodes.
env.close()

# Train with TensorBoard

In [150]:
# PPO agent with the same architecture as before, plus a summarizer that writes
# all available summaries to logs/summaries (the section is about TensorBoard).

# State and action specifications are read from the environment's spaces.
state_spec = {'type': 'float', 'shape': env.observation_space.shape}
action_spec = {'type': 'int', 'num_values': env.action_space.n}

# Policy network: two dense layers of 64 units each.
hidden_layers = [
    {'type': 'dense', 'size': 64},
    {'type': 'dense', 'size': 64},
]

summary_config = {'directory': 'logs/summaries', 'summaries': 'all'}

agent = PPOAgent(
    states=state_spec,
    actions=action_spec,
    network=hidden_layers,
    batch_size=32,
    max_episode_timesteps=500,
    summarizer=summary_config,
)

# The agent is constructed directly, so it is initialized explicitly before use.
agent.initialize()



In [151]:
# Fresh environment without a render mode for the summarized training run.
env = gym.make(env_name)

In [152]:
# Train the PPO agent and write a checkpoint every 50 episodes.
train_episodes = 500
for episode in range(1, train_episodes + 1):
    # reset() returns (observation, info); only the observation is fed to the agent.
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Tensorforce's observe() takes terminal as 0, 1 or 2: 1 is a genuine
        # terminal state, 2 is an aborted episode. A time-limit truncation is an
        # abort, not a failure, so it must not be reported as a true terminal.
        if terminated:
            terminal = 1
        elif truncated:
            terminal = 2
        else:
            terminal = 0
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward

    print(f'Episode {episode}: Total reward {episode_reward}')
    # Periodic checkpoints allow a good intermediate model to be picked afterwards.
    if episode % 50 == 0:
        agent.save(directory='saved_models', filename=f'ppo_cartpole_{episode}', format='hdf5')

Episode 1: Total reward 26.0
Episode 2: Total reward 14.0
Episode 3: Total reward 14.0
Episode 4: Total reward 51.0
Episode 5: Total reward 16.0
Episode 6: Total reward 17.0
Episode 7: Total reward 15.0
Episode 8: Total reward 19.0
Episode 9: Total reward 39.0
Episode 10: Total reward 12.0
Episode 11: Total reward 34.0
Episode 12: Total reward 18.0
Episode 13: Total reward 21.0
Episode 14: Total reward 67.0
Episode 15: Total reward 10.0
Episode 16: Total reward 17.0
Episode 17: Total reward 16.0
Episode 18: Total reward 36.0
Episode 19: Total reward 14.0
Episode 20: Total reward 12.0
Episode 21: Total reward 14.0
Episode 22: Total reward 42.0
Episode 23: Total reward 18.0
Episode 24: Total reward 24.0
Episode 25: Total reward 11.0
Episode 26: Total reward 14.0
Episode 27: Total reward 22.0
Episode 28: Total reward 12.0
Episode 29: Total reward 19.0
Episode 30: Total reward 11.0
Episode 31: Total reward 10.0
Episode 32: Total reward 20.0
Episode 33: Total reward 15.0
Episode 34: Total r

In [153]:
# Re-create the environment with render_mode='human' for evaluating a saved checkpoint.
env = gym.make(env_name, render_mode='human')

In [189]:
# Restore one of the checkpoints written during training (one is saved every 50 episodes).
# The checkpoints are saved with format='hdf5', so the same format is passed here
# explicitly, as the other load calls in this notebook do, instead of being inferred.
agent = PPOAgent.load(directory='saved_models', filename='ppo_cartpole_400', format='hdf5')
# ppo_cartpole_400, 450, 500 are acceptable models



In [191]:
# Evaluate the restored checkpoint for a few episodes without feeding anything back to it.
test_episodes = 3
for episode in range(1, test_episodes + 1):
    # reset() returns (observation, info); keep only the observation.
    obs = env.reset()[0]
    episode_reward = 0

    while True:
        # independent=True: this act() call is not paired with an observe() call.
        action = agent.act(obs, independent=True)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        # Stop on failure (terminated) or on the time limit (truncated).
        if terminated or truncated:
            break

    print(f'Episode {episode}: Total reward {episode_reward}')

Episode 1: Total reward 500.0
Episode 2: Total reward 500.0
Episode 3: Total reward 500.0
