In [1]:
import gym
import numpy as np
from ppo import Agent
import wandb

In [2]:
env = gym.make('CartPole-v1')
N = 30
batch_size = 5
n_epochs = 4
alpha = 0.0002
agent = Agent(n_actions = env.action_space.n, 
              batch_size = batch_size, 
              alpha = alpha, 
              n_epochs=n_epochs, 
              input_dims=env.observation_space.shape)
n_games = 200

In [3]:
figure_file = 'plots/cartpole.png'

best_score = env.reward_range[0]
best_score

-inf

In [9]:
score_history = []
learn_iters = 0
avg_score = 0
n_steps = 0
max_time_steps = 500

In [None]:
run = wandb.init(
    entity="ducanh2002add-hanoi-university-of-science-and-technology",
    project="PPO-testing",
    config={
        "learning_rate": alpha,
        "architecture": "PPO",
        "dataset": "Gym CartPole-v1",
        "epochs": n_epochs,
        "batch_size": batch_size,
        "Number of games": n_games,
        "Total timestep to start learning (n_steps)": N,
        "Agent's hidden size1": 256,
        "Agent's hidden_size2": 256,
        "Environment maximum steps": max_time_steps

    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mducanh2002add[0m ([33mducanh2002add-hanoi-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
for i in range(n_games):
    observation = env.reset()[0]
    done = False
    score = 0
    n_steps = 0
    while not done and n_steps < max_time_steps:
        # print(observation[0])
        action, prob, val = agent.choose_action(observation)
        observation_, reward, done, _, _ = env.step(action)
        n_steps += 1
        score += reward

        agent.remember(observation, action, prob, val, reward, done)
        if n_steps % N == 0:
            agent.learn()
            learn_iters += 1
        
        observation = observation_

    score_history.append(score)
    avg_score = np.mean(score_history)

    if avg_score > best_score:
        best_score = avg_score
        agent.save_models()

    print(f"Episode {i:5} | score: {score:5} | Avg_score: {avg_score:7.2f} | Time_steps: {n_steps:7} | Learning_steps: {learn_iters}")
    
    run.log({
        "score" : score,
        "avg_score": avg_score,
        "best_score": best_score
    })

    

  state = torch.tensor([observation], dtype = torch.float).to(self.actor.device)


Saving models
Episode     0 | score:  11.0 | Avg_score:   11.00 | Time_steps:      11 | Learning_steps: 0
Saving models
Episode     1 | score:  13.0 | Avg_score:   12.00 | Time_steps:      13 | Learning_steps: 0
Saving models
Episode     2 | score:  21.0 | Avg_score:   15.00 | Time_steps:      21 | Learning_steps: 0
Episode     3 | score:  15.0 | Avg_score:   15.00 | Time_steps:      15 | Learning_steps: 0
Saving models
Episode     4 | score:  30.0 | Avg_score:   18.00 | Time_steps:      30 | Learning_steps: 1
Episode     5 | score:  15.0 | Avg_score:   17.50 | Time_steps:      15 | Learning_steps: 1
Saving models
Episode     6 | score:  25.0 | Avg_score:   18.57 | Time_steps:      25 | Learning_steps: 1
Episode     7 | score:  17.0 | Avg_score:   18.38 | Time_steps:      17 | Learning_steps: 1
Episode     8 | score:  20.0 | Avg_score:   18.56 | Time_steps:      20 | Learning_steps: 1
Saving models
Episode     9 | score:  28.0 | Avg_score:   19.50 | Time_steps:      28 | Learning_steps

In [15]:
agent.load_models()
agent.actor.eval()
agent.critic.eval()

n_games_eval = 5
max_time_steps = 2000
scores = []
# Evaluate

for i in range(n_games_eval):
    ovservation = env.reset()[0]
    done = False
    score = 0
    n_steps = 0
    while not done and n_steps < max_time_steps:
        n_steps += 1
        action, _, _ = agent.choose_action(ovservation)
        ovservation, reward, done, _, _ = env.step(action)
        score += reward
    scores.append(score)

np.mean(scores)

Loading models


2000.0