In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# project modules imported by this notebook are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
import pdb
import gym
import copy
import functools
import matplotlib.pyplot as plt
import time
from torch.utils.tensorboard import SummaryWriter

import torch
import numpy as np
from collections import deque

from Config import Config
from Models import ActorCritic
from PPO import PPOPixel
from Networks import cnn_head_model, actor_model, critic_model, head_model
from Memory import Memory
# from baselines.common.cmd_util import make_env
from baselines.common.atari_wrappers import wrap_deepmind, make_atari


# --- Environment -----------------------------------------------------------
# Raw Atari env wrapped with the DeepMind-style preprocessing from baselines.
env_id = "BreakoutNoFrameskip-v4"
env = wrap_deepmind(
    make_atari(env_id),
    episode_life=True,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)

# --- Hyper-parameters ------------------------------------------------------
config = Config(env)

for _option, _setting in (
    ("update_every", 500),    # environment steps collected per rollout in train()
    ("num_learn", 4),         # forwarded to agent.learn() after every rollout
    ("win_condition", 230),   # train() stops once the running mean score exceeds this
    ("n_episodes", 2000000),
    ("max_t", 700),
    ("lr", 2.5e-4),
    ("hidden_size", 512),
):
    setattr(config, _option, _setting)
del _option, _setting

# --- Components ------------------------------------------------------------
config.Memory = Memory
config.Model = ActorCritic

# Each network factory is pre-bound to this config object.
for _slot, _factory in (
    ("head_model", cnn_head_model),
    ("actor_model", actor_model),
    ("critic_model", critic_model),
):
    setattr(config, _slot, functools.partial(_factory, config))
del _slot, _factory

# --- Logging ---------------------------------------------------------------
# A timestamped run name keeps every execution in its own TensorBoard directory.
experiment_name = f"{env_id}____{int(time.time())}"
config.tb_logger = SummaryWriter(f"runs/{experiment_name}")


Running experiment with device: cpu


In [None]:
def train(config, max_steps=100000000, agent=None):
    """Alternate fixed-length rollouts with PPO updates until solved or out of budget.

    Each outer iteration collects ``config.update_every`` environment steps,
    stores them in the agent's memory, calls ``agent.learn`` once, clears the
    memory, and logs the rollout's summed reward plus the losses to
    ``config.tb_logger``.

    Args:
        config: Experiment configuration. The attributes read here are
            ``env``, ``update_every``, ``num_learn``, ``win_condition`` and
            ``tb_logger``.
        max_steps: Environment-step budget. It is checked between rollouts, so
            the final rollout may finish past the budget by fewer than
            ``config.update_every`` steps.
        agent: Optional pre-built agent exposing ``act``, ``add_to_mem``,
            ``learn`` and ``mem.clear``. When omitted, ``PPOPixel(config)`` is
            created.

    Returns:
        tuple: ``(scores, average_scores)`` where ``scores`` holds the summed
        reward of every rollout and ``average_scores`` the mean over the last
        100 rollouts at each point.

    Raises:
        ValueError: If ``config.update_every`` is smaller than 1, since an
            empty rollout leaves nothing to learn from.
    """
    rollout_len = config.update_every
    if rollout_len < 1:
        raise ValueError(
            "config.update_every must be >= 1, got {}".format(rollout_len)
        )

    # Work on a private copy so the environment stored on the config is untouched.
    env = copy.deepcopy(config.env)
    if agent is None:
        agent = PPOPixel(config)

    scores_deque = deque(maxlen=100)
    scores = []
    average_scores = []
    global_step = 0

    try:
        while global_step < max_steps:
            # NOTE(review): the env is reset at the start of every rollout, not
            # only at episode boundaries. Kept as in the original loop; confirm
            # this is intended for the wrapped env.
            state = env.reset()
            score = 0
            value, done = None, None

            for _ in range(rollout_len):
                global_step += 1

                action, log_prob, value, _entropy = agent.act(state)
                next_state, reward, done, info = env.step(action)
                agent.add_to_mem(state, action, reward, log_prob, done)

                state = next_state
                score += reward

                # Reset only on a true game over. Envs that do not report
                # "ale.lives" are treated as having no lives left, so a plain
                # `done` resets them instead of raising KeyError.
                if done and info.get("ale.lives", 0) == 0:
                    state = env.reset()

            # NOTE(review): `value` is the estimate for the last state acted on,
            # not for the state reached after the final step; confirm this is
            # what agent.learn expects as its bootstrap value.
            value_loss, pg_loss, approx_kl = agent.learn(
                config.num_learn, value.item(), done
            )
            agent.mem.clear()

            # Book keeping: `score` is the reward summed over this rollout.
            scores_deque.append(score)
            scores.append(score)
            mean_score = np.mean(scores_deque)
            average_scores.append(mean_score)

            logger = config.tb_logger
            logger.add_scalar("charts/episode_reward", score, global_step)
            logger.add_scalar("losses/value_loss", value_loss.item(), global_step)
            logger.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
            logger.add_scalar("losses/approx_kl", approx_kl.item(), global_step)

            print("Global Step: {}\tAverage Score: {:.2f}".format(global_step, mean_score))

            if mean_score > config.win_condition:
                print("\nEnvironment Solved!")
                break
    finally:
        # Push buffered scalars to disk even when training is interrupted.
        config.tb_logger.flush()

    return scores, average_scores

# Launch training; keep the per-rollout scores and their running averages
# available to later cells.
scores, average_scores = train(config=config)

Global Step: 500	Average Score: 4.00
Global Step: 1000	Average Score: 4.00
Global Step: 1500	Average Score: 3.67
Global Step: 2000	Average Score: 4.00
Global Step: 2500	Average Score: 4.00
Global Step: 3000	Average Score: 3.83
Global Step: 3500	Average Score: 3.86
Global Step: 4000	Average Score: 3.75
Global Step: 4500	Average Score: 3.44
Global Step: 5000	Average Score: 3.40
Global Step: 5500	Average Score: 3.27
Global Step: 6000	Average Score: 3.42
Global Step: 6500	Average Score: 3.31
Global Step: 7000	Average Score: 3.36
Global Step: 7500	Average Score: 3.40
Global Step: 8000	Average Score: 3.31
Global Step: 8500	Average Score: 3.41
Global Step: 9000	Average Score: 3.44
Global Step: 9500	Average Score: 3.32
Global Step: 10000	Average Score: 3.25
Global Step: 10500	Average Score: 3.24
Global Step: 11000	Average Score: 3.27
Global Step: 11500	Average Score: 3.17
Global Step: 12000	Average Score: 3.17
Global Step: 12500	Average Score: 3.24
Global Step: 13000	Average Score: 3.27
Global

Global Step: 106000	Average Score: 4.46
Global Step: 106500	Average Score: 4.44
Global Step: 107000	Average Score: 4.48
Global Step: 107500	Average Score: 4.51
Global Step: 108000	Average Score: 4.49
Global Step: 108500	Average Score: 4.47
Global Step: 109000	Average Score: 4.47
Global Step: 109500	Average Score: 4.46
Global Step: 110000	Average Score: 4.51
Global Step: 110500	Average Score: 4.55
Global Step: 111000	Average Score: 4.59
Global Step: 111500	Average Score: 4.62
Global Step: 112000	Average Score: 4.68
Global Step: 112500	Average Score: 4.69
Global Step: 113000	Average Score: 4.67
Global Step: 113500	Average Score: 4.70
Global Step: 114000	Average Score: 4.72
Global Step: 114500	Average Score: 4.75
Global Step: 115000	Average Score: 4.72
Global Step: 115500	Average Score: 4.74
Global Step: 116000	Average Score: 4.70
Global Step: 116500	Average Score: 4.67
Global Step: 117000	Average Score: 4.68
Global Step: 117500	Average Score: 4.66
Global Step: 118000	Average Score: 4.65


Global Step: 208500	Average Score: 4.69
Global Step: 209000	Average Score: 4.66
Global Step: 209500	Average Score: 4.65
Global Step: 210000	Average Score: 4.65
Global Step: 210500	Average Score: 4.66
Global Step: 211000	Average Score: 4.68
Global Step: 211500	Average Score: 4.67
Global Step: 212000	Average Score: 4.66
Global Step: 212500	Average Score: 4.65
Global Step: 213000	Average Score: 4.67
Global Step: 213500	Average Score: 4.68
Global Step: 214000	Average Score: 4.64
Global Step: 214500	Average Score: 4.65
Global Step: 215000	Average Score: 4.60
Global Step: 215500	Average Score: 4.60
Global Step: 216000	Average Score: 4.61
Global Step: 216500	Average Score: 4.64
Global Step: 217000	Average Score: 4.61
Global Step: 217500	Average Score: 4.64
Global Step: 218000	Average Score: 4.62
Global Step: 218500	Average Score: 4.63
Global Step: 219000	Average Score: 4.64
Global Step: 219500	Average Score: 4.67
Global Step: 220000	Average Score: 4.70
Global Step: 220500	Average Score: 4.74


Global Step: 311000	Average Score: 7.41
Global Step: 311500	Average Score: 7.42
Global Step: 312000	Average Score: 7.45
Global Step: 312500	Average Score: 7.51
Global Step: 313000	Average Score: 7.49
Global Step: 313500	Average Score: 7.49
Global Step: 314000	Average Score: 7.52
Global Step: 314500	Average Score: 7.51
Global Step: 315000	Average Score: 7.54
Global Step: 315500	Average Score: 7.53
Global Step: 316000	Average Score: 7.55
Global Step: 316500	Average Score: 7.52
Global Step: 317000	Average Score: 7.53
Global Step: 317500	Average Score: 7.53
Global Step: 318000	Average Score: 7.52
Global Step: 318500	Average Score: 7.53
Global Step: 319000	Average Score: 7.50
Global Step: 319500	Average Score: 7.49
Global Step: 320000	Average Score: 7.50
Global Step: 320500	Average Score: 7.50
Global Step: 321000	Average Score: 7.49
Global Step: 321500	Average Score: 7.50
Global Step: 322000	Average Score: 7.49
Global Step: 322500	Average Score: 7.47
Global Step: 323000	Average Score: 7.45


Global Step: 413500	Average Score: 7.71
Global Step: 414000	Average Score: 7.74
Global Step: 414500	Average Score: 7.71
Global Step: 415000	Average Score: 7.73
Global Step: 415500	Average Score: 7.72
Global Step: 416000	Average Score: 7.73
Global Step: 416500	Average Score: 7.74
Global Step: 417000	Average Score: 7.72
Global Step: 417500	Average Score: 7.72
Global Step: 418000	Average Score: 7.73
Global Step: 418500	Average Score: 7.73
Global Step: 419000	Average Score: 7.74
Global Step: 419500	Average Score: 7.73
Global Step: 420000	Average Score: 7.72
Global Step: 420500	Average Score: 7.73
Global Step: 421000	Average Score: 7.71
Global Step: 421500	Average Score: 7.71
Global Step: 422000	Average Score: 7.72
Global Step: 422500	Average Score: 7.73
Global Step: 423000	Average Score: 7.67
Global Step: 423500	Average Score: 7.65
Global Step: 424000	Average Score: 7.63
Global Step: 424500	Average Score: 7.62
Global Step: 425000	Average Score: 7.61
Global Step: 425500	Average Score: 7.59
