In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import pdb
import gym
import copy
import functools
import matplotlib.pyplot as plt
import time
from torch.utils.tensorboard import SummaryWriter

import torch
import numpy as np
from collections import deque

from Config import Config
from Models import ActorCritic
from PPO import PPOPixel
from Networks import cnn_head_model, actor_model, critic_model, head_model
from Memory import Memory
# from baselines.common.cmd_util import make_env
from baselines.common.atari_wrappers import wrap_deepmind, make_atari


# --- Environment -------------------------------------------------------------
# Raw Atari env wrapped with the baselines DeepMind-style preprocessing.
env_id = "BreakoutNoFrameskip-v4"
env = wrap_deepmind(
    make_atari(env_id),
    episode_life=True,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)

# --- Hyperparameters ---------------------------------------------------------
config = Config(env)

config.lr = 2.5e-4
config.hidden_size = 512
config.update_every = 500   # length of the inner collection loop in train()
config.num_learn = 4        # forwarded to agent.learn() in train()
config.win_condition = 230  # train() stops once the running mean exceeds this
config.n_episodes = 2000000
config.max_t = 700

# --- Pluggable components ----------------------------------------------------
# The network factories are pre-bound to this config object.
config.Memory = Memory
config.Model = ActorCritic
config.head_model = functools.partial(cnn_head_model, config)
config.actor_model = functools.partial(actor_model, config)
config.critic_model = functools.partial(critic_model, config)

# --- Logging -----------------------------------------------------------------
# One TensorBoard run directory per launch, keyed by env id and start time.
experiment_name = "{}____{}".format(env_id, int(time.time()))
config.tb_logger = SummaryWriter(log_dir="runs/" + experiment_name)


Running experiment with device: cpu


In [None]:
def train(config, max_steps=100000000):
    """Train a ``PPOPixel`` agent using fixed-length rollouts.

    Every outer iteration collects ``config.update_every`` transitions from a
    private deep copy of ``config.env``, calls ``agent.learn(config.num_learn)``,
    clears the agent's memory, and logs the rollout to ``config.tb_logger``.

    Note that the value logged under ``charts/episode_reward`` (and stored in
    the returned ``scores``) is the summed reward of one rollout of
    ``config.update_every`` steps, not the return of a single game episode.

    Args:
        config: Object exposing ``env``, ``update_every``, ``num_learn``,
            ``win_condition`` and ``tb_logger``; it is also handed to
            ``PPOPixel``.
        max_steps: Upper bound on the total number of environment steps.
            Checked once per rollout, so the final count may overshoot by less
            than ``config.update_every``. Defaults to the previously
            hard-coded 100000000.

    Returns:
        tuple: ``(scores, average_scores)`` where ``scores`` holds the summed
        reward of each rollout and ``average_scores`` holds the mean over the
        last 100 rollouts, recorded after each rollout.

    Raises:
        ValueError: If ``config.update_every`` is smaller than 1 (the loop
            could otherwise never advance or run an update).
    """
    if config.update_every < 1:
        raise ValueError("config.update_every must be >= 1")

    env = copy.deepcopy(config.env)
    scores_deque = deque(maxlen=100)
    scores = []
    average_scores = []
    global_step = 0

    agent = PPOPixel(config)

    while global_step < max_steps:
        state = env.reset()
        score = 0
        for _ in range(config.update_every):
            global_step += 1

            action, log_prob, _value, _entropy = agent.act(state)
            next_state, reward, done, _info = env.step(action)
            agent.add_to_mem(state, action, reward, log_prob, done)
            score += reward

            # An env must be reset once it reports done; stepping a finished
            # episode would fill the rollout with post-terminal transitions.
            state = env.reset() if done else next_state

        # Exactly one policy update per completed rollout.
        value_loss, pg_loss, approx_kl = agent.learn(config.num_learn)
        agent.mem.clear()

        # Book keeping
        scores_deque.append(score)
        scores.append(score)
        running_mean = np.mean(scores_deque)
        average_scores.append(running_mean)

        config.tb_logger.add_scalar("charts/episode_reward", score, global_step)
        config.tb_logger.add_scalar("losses/value_loss", value_loss.item(), global_step)
        config.tb_logger.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
        config.tb_logger.add_scalar("losses/approx_kl", approx_kl.item(), global_step)

        print("Global Step: {}\tAverage Score: {:.2f}".format(global_step, running_mean))

        if running_mean > config.win_condition:
            print("\nEnvironment Solved!")
            break

    return scores, average_scores

# Launch training with the config assembled in the setup cell; train() returns
# the per-iteration scores and their running averages as two lists.
scores, average_scores = train(config)

\Global Step: 500	Average Score: 1.00
\Global Step: 1000	Average Score: 0.50
\Global Step: 1500	Average Score: 2.00
\Global Step: 2000	Average Score: 1.75
\Global Step: 2500	Average Score: 1.60
\Global Step: 3000	Average Score: 1.33
\Global Step: 3500	Average Score: 1.14
\Global Step: 4000	Average Score: 1.75
\Global Step: 4500	Average Score: 1.67
\Global Step: 5000	Average Score: 1.60
\Global Step: 5500	Average Score: 1.91
\Global Step: 6000	Average Score: 2.08
\Global Step: 6500	Average Score: 2.38
\Global Step: 7000	Average Score: 2.21
\Global Step: 7500	Average Score: 2.27
\Global Step: 8000	Average Score: 2.44
\Global Step: 8500	Average Score: 2.29
\Global Step: 9000	Average Score: 2.33
\Global Step: 9500	Average Score: 2.37
\Global Step: 10000	Average Score: 2.40
\Global Step: 10500	Average Score: 2.43
\Global Step: 11000	Average Score: 2.36
\Global Step: 11500	Average Score: 2.26
\Global Step: 12000	Average Score: 2.25
\Global Step: 12500	Average Score: 2.16
\Global Step: 13000	

\Global Step: 103500	Average Score: 1.78
\Global Step: 104000	Average Score: 1.79
\Global Step: 104500	Average Score: 1.80
\Global Step: 105000	Average Score: 1.82
\Global Step: 105500	Average Score: 1.82
\Global Step: 106000	Average Score: 1.83
\Global Step: 106500	Average Score: 1.85
\Global Step: 107000	Average Score: 1.87
\Global Step: 107500	Average Score: 1.85
\Global Step: 108000	Average Score: 1.86
\Global Step: 108500	Average Score: 1.82
\Global Step: 109000	Average Score: 1.82
\Global Step: 109500	Average Score: 1.82
\Global Step: 110000	Average Score: 1.84
\Global Step: 110500	Average Score: 1.83
\Global Step: 111000	Average Score: 1.85
\Global Step: 111500	Average Score: 1.80
\Global Step: 112000	Average Score: 1.79
\Global Step: 112500	Average Score: 1.74
\Global Step: 113000	Average Score: 1.71
\Global Step: 113500	Average Score: 1.73
\Global Step: 114000	Average Score: 1.74
\Global Step: 114500	Average Score: 1.71
\Global Step: 115000	Average Score: 1.73
\Global Step: 11

\Global Step: 203500	Average Score: 1.53
\Global Step: 204000	Average Score: 1.55
\Global Step: 204500	Average Score: 1.55
\Global Step: 205000	Average Score: 1.53
\Global Step: 205500	Average Score: 1.59
\Global Step: 206000	Average Score: 1.59
\Global Step: 206500	Average Score: 1.60
\Global Step: 207000	Average Score: 1.64
\Global Step: 207500	Average Score: 1.63
\Global Step: 208000	Average Score: 1.63
\Global Step: 208500	Average Score: 1.62
\Global Step: 209000	Average Score: 1.63
\Global Step: 209500	Average Score: 1.58
\Global Step: 210000	Average Score: 1.58
\Global Step: 210500	Average Score: 1.58
\Global Step: 211000	Average Score: 1.59
\Global Step: 211500	Average Score: 1.59
\Global Step: 212000	Average Score: 1.57
\Global Step: 212500	Average Score: 1.55
\Global Step: 213000	Average Score: 1.52
\Global Step: 213500	Average Score: 1.55
\Global Step: 214000	Average Score: 1.57
\Global Step: 214500	Average Score: 1.57
\Global Step: 215000	Average Score: 1.60
\Global Step: 21