Imports

In [1]:
import full_lib

import argparse
from datetime import datetime
import json
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

In [2]:
!pip install gym[atari]

Collecting atari-py~=0.2.0; extra == "atari"
  Downloading atari_py-0.2.6-cp37-cp37m-manylinux1_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.3 MB/s 
Installing collected packages: atari-py
Successfully installed atari-py-0.2.6


Constants

In [3]:
DEFAULT_ENV_NAME = "Qbert-v0"
MEAN_REWARD_BOUND = 2500
TIME_LIMIT = 120 # minutes

GAMMA = 0.99
BATCH_SIZE = 32
REPLAY_SIZE = 10000
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000
REPLAY_START_SIZE = 10000

EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

dict_args = {
    'cuda': True,
    'time_limit': TIME_LIMIT,
    'env': DEFAULT_ENV_NAME,
    'reward': MEAN_REWARD_BOUND,
    'batch_size': BATCH_SIZE,
    'replay_size': REPLAY_SIZE,
    'learning_rate': LEARNING_RATE,
    'sync_target_frames': SYNC_TARGET_FRAMES,
    'replay_start_size': REPLAY_START_SIZE,
    'eplison_decay_last_frame': EPSILON_DECAY_LAST_FRAME,
    'epsilon_start': EPSILON_START,
    'epsilon_final': EPSILON_FINAL,
}

args = Struct(**dict_args)

NameError: name 'Struct' is not defined

In [4]:
class Struct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device).long()
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [5]:
args = Struct(**dict_args)
env = full_lib.make_env(args.env)

In [6]:
if __name__ == "__main__":
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda")
#     parser.add_argument("--env", default=DEFAULT_ENV_NAME,
#                         help="Name of the environment, default=" + DEFAULT_ENV_NAME)
#     parser.add_argument("--reward", type=float, default=MEAN_REWARD_BOUND,
#                         help="Mean reward boundary for stop of training, default=%.2f" % MEAN_REWARD_BOUND)
#     args = parser.parse_args()
    print(json.dumps(dict_args, indent=4))
    launch_time = datetime.now()
    
    device = torch.device("cuda" if (args.cuda and torch.cuda.is_available()) else "cpu")

    env = full_lib.make_env(args.env)

    net = full_lib.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = full_lib.DQN(env.observation_space.shape, env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)

    buffer = full_lib.ExperienceBuffer(REPLAY_SIZE)
    agent = full_lib.Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None

    while True:
        frame_idx += 1
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            mean_reward = np.mean(total_rewards[-100:])
            print(
                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:" + \
                f" {frame_idx}: done {len(total_rewards)} games," + \
                f" mean reward {mean_reward:.3f}, eps {epsilon:.2f}, speed {speed:.0f} f/s"
            )
#             print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
#                 frame_idx, len(total_rewards), mean_reward, epsilon,
#                 speed
#             ))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)
            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), args.env + "-best.dat")
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
            if mean_reward > args.reward:
                print("Solved in %d frames!" % frame_idx)
                break
            
            time_passed = datetime.now() - launch_time
            if time_passed.seconds / 60 > TIME_LIMIT:
                print(f"Exiting by time limit: {time_passed.seconds / 60} minutes out of {TIME_LIMIT}")
                break

        if len(buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
    writer.close()


{
    "cuda": true,
    "time_limit": 120,
    "env": "Qbert-v0",
    "reward": 2500,
    "batch_size": 32,
    "replay_size": 10000,
    "learning_rate": 0.0001,
    "sync_target_frames": 1000,
    "replay_start_size": 10000,
    "eplison_decay_last_frame": 100000,
    "epsilon_start": 1.0,
    "epsilon_final": 0.02
}
DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)
2020-11-29 21:04:22: 97: done 1 games, mean reward 175.000, eps 1.00, speed 241 f/s
2020-11-29 21:04:23: 206: done 2 games, mean reward 125.000, eps 1.00, speed 232 f/s
2020-11-29 21:04:23: 319: done 3 games, mean reward 116.667, eps 1.00, speed 255 f/s
2020-

  app.launch_new_instance()


2020-11-29 21:05:06: 10054: done 89 games, mean reward 148.034, eps 0.90, speed 108 f/s
2020-11-29 21:05:07: 10170: done 90 games, mean reward 147.778, eps 0.90, speed 77 f/s
2020-11-29 21:05:08: 10271: done 91 games, mean reward 146.703, eps 0.90, speed 74 f/s
2020-11-29 21:05:10: 10393: done 92 games, mean reward 146.467, eps 0.90, speed 73 f/s
2020-11-29 21:05:11: 10504: done 93 games, mean reward 147.043, eps 0.89, speed 86 f/s
2020-11-29 21:05:13: 10610: done 94 games, mean reward 146.543, eps 0.89, speed 79 f/s
2020-11-29 21:05:14: 10730: done 95 games, mean reward 146.053, eps 0.89, speed 74 f/s
2020-11-29 21:05:16: 10838: done 96 games, mean reward 145.573, eps 0.89, speed 60 f/s
2020-11-29 21:05:18: 10976: done 97 games, mean reward 148.711, eps 0.89, speed 82 f/s
2020-11-29 21:05:19: 11072: done 98 games, mean reward 147.194, eps 0.89, speed 84 f/s
2020-11-29 21:05:20: 11164: done 99 games, mean reward 146.212, eps 0.89, speed 82 f/s
2020-11-29 21:05:21: 11267: done 100 games