Reference article: "Speeding up DQN on PyTorch: solving Pong in 30 minutes" — https://medium.com/mlreview/speeding-up-dqn-on-pytorch-solving-pong-in-30-minutes-81a1bd2dff55

In [2]:
import gym
import argparse
import numpy as np
import ptan
import torch
from torch.autograd import Variable
import torch.optim as optim
from tensorboardX import SummaryWriter
from model import dqn_model, common

In [3]:
# Detect once whether a CUDA device is usable; later cells branch on this flag.
USE_CUDA = torch.cuda.is_available()

# Bind device-appropriate tensor classes under short aliases so the rest of
# the notebook can construct tensors without repeating the CUDA check.
if USE_CUDA:
    FloatTensor, LongTensor, ByteTensor = (
        torch.cuda.FloatTensor,
        torch.cuda.LongTensor,
        torch.cuda.ByteTensor,
    )
else:
    FloatTensor, LongTensor, ByteTensor = (
        torch.FloatTensor,
        torch.LongTensor,
        torch.ByteTensor,
    )

In [6]:
# Select the Pong entry from the shared hyperparameter table.
params = common.HYPERPARAMS['pong']

# The TensorBoard writer's comment is "-<run_name>-basic", so the run name
# from the hyperparameters is embedded in the comment passed to SummaryWriter.
writer = SummaryWriter(
    comment="".join(("-", params['run_name'], "-basic")),
)

In [7]:
# Echo the selected hyperparameter dict as the cell output for reference.
params

{'batch_size': 32,
 'env_name': 'PongNoFrameskip-v4',
 'epsilon_final': 0.02,
 'epsilon_frames': 100000,
 'epsilon_start': 1.0,
 'gamma': 0.99,
 'learning_rate': 0.0001,
 'replay_initial': 10000,
 'replay_size': 100000,
 'run_name': 'pong',
 'stop_reward': 18.0,
 'target_net_sync': 1000}

In [8]:
# Create the Gym environment named in the hyperparameters and pass it
# through ptan's DQN wrapper stack in a single expression.
env = ptan.common.wrappers.wrap_dqn(gym.make(params['env_name']))

# The network is sized from the wrapped environment: observation shape in,
# one output per discrete action.
net = dqn_model.DQN(
    env.observation_space.shape,
    env.action_space.n,
)

# Move the network to the GPU only when one was detected.
if USE_CUDA:
    net.cuda()

In [9]:
# Epsilon-greedy action selection, starting at the configured initial epsilon.
selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=params['epsilon_start'],
)

# The tracker receives the selector and the hyperparameters; the training loop
# calls its frame() method every step.
# NOTE(review): presumably it anneals selector.epsilon — confirm in common.EpsilonTracker.
epsilon_tracker = common.EpsilonTracker(selector, params)

# Agent that picks actions from `net` through the selector above.
agent = ptan.agent.DQNAgent(net, selector, cuda=USE_CUDA)

# Target-network wrapper around `net`; synced periodically in the training loop.
tgt_net = ptan.agent.TargetNet(net)

In [10]:
# Adam over the online network's parameters with the configured learning rate.
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

# Experience source driven by the agent acting in the environment.
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env,
    agent,
    gamma=params['gamma'],
    steps_count=1,
)

# Replay buffer fed from the experience source, capped at the configured size.
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source,
    buffer_size=params['replay_size'],
)

In [11]:
# Main training loop: one buffer.populate(1) call per iteration, followed by
# one optimisation step once the replay buffer holds enough samples.
frame_idx = 0

with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1

        # Add one entry to the replay buffer and let the tracker update
        # exploration for the current frame.
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)

        # Report the first newly available total reward (if any); a truthy
        # return value from the tracker ends training.
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards and reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
            break

        # Train (and consider syncing the target network) only after the
        # buffer has reached its initial fill level.
        if len(buffer) >= params['replay_initial']:
            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(
                batch,
                net,
                tgt_net.target_model,
                gamma=params['gamma'],
                cuda=USE_CUDA,
            )
            loss_v.backward()
            optimizer.step()

            # Refresh the target network every `target_net_sync` frames.
            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

896: done 1 games, mean reward -21.000, speed 53.93 f/s, eps 0.99
1966: done 2 games, mean reward -21.000, speed 59.94 f/s, eps 0.98
2861: done 3 games, mean reward -20.667, speed 61.28 f/s, eps 0.97
3723: done 4 games, mean reward -20.500, speed 61.22 f/s, eps 0.96
4621: done 5 games, mean reward -20.400, speed 61.33 f/s, eps 0.95
5498: done 6 games, mean reward -20.500, speed 61.58 f/s, eps 0.95
6381: done 7 games, mean reward -20.571, speed 61.30 f/s, eps 0.94
7697: done 8 games, mean reward -20.250, speed 61.05 f/s, eps 0.92
8456: done 9 games, mean reward -20.333, speed 61.45 f/s, eps 0.92
9279: done 10 games, mean reward -20.400, speed 60.78 f/s, eps 0.91
10155: done 11 games, mean reward -20.455, speed 3.56 f/s, eps 0.90


KeyboardInterrupt: 