In [None]:
class DQN(nn.Module):
    """Convolutional Q-network: maps a stack of frames to one Q-value per action.

    Architecture (layers are separated by ReLU non-linearities):
      1. 32 convolution filters of size 8, stride 4
      2. 64 convolution filters of size 4, stride 2
      3. 64 convolution filters of size 3, stride 1
      4. fully-connected hidden layer of 512 units
      5. fully-connected linear output layer with ``n_actions`` units (the Q-values)

    Args:
        n_actions: Number of discrete actions, i.e. the width of the output layer.
        input_shape: ``(channels, height, width)`` of a single network input.
            Defaults to ``(4, 84, 84)``, for which the flattened convolutional
            output is ``7 * 7 * 64`` features.

    Raises:
        ValueError: If ``input_shape`` is too small for the convolution stack
            (some layer would produce an empty feature map).
    """

    def __init__(self, n_actions, input_shape=(4, 84, 84)):
        super().__init__()
        in_channels, height, width = input_shape

        # (out_channels, kernel_size, stride) of each convolution, in order.
        conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

        layers = []
        for out_channels, kernel_size, stride in conv_specs:
            layers.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride)
            )
            layers.append(nn.ReLU())
            # Output size of an unpadded, undilated convolution along one axis.
            height = (height - kernel_size) // stride + 1
            width = (width - kernel_size) // stride + 1
            if height < 1 or width < 1:
                raise ValueError(
                    f"input_shape {tuple(input_shape)} is too small for the convolution stack"
                )
            in_channels = out_channels

        # Derive the flattened feature count from the input geometry instead of
        # hard-coding it, so the first dense layer always matches the conv output.
        n_features = in_channels * height * width
        layers.extend([
            nn.Flatten(),
            nn.Linear(n_features, 512),
            nn.ReLU(),
            # Linear projection to the Q-values, one per action (no activation).
            nn.Linear(512, n_actions),
        ])

        # Same layer order as before, so state_dict keys (network.0, network.2, ...)
        # are unchanged.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values for a batch of inputs.

        Args:
            x: Batched tensor of shape ``(N, *input_shape)``.

        Returns:
            Tensor of shape ``(N, n_actions)``.
        """
        return self.network(x)

In [None]:
# ---------------------------------------------------------------------------
# DQN hyperparameters
# ---------------------------------------------------------------------------
BATCH_SIZE = 32
# Discount factor γ; set to 0.99 in all experiments.
GAMMA = 0.99

# Epsilon-greedy exploration. The reference schedule decreases epsilon
# linearly from 1 to 0.1 over 1M steps; EPS_DECAY is derived from those numbers.
# NOTE(review): EPS_END is 0.01, not the 0.1 used in the EPS_DECAY expression —
# confirm which end value is intended where these constants are consumed.
EPS_START = 1
EPS_END = 0.01
EPS_DECAY = 1_000_000 / (1 - 0.1)

# NOTE(review): presumably a soft-update coefficient — confirm it is still used.
TAU = 5e-3
# Learning rate α = 0.00025.
LR = 2.5e-4

# Number of steps between target-network updates (τ = 10,000).
TARGET_UPDATE = 10_000

# NOTE(review): presumably the number of transitions gathered before learning
# starts, and the number of steps between updates — verify against the training loop.
INITIAL_MEMORY = 10_000
UPDATE_FREQ = 4

# No-op reset setting.
no_op_action = 30

# ---------------------------------------------------------------------------
# Environment-derived sizes
# ---------------------------------------------------------------------------
# Number of actions, read from the gym action space.
n_actions = env.action_space.n
frame, _ = env.reset()
input_shape = (4, 84, 84)
# Length of the first axis of the observation returned by reset().
n_observations = len(frame)
print(n_observations, n_actions)

In [None]:
# Build the online (policy) network first, then a second network with the same
# architecture to act as the target; both are moved to `device`.
policy_net, target_net = (
    DQN(n_actions=env.action_space.n).to(device) for _ in range(2)
)
# Start the target network from an exact copy of the policy network's weights.
target_net.load_state_dict(policy_net.state_dict())

In [None]:
# RMSprop over the policy network's parameters.
# NOTE(review): the reference text mentions "momentum parameter 0.95"; here 0.95
# is passed as `alpha` (RMSprop's squared-gradient smoothing constant) and
# `momentum` keeps its default — confirm this is the intended mapping.
optimizer = optim.RMSprop(
    params=policy_net.parameters(),
    lr=LR,
    alpha=0.95,
)

# Step decay: the learning rate is multiplied by `gamma` every `step_size`
# calls to scheduler.step().
# NOTE(review): gamma=0.01 cuts the rate 100x at each boundary — confirm how
# often scheduler.step() is called in the training loop.
scheduler = lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=10_000,
    gamma=0.01,
)

# Experience replay memory sized for 1M tuples.
memory = Memory(1_000_000)