In [1]:
import sys; sys.path.append('..') # add project root to the python path

In [2]:
import torch
import gym

from src.part3.MLP import MultiLayerPerceptron as MLP
from src.part5.DQN import DQN, prepare_training_inputs
from src.common.memory.memory import ReplayMemory
from src.common.train_utils import to_tensor

In [3]:
# ---- agent / optimisation settings -------------------------------------
lr = 1e-4 * 5                  # forwarded to DQN(lr=...)
gamma = 1.0                    # forwarded to DQN(gamma=...)
batch_size = 256               # transitions requested from memory.sample() per update

# ---- replay buffer ------------------------------------------------------
memory_size = 50000            # capacity argument given to ReplayMemory
sampling_only_until = 2000     # agent.update() is skipped while len(memory) is below this

# ---- episode schedule ---------------------------------------------------
total_eps = 3000               # number of episodes the training loop runs
eps_max = 0.08                 # epsilon used for the very first episode
eps_min = 0.01                 # floor of the epsilon schedule (also scales its decay step)
target_update_interval = 10    # target net is re-synced every this many episodes

In [4]:
# Build the online network first, then the target network, from the same
# constructor arguments (presumably input_dim=4, output_dim=2 -- confirm
# against src.part3.MLP).
qnet, qnet_target = (MLP(4, 2, num_neurons=[128]) for _ in range(2))

# Start the target network from the exact weights of the online network.
qnet_target.load_state_dict(qnet.state_dict())

# epsilon=1.0 is only the constructor value; the training loop overwrites
# agent.epsilon at the start of every episode.
agent = DQN(
    4,
    1,
    qnet=qnet,
    qnet_target=qnet_target,
    lr=lr,
    gamma=gamma,
    epsilon=1.0,
)

env = gym.make('CartPole-v1')
memory = ReplayMemory(memory_size)



In [5]:
print_every = 100

for n_epi in range(total_eps):
    # Epsilon schedule: starts at eps_max, shrinks linearly by eps_min for
    # every 200 episodes elapsed, and is clamped from below at eps_min.
    epsilon = max(eps_min, eps_max - eps_min * (n_epi / 200))
    agent.epsilon = torch.tensor(epsilon)

    s = env.reset()
    cum_r = 0
    done = False

    # Roll out one full episode, storing every transition in the replay memory.
    while not done:
        s = to_tensor(s, size=(1, 4))
        a = agent.get_action(s)
        ns, r, done, info = env.step(a)

        # Transition tuple: (state, action, reward / 100, next state, done),
        # each element reshaped to have a leading dimension of 1.
        experience = (
            s,
            torch.tensor(a).view(1, 1),
            torch.tensor(r / 100.0).view(1, 1),
            torch.tensor(ns).view(1, 4),
            torch.tensor(done).view(1, 1),
        )
        memory.push(experience)

        s = ns
        cum_r += r  # the log line reports the unscaled episode return

    # One agent.update() call per episode, and only once the memory holds
    # at least `sampling_only_until` transitions.
    if len(memory) >= sampling_only_until:
        sampled_exps = prepare_training_inputs(memory.sample(batch_size))
        agent.update(*sampled_exps)

    # Periodically copy the online network's weights into the target network.
    if not n_epi % target_update_interval:
        qnet_target.load_state_dict(qnet.state_dict())

    # Periodic progress report.
    if not n_epi % print_every:
        msg = (n_epi, cum_r, epsilon)
        print("Episode : {:4.0f} | Cumulative Reward : {:4.0f} | Epsilon : {:.3f}".format(*msg))

Episode :    0 | Cumulative Reward :   16 | Epsilon : 0.080
Episode :  100 | Cumulative Reward :   12 | Epsilon : 0.075
Episode :  200 | Cumulative Reward :    9 | Epsilon : 0.070
Episode :  300 | Cumulative Reward :   49 | Epsilon : 0.065
Episode :  400 | Cumulative Reward :   66 | Epsilon : 0.060
Episode :  500 | Cumulative Reward :   55 | Epsilon : 0.055
Episode :  600 | Cumulative Reward :  275 | Epsilon : 0.050
Episode :  700 | Cumulative Reward :  160 | Epsilon : 0.045
Episode :  800 | Cumulative Reward :   53 | Epsilon : 0.040
Episode :  900 | Cumulative Reward :   67 | Epsilon : 0.035
Episode : 1000 | Cumulative Reward :  147 | Epsilon : 0.030
Episode : 1100 | Cumulative Reward :   61 | Epsilon : 0.025
Episode : 1200 | Cumulative Reward :   54 | Epsilon : 0.020
Episode : 1300 | Cumulative Reward :  221 | Epsilon : 0.015
Episode : 1400 | Cumulative Reward :   85 | Epsilon : 0.010
Episode : 1500 | Cumulative Reward :   31 | Epsilon : 0.010
Episode : 1600 | Cumulative Reward :  50