In [1]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

In [2]:
GAMMA = 0.99                                               
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 10000
MAX_EP = 25000

REWARD_ACC = list()
LOSS_ACC = list()

torch.manual_seed(1234)
np.random.seed(1234)

In [3]:
class Network(nn.Module):
    def __init__(self, env):
        super().__init__()

        in_features = int(np.prod(env.observation_space.shape))

        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n)
        )

    def forward(self, x):
        return self.net(x)

    def act(self, state):

        state_t = torch.as_tensor(state, dtype=torch.float32)

        q_values = self.forward(state_t.unsqueeze(0))                           # 'q_values' outputs two values (left or right)

        max_q_index = torch.argmax(q_values, dim=1)[0]                          # find an index that corresponds to the maximum value  

        action = max_q_index.detach().item()                                    # 0 or 1

        return action                                                           # 0 or 1

In [4]:
env = gym.make('CartPole-v1')
episode_reward = 0.0
episode = 0 
reward_buffer = deque([0.0], maxlen=100)

net = Network(env)

optimizer = torch.optim.Adam(net.parameters(), lr=1e-1)

In [5]:
# Main Training Loop

state = env.reset()                                                             # state의 예시: [-0.01713841 -0.00705756 -0.04146662 -0.04927411]

for step in itertools.count():                                                  # step starts from 0 and increases by 1 until it meets a break condition. This is same as 'While True' loop

    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END]) # epsilon value는 EPSILON_START에서 시작해서 EPSILON_END까지 step이 흘러갈수록 점점 더 감소한다. 

    random_sample = random.random()

    if random_sample <= epsilon:                                                # random_sample의 값이 epsilon보다 작으면, random한 action을 취하고, 그렇지 않다면 online_net에 현재 state를 넣어 가장 좋은 act를 가져온다. 
        action = env.action_space.sample()
    else:
        action = net.act(state)

    new_state, reward, done, _ = env.step(action)
                                 
    state = new_state

    episode_reward  = episode_reward + reward

    if done:                             
        state = env.reset()      
        reward_buffer.append(episode_reward)
        episode_reward = 0.0

    # # -------------------------- TEST --------------------------
    # # After solved, watch it play
    # if len(reward_buffer) == 100:
    #     if np.mean(reward_buffer) >= 195:
    #         while True:
    #             action = online_net.act(state)

    #             action, _, done, _ = env.step(action)
    #             env.render()
    #             if done:
    #                 env.reset()
    # # -------------------------- TEST --------------------------

    state_t     = torch.as_tensor(state, dtype=torch.float32)
    action_t    = torch.as_tensor(action, dtype=torch.int64).unsqueeze(-1)
    reward_t    = torch.as_tensor(reward, dtype=torch.float32).unsqueeze(-1)
    done_t      = torch.as_tensor(done, dtype=torch.float32).unsqueeze(-1)
    new_state_t = torch.as_tensor(new_state, dtype=torch.float32)

    # Compute Targets
    target_q_values = net.forward(new_state_t)
    max_target_q_values = target_q_values.max(dim=0, keepdim=True)[0]           
    targets = reward_t + GAMMA * (1 - done_t) * max_target_q_values     

    # Compute Loss
    q_values = net.forward(state_t)                             
    action_q_values = torch.gather(input=q_values, dim=0, index=action_t)      

    # print("targets:", targets)
    # print("q_values:", q_values)
    # print("actions_q_values", action_q_values)

    loss = nn.functional.smooth_l1_loss(action_q_values, targets)

    # Gradient Descent
    optimizer.zero_grad()
    loss.backward()
    optimizer.step() 
    
    # Logging
    if step % 1000 == 0:
        print()
        print('Step', step)
        print('Avg Reward', np.mean(reward_buffer))                             # maximum length of reward_buffer is 100. Therefore, np.mean(reward_buffer) averages lastest 100 rewards
        print('Loss', loss)
        REWARD_ACC.append(np.mean(reward_buffer))
        LOSS_ACC.append(loss.item())
        
    if step == MAX_EP:
        break


Step 0
Avg Reward 0.0
Loss tensor(0.5001, grad_fn=<SmoothL1LossBackward0>)

Step 1000
Avg Reward 20.224489795918366
Loss tensor(0.4838, grad_fn=<SmoothL1LossBackward0>)

Step 2000
Avg Reward 20.905263157894737
Loss tensor(9.2342, grad_fn=<SmoothL1LossBackward0>)

Step 3000
Avg Reward 20.75
Loss tensor(1.7095, grad_fn=<SmoothL1LossBackward0>)

Step 4000
Avg Reward 19.41
Loss tensor(0.4398, grad_fn=<SmoothL1LossBackward0>)

Step 5000
Avg Reward 17.34
Loss tensor(0.4080, grad_fn=<SmoothL1LossBackward0>)

Step 6000
Avg Reward 17.27
Loss tensor(0.4203, grad_fn=<SmoothL1LossBackward0>)

Step 7000
Avg Reward 17.84
Loss tensor(1.7238, grad_fn=<SmoothL1LossBackward0>)

Step 8000
Avg Reward 14.1
Loss tensor(0.3309, grad_fn=<SmoothL1LossBackward0>)

Step 9000
Avg Reward 12.5
Loss tensor(0.4731, grad_fn=<SmoothL1LossBackward0>)

Step 10000
Avg Reward 11.13
Loss tensor(0.3230, grad_fn=<SmoothL1LossBackward0>)

Step 11000
Avg Reward 9.47
Loss tensor(0.1631, grad_fn=<SmoothL1LossBackward0>)

Step 12

In [6]:
with open('DQN.txt', 'w') as f:
    f.write(str(REWARD_ACC))
    f.write("\n")
    f.write(str(LOSS_ACC))