In [None]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

In [None]:
# Discount factor applied to the bootstrapped next-state value in the TD target.
GAMMA = 0.99

# Exploration schedule: epsilon is linearly interpolated from EPSILON_START
# down to EPSILON_END over the first EPSILON_DECAY environment steps.
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 10_000

In [None]:
class Network(nn.Module):
    """Fully connected Q-network with a single 64-unit Tanh hidden layer.

    Maps an observation to one Q-value estimate per discrete action.
    """

    def __init__(self, env):
        """Size the layers from the environment's spaces.

        Args:
            env: Object exposing ``observation_space.shape`` (its product is
                the input width) and ``action_space.n`` (the output width).
        """
        super().__init__()

        # Total number of scalars in one observation.
        in_features = int(np.prod(env.observation_space.shape))

        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n)
        )

    def forward(self, x):
        """Return Q-value estimates for ``x``.

        Args:
            x: Float tensor whose last dimension equals the network's input
                width.

        Returns:
            Tensor with the same leading dimensions as ``x`` and one entry per
            action in the last dimension.
        """
        return self.net(x)

    def act(self, state):
        """Return the greedy action for a single state.

        Args:
            state: One observation (anything ``torch.as_tensor`` accepts);
                it is converted to float32 and given a batch dimension of 1.

        Returns:
            int: Index of the action with the largest estimated Q-value.
        """
        state_t = torch.as_tensor(state, dtype=torch.float32)

        # Action selection never needs gradients, so skip building the
        # autograd graph for this forward pass.
        with torch.no_grad():
            q_values = self(state_t.unsqueeze(0))        # shape (1, n_actions)

        # Index of the largest Q-value in the single batch row.
        max_q_index = torch.argmax(q_values, dim=1)[0]

        return max_q_index.item()

In [None]:
# Adam step size. Kept far below 1e-1, which is two orders of magnitude above
# Adam's default (1e-3) and makes updates of this small Tanh network unstable.
# NOTE(review): 5e-4 is a conventional starting point for this task; tune as needed.
LEARNING_RATE = 5e-4

env = gym.make('CartPole-v1')

# Running return of the episode in progress, and the number of finished episodes.
episode_reward = 0.0
episode = 0

net = Network(env)

optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [None]:
# Main Training Loop
#
# Online one-step Q-learning: after every environment step the network is
# updated on the single transition (state, action, reward, done, new_state).

# Stopping rule: training ends once the mean return over the last
# SOLVED_WINDOW finished episodes reaches SOLVED_MEAN_REWARD.
# NOTE(review): if that level is never reached the loop runs until interrupted.
SOLVED_WINDOW = 100
SOLVED_MEAN_REWARD = 195
reward_buffer = deque(maxlen=SOLVED_WINDOW)

state = env.reset()                                                             # example state: [-0.01713841 -0.00705756 -0.04146662 -0.04927411]

for step in itertools.count():                                                  # step counts up from 0 until the break below is reached

    # Epsilon decreases linearly from EPSILON_START to EPSILON_END over the
    # first EPSILON_DECAY steps; np.interp holds it at EPSILON_END afterwards.
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

    # Epsilon-greedy: with probability epsilon take a random action,
    # otherwise ask the network for its greedy action in the current state.
    if random.random() <= epsilon:
        action = env.action_space.sample()
    else:
        action = net.act(state)

    new_state, reward, done, _ = env.step(action)

    episode_reward = episode_reward + reward

    # Build the tensors BEFORE `state` is advanced or reset, so that state_t
    # really is the state in which `action` was taken.
    state_t     = torch.as_tensor(state, dtype=torch.float32)
    action_t    = torch.as_tensor(action, dtype=torch.int64).unsqueeze(-1)
    reward_t    = torch.as_tensor(reward, dtype=torch.float32).unsqueeze(-1)
    done_t      = torch.as_tensor(done, dtype=torch.float32).unsqueeze(-1)
    new_state_t = torch.as_tensor(new_state, dtype=torch.float32)

    # Compute Targets
    # The bootstrap target is treated as a constant: no gradient may flow
    # through it, otherwise the loss would also pull the target toward the
    # prediction.
    with torch.no_grad():
        target_q_values = net(new_state_t)
        max_target_q_values = target_q_values.max(dim=0, keepdim=True)[0]
        # (1 - done_t) removes the bootstrap term on terminal transitions.
        targets = reward_t + GAMMA * (1 - done_t) * max_target_q_values

    # Compute Loss
    q_values = net(state_t)
    # Q-value of the action that was actually taken.
    action_q_values = torch.gather(input=q_values, dim=0, index=action_t)
    loss = nn.functional.smooth_l1_loss(action_q_values, targets)

    # Gradient Descent
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Advance the environment state only after the update has used it.
    if done:

        print(f"Episode: {episode} | Reward: {episode_reward}")

        reward_buffer.append(episode_reward)

        state = env.reset()
        episode_reward = 0.0
        episode += 1

        if len(reward_buffer) == SOLVED_WINDOW and np.mean(reward_buffer) >= SOLVED_MEAN_REWARD:
            break
    else:
        state = new_state

Episode: 0 | Reward: 14.0
Episode: 1 | Reward: 32.0
Episode: 2 | Reward: 13.0
Episode: 3 | Reward: 19.0
Episode: 4 | Reward: 11.0
Episode: 5 | Reward: 22.0
Episode: 6 | Reward: 13.0
Episode: 7 | Reward: 13.0
Episode: 8 | Reward: 15.0
Episode: 9 | Reward: 27.0
Episode: 10 | Reward: 31.0
Episode: 11 | Reward: 38.0
Episode: 12 | Reward: 31.0
Episode: 13 | Reward: 20.0
Episode: 14 | Reward: 14.0
Episode: 15 | Reward: 9.0
Episode: 16 | Reward: 14.0
Episode: 17 | Reward: 33.0
Episode: 18 | Reward: 31.0
Episode: 19 | Reward: 8.0
Episode: 20 | Reward: 67.0
Episode: 21 | Reward: 11.0
Episode: 22 | Reward: 9.0
Episode: 23 | Reward: 32.0
Episode: 24 | Reward: 27.0
Episode: 25 | Reward: 19.0
Episode: 26 | Reward: 22.0
Episode: 27 | Reward: 15.0
Episode: 28 | Reward: 11.0
Episode: 29 | Reward: 12.0
Episode: 30 | Reward: 12.0
Episode: 31 | Reward: 13.0
Episode: 32 | Reward: 37.0
Episode: 33 | Reward: 25.0
Episode: 34 | Reward: 22.0
Episode: 35 | Reward: 10.0
Episode: 36 | Reward: 15.0
Episode: 37 | 

KeyboardInterrupt: ignored