In [None]:
# Install the Box2D Python bindings (presumably needed by the LunarLander-v2
# environment created further down — confirm).
!pip3 install box2d-py
# NOTE(review): gym's Box2D extra is normally spelled `box2d`; with `Box_2D`
# pip most likely ignores the extra and installs plain gym — confirm against
# the pip output. Neither package is version-pinned, so a fresh run may pull a
# gym release with a different reset()/step() API.
!pip3 install gym[Box_2D]

Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[?25l[K     |▊                               | 10 kB 15.3 MB/s eta 0:00:01[K     |█▌                              | 20 kB 9.6 MB/s eta 0:00:01[K     |██▏                             | 30 kB 8.4 MB/s eta 0:00:01[K     |███                             | 40 kB 7.9 MB/s eta 0:00:01[K     |███▋                            | 51 kB 6.7 MB/s eta 0:00:01[K     |████▍                           | 61 kB 7.8 MB/s eta 0:00:01[K     |█████▏                          | 71 kB 8.4 MB/s eta 0:00:01[K     |█████▉                          | 81 kB 8.2 MB/s eta 0:00:01[K     |██████▋                         | 92 kB 9.0 MB/s eta 0:00:01[K     |███████▎                        | 102 kB 9.1 MB/s eta 0:00:01[K     |████████                        | 112 kB 9.1 MB/s eta 0:00:01[K     |████████▊                       | 122 kB 9.1 MB/s eta 0:00:01[K     |█████████▌                      | 133 kB 9.1 MB/s

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym
from tqdm.notebook import tqdm

In [None]:
class DeepQNetwork(nn.Module):
  """Three-layer fully connected network that maps a state to one value per action.

  The module owns its own Adam optimizer and MSE loss, and moves itself to
  the first CUDA device when one is available (CPU otherwise).

  Args:
    lr: learning rate passed to the Adam optimizer.
    input_dims: sequence that is star-unpacked into the first ``nn.Linear``
      (e.g. ``[8]`` for an 8-feature state).
    fc1_dims: width of the first hidden layer.
    fc2_dims: width of the second hidden layer.
    n_actions: number of outputs of the final layer.
  """

  def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
    super().__init__()

    # Keep the layer sizes available as plain attributes.
    self.input_dims = input_dims
    self.fc1_dims, self.fc2_dims = fc1_dims, fc2_dims
    self.n_actions = n_actions

    # Layers are built input -> output so parameter initialisation order
    # (and therefore seeded results) stays the same.
    self.fc1 = nn.Linear(*input_dims, fc1_dims)
    self.fc2 = nn.Linear(fc1_dims, fc2_dims)
    self.fc3 = nn.Linear(fc2_dims, n_actions)

    self.optimizer = optim.Adam(self.parameters(), lr=lr)
    self.loss = nn.MSELoss()

    if torch.cuda.is_available():
      device_name = 'cuda:0'
    else:
      device_name = 'cpu'
    self.device = torch.device(device_name)
    self.to(self.device)

  def forward(self, state):
    """Return the raw (un-activated) output of the last layer for ``state``."""
    hidden = state
    for layer in (self.fc1, self.fc2):
      hidden = F.relu(layer(hidden))
    return self.fc3(hidden)

In [None]:
class Agent():
  """Epsilon-greedy DQN agent with a fixed-size circular replay memory.

  Args:
    gamma: discount factor applied to the bootstrapped next-state value.
    epsilon: initial probability of picking a random action.
    lr: learning rate forwarded to the Q-network's optimizer.
    input_dims: sequence describing one observation's shape (e.g. ``[8]``).
    batch_size: number of transitions sampled per learning step.
    n_actions: number of discrete actions.
    max_memory_size: capacity of the replay memory.
    eps_end: lower bound for epsilon.
    eps_dec: amount subtracted from epsilon after every learning step.
  """

  def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, max_memory_size = 10000, eps_end = 0.01, eps_dec = 5e-4):
    self.gamma = gamma
    self.epsilon = epsilon
    self.eps_min = eps_end
    self.eps_dec = eps_dec
    self.lr = lr

    self.action_space = list(range(n_actions))

    self.mem_size = max_memory_size
    self.batch_size = batch_size
    self.mem_cntr = 0  # total number of transitions ever stored

    self.Q_eval = DeepQNetwork(self.lr, input_dims=input_dims, fc1_dims=256, fc2_dims=256, n_actions=n_actions)

    # Pre-allocated replay memory, one row per stored transition.
    self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
    self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
    self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
    self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
    # `np.bool` was deprecated in NumPy 1.20 and later removed; `np.bool_`
    # is the actual NumPy boolean scalar type.
    self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

  def store_transition(self, state, action, reward, state_, done):
    """Write one transition into the replay memory, overwriting the oldest slot when full."""
    # Once the memory is full, wrap around to index 0 and overwrite from there.
    index = self.mem_cntr % self.mem_size
    self.state_memory[index] = state
    self.new_state_memory[index] = state_
    self.reward_memory[index] = reward
    self.action_memory[index] = action
    self.terminal_memory[index] = done
    self.mem_cntr += 1

  def choose_action(self, observation):
    """Return an action: greedy w.r.t. the Q-network with probability ``1 - epsilon``, otherwise random."""
    if np.random.random() > self.epsilon:
      # Cast to float32 (the network's parameter dtype) and add a batch axis.
      # Going through one ndarray avoids building a tensor from a list of
      # arrays and keeps float64 observations from reaching the float32 layers.
      obs = np.asarray(observation, dtype=np.float32)[np.newaxis, ...]
      state = torch.tensor(obs).to(self.Q_eval.device)
      # Action selection needs no gradients.
      with torch.no_grad():
        actions = self.Q_eval.forward(state)
      action = torch.argmax(actions).item()
    else:
      action = np.random.choice(self.action_space)

    return action

  def learn(self):
    """Run one gradient step on a random minibatch and decay epsilon.

    Does nothing until at least ``batch_size`` transitions have been stored.
    """
    if self.mem_cntr < self.batch_size:
      return

    device = self.Q_eval.device

    # Sample distinct indices from the filled part of the memory only.
    max_mem = min(self.mem_cntr, self.mem_size)
    batch = np.random.choice(max_mem, self.batch_size, replace=False)

    state_batch = torch.tensor(self.state_memory[batch]).to(device)
    new_state_batch = torch.tensor(self.new_state_memory[batch]).to(device)
    reward_batch = torch.tensor(self.reward_memory[batch]).to(device)
    terminal_batch = torch.tensor(self.terminal_memory[batch]).to(device)
    # Index tensors are converted to int64 on the network's device.
    action_batch = torch.tensor(self.action_memory[batch], dtype=torch.long).to(device)
    batch_index = torch.arange(self.batch_size, device=device)

    self.Q_eval.optimizer.zero_grad()

    # Q(s, a) for the action that was actually taken in each sampled transition.
    q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]

    # The TD target is treated as a constant: compute it without autograd so
    # the loss does not back-propagate through the bootstrapped estimate.
    with torch.no_grad():
      q_next = self.Q_eval.forward(new_state_batch)
      q_next[terminal_batch] = 0.0  # no future value after a terminal state
      q_target = reward_batch + self.gamma * torch.max(q_next, dim=1)[0]

    loss = self.Q_eval.loss(q_eval, q_target)
    loss.backward()
    self.Q_eval.optimizer.step()

    # Linear decay, clamped so epsilon never drops below eps_min.
    self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

In [None]:
# Train the DQN agent on LunarLander-v2, printing per-episode statistics.
env = gym.make('LunarLander-v2')

# NOTE(review): lr=0.03 looks high for Adam on this task — confirm it is intentional.
agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4, eps_end=0.01, input_dims=[8], lr=0.03)
scores, eps_history = [], []
n_games = 500

for i in range(n_games):
  score = 0
  done = False
  # Newer gym releases return (observation, info) from reset(); older ones
  # return just the observation. Accept both.
  reset_result = env.reset()
  observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
  while not done:
    action = agent.choose_action(observation)
    # Newer gym releases return a 5-tuple with separate terminated/truncated
    # flags; older ones return a 4-tuple with a single done flag.
    step_result = env.step(action)
    if len(step_result) == 5:
      observation_, reward, terminated, truncated, info = step_result
      done = terminated or truncated
    else:
      observation_, reward, done, info = step_result
    score += reward
    agent.store_transition(observation, action, reward, observation_, done)
    agent.learn()
    observation = observation_
  scores.append(score)
  eps_history.append(agent.epsilon)

  # Mean over the most recent (up to) 100 episodes. The previous slice
  # `scores[100:]` was empty for the first 100 episodes and produced nan.
  avg_score = np.mean(scores[-100:])
  print('epsode :', i, 'score %.2f' %score, 
        'average scroe %.2f' %avg_score,
        'epsilon %.2f' %agent.epsilon )

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


epsode : 0 score -147.97 average scroe nan epsilon 1.00
epsode : 1 score -242.60 average scroe nan epsilon 0.94
epsode : 2 score -129.72 average scroe nan epsilon 0.89
epsode : 3 score -125.69 average scroe nan epsilon 0.84
epsode : 4 score -291.08 average scroe nan epsilon 0.80
epsode : 5 score -229.53 average scroe nan epsilon 0.73
epsode : 6 score -181.57 average scroe nan epsilon 0.68
epsode : 7 score -109.12 average scroe nan epsilon 0.61
epsode : 8 score -182.50 average scroe nan epsilon 0.57
epsode : 9 score -134.93 average scroe nan epsilon 0.51
epsode : 10 score -216.13 average scroe nan epsilon 0.41
epsode : 11 score -228.03 average scroe nan epsilon 0.30
epsode : 12 score -274.81 average scroe nan epsilon 0.18
epsode : 13 score -100.12 average scroe nan epsilon 0.11
epsode : 14 score -131.87 average scroe nan epsilon 0.03
epsode : 15 score -73.13 average scroe nan epsilon 0.01
epsode : 16 score -163.16 average scroe nan epsilon 0.01
epsode : 17 score -82.63 average scroe nan