In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
   ---------------------------------------- 10.2/958.1 kB ? eta -:--:--
   - ------------------------------------- 41.0/958.1 kB 495.5 kB/s eta 0:00:02
   ----- ---------------------------------- 133.1/958.1 kB 1.1 MB/s eta 0:00:01
   ------------------ --------------------- 450.6/958.1 kB 2.8 MB/s eta 0:00:01
   ----------------------------------- ---- 839.7/958.1 kB 4.1 MB/s eta 0:00:01
   ---------------------------------------  952.3/958.1 kB 4.3 MB/s eta 0:00:01
   ---------------------------------------- 958.1/958.1 kB 3.4 MB/s eta 0:00:00
Downloading Farama_Notification

In [None]:
# 1. make
import gymnasium as gym
env = gym.make('CartPole-v1')

In [5]:
# 2. reset
observation, info = env.reset()

In [None]:
# 3. step
observation, reward, terminated, truncated, info = env.step(action)

In [19]:
# 4. render(model)

import gymnasium as gym
import time

env = gym.make("CartPole-v1", render_mode = "human")
state, info = env.reset()

for i in range(100):
    action = 0
    next_state, reward, terminated, truncated, info = env.step(action)

    env.render()
    time.sleep(0.01)
    state = next_state

    if (terminated):
        state, info = env.reset()

env.close()

In [1]:
# DQN Model

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random

class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(4, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, 2)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))

        return self.layer3(x)

    def sample_action(self, state, eps):
        out = self.forward(state)
        coin = random.random()
        if (coin < eps):
            return random.randint(0, 1)
        else:
            return out.argmax().item()

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device : {device}")

learning_rate = 0.0005

q_net = DQN().to(device)
q_target_net = DQN().to(device)
q_target_net.load_state_dict(q_net.state_dict())
optimizer = optim.Adam(q_net.parameters(), lr = learning_rate)

device : cpu


In [3]:
# Replay Buffer

import random
import collections

class ReplayBuffer():
    def __init__(self, buffer_limit):
        self.buffer = collections.deque(maxlen = buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_list, a_list, r_list, s_next_list, done_list = [], [], [], [], []

        for transition in mini_batch:
            s, a, r, s_next, done = transition
            s_list.append(s)
            a_list.append([a])
            r_list.append([r])
            s_next_list.append(s_next)
            done_list.append([done])

        return torch.tensor(s_list, dtype = torch.float, device = device), \
                torch.tensor(a_list, device = device), \
                torch.tensor(r_list, device = device), \
                torch.tensor(s_next_list, device = device), \
                torch.tensor(done_list, device = device)
    
    def size(self):
        return len(self.buffer)

memory = ReplayBuffer(10000)

In [4]:
# Train Model

batch_size = 128
gamma = 1.0

def model_trainer(q_net, q_target_net, memory):
    for i in range(10):
        state, action, reward, s_next, done = memory.sample(batch_size)

        q_out = q_net(state)
        q_a = q_out.gather(1, action)

        max_q_a_next = q_target_net(s_next).max(1)[0].unsqueeze(1)
        target = reward + gamma * max_q_a_next * done

        loss = F.smooth_l1_loss(q_a, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [5]:
import gymnasium as gym
env = gym.make("CartPole-v1")

step_count = 1
score = 0
print_interval = 20

for n_epi in range(1500):
    epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))
    s, info = env.reset()

    done = False
    while not done:
        a = q_net.sample_action(torch.from_numpy(s).float().to(device), epsilon)

        s_next, reward, done, truncated, info = env.step(a)
        done_mask = 0.0 if done else 1.0

        memory.put((s, a, reward, s_next, done_mask))
        s = s_next
        score += reward

        if (score / step_count > 1000):
            break

        if (done):
            break

    step_count += 1
    if (memory.size() > 2000):
        model_trainer(q_net, q_target_net, memory)

    if (n_epi % print_interval == 0) and (n_epi > 0):
        q_target_net.load_state_dict(q_net.state_dict())
        print("n_episode : {}, score : {}, n_buffer : {}, eps : {:.1f}%".format(n_epi, score / print_interval, memory.size(), epsilon * 100))

        step_count = 1
        score = 0

    env.close()

n_episode : 20, score : 41.5, n_buffer : 830, eps : 7.9%
n_episode : 40, score : 33.8, n_buffer : 1506, eps : 7.8%


  return torch.tensor(s_list, dtype = torch.float, device = device), \


n_episode : 60, score : 38.5, n_buffer : 2276, eps : 7.7%
n_episode : 80, score : 24.75, n_buffer : 2771, eps : 7.6%
n_episode : 100, score : 20.45, n_buffer : 3180, eps : 7.5%
n_episode : 120, score : 33.9, n_buffer : 3858, eps : 7.4%
n_episode : 140, score : 45.0, n_buffer : 4758, eps : 7.3%
n_episode : 160, score : 55.6, n_buffer : 5870, eps : 7.2%
n_episode : 180, score : 221.0, n_buffer : 10000, eps : 7.1%
n_episode : 200, score : 243.85, n_buffer : 10000, eps : 7.0%
n_episode : 220, score : 243.15, n_buffer : 10000, eps : 6.9%
n_episode : 240, score : 238.95, n_buffer : 10000, eps : 6.8%
n_episode : 260, score : 248.15, n_buffer : 10000, eps : 6.7%
n_episode : 280, score : 233.75, n_buffer : 10000, eps : 6.6%
n_episode : 300, score : 300.55, n_buffer : 10000, eps : 6.5%
n_episode : 320, score : 248.65, n_buffer : 10000, eps : 6.4%
n_episode : 340, score : 228.75, n_buffer : 10000, eps : 6.3%
n_episode : 360, score : 187.85, n_buffer : 10000, eps : 6.2%
n_episode : 380, score : 22

In [6]:
# result

import time
env = gym.make("CartPole-v1", render_mode = "human")
state, info = env.reset()
state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)

for i in range(500):
    action = q_net.sample_action(state, epsilon)
    next_state, reward, terminated, truncated, info = env.step(action)

    env.render()

    time.sleep(0.01)

    state = next_state
    state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
    if (terminated):
        state, info = env.reset()
        state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)

env.close()