In [4]:
import gymnasium as gym
import os
import math
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
from gymnasium.wrappers import RecordVideo
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from gymnasium.envs.registration import register
from gymnasium.envs.classic_control.acrobot import AcrobotEnv

# 파라미터
BATCH_SIZE = 128        # 128
GAMMA = 0.99            # 0.99
EPS_START = 1.0         # 0.9,    -> 1.0
EPS_END = 0.1           # 0.05,   -> 0.1
EPS_DECAY = 20000       # 1000,   -> 20000
TAU = 0.005             # 0.005
LR = 5e-4               # 1e-4,   -> 5e-4
memory_capacity = 50000 # 10000   -> 50000
NEURON = 256            # 128     -> 256
update_cycle = 1        # 8
iter_epi = 1000         # 600     -> 1000
LINK1 = 1.0             # 1.0     -> 1.5, 0.5
LINK2 = 1.0             # 1.0     -> 0.5, 1.5
# OPTIMIZER = AdamW(adam)
# LOSS FUNCTION = Smooth L1 Loss(MSE)
# ACTIVATE FUNCTION = LeakyReLU(relu)
# MAX STEPS: 500


# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if GPU is to be used
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

print(device)

cpu


In [6]:
# 환경 수정: link 길이 수정
class CustomAcrobotEnv(AcrobotEnv):
    def __init__(self, link1_length=1.0, link2_length=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.LINK_LENGTH_1 = link1_length
        self.LINK_LENGTH_2 = link2_length

register(
    id='CustomAcrobot-v0',
    entry_point=__name__ + ':CustomAcrobotEnv',
    kwargs={'link1_length': LINK1, 'link2_length': LINK2},  # Set your desired lengths
    max_episode_steps=500
)

# Load the custom environment
env = gym.make('CustomAcrobot-v0', render_mode='rgb_array')
env = RecordVideo(
    env, "./DDQN-videos", episode_trigger=lambda x: x in [1, 50, 250, 1000]
)
print(env.spec)

EnvSpec(id='CustomAcrobot-v0', entry_point='__main__:CustomAcrobotEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=500, order_enforce=True, disable_env_checker=False, kwargs={'link1_length': 1.0, 'link2_length': 1.0, 'render_mode': 'rgb_array'}, namespace=None, name='CustomAcrobot', version=0, additional_wrappers=(WrapperSpec(name='RecordVideo', entry_point='gymnasium.wrappers.rendering:RecordVideo', kwargs={'video_folder': './DDQN-videos', 'episode_trigger': <function <lambda> at 0x7fe4b9bdc040>, 'step_trigger': None, 'video_length': 0, 'name_prefix': 'rl-video', 'disable_logger': True}),), vector_entry_point=None)


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [7]:
# Set seed
SEED = 42

env.action_space.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fe56bd09450>

In [8]:
def plot_rewards(show_result=False):
    plt.figure(figsize=(10, 6))
    rewards_t = torch.tensor(episode_rewards, dtype=torch.float)

    plt.clf()
    plt.title('Result')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.plot(rewards_t.numpy(), label='Episode Reward')

    # 100 에피소드 이동 평균 계산 및 플롯
    if len(rewards_t) >= 100:
        means = rewards_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy(), label='100 Episode Avg', linestyle='--')

    plt.legend()
    plt.pause(0.001)

    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())
            
def save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards, path="models"):
    if not os.path.exists(path):
        os.makedirs(path)
    save_path = os.path.join(path, f"policy_net_episode_{episode}.pth")
    torch.save({
        'episode': episode,
        'model_state_dict': policy_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'steps_done': steps_done,  # 탐험 상태 저장
        'memory': list(memory.memory), # 경험 저장
        'priorities': list(memory.priorities), # 우선순위 저장
        'episode_rewards': episode_rewards
    }, save_path)
    print(f"Model saved to {save_path}")


def load_model(policy_net, optimizer, path):
    checkpoint = torch.load(path)

    # 모델 가중치와 옵티마이저 복원
    policy_net.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # 에피소드 및 탐험상태 복원
    episode = checkpoint['episode']
    steps_done = checkpoint.get('steps_done', 0)  # 기본값 0

    # 리플레이 메모리복원
    memory_data = checkpoint.get('memory', [])
    priority_data = checkpoint.get('priorities', [])
    if memory_data and priority_data:
        memory.memory = deque(memory_data, maxlen=memory.capacity)
        memory.priorities = deque(priority_data, maxlen=memory.capacity)

    # 에피소드별 보상 복원
    episode_rewards = checkpoint.get('episode_rewards', [])

    print(f"Model loaded from {path}, starting from episode {episode}, steps_done={steps_done}")
    return episode, steps_done, episode_rewards

In [9]:
# DQN Network
class DQN(nn.Module):
    def __init__(self, n_observations, n_actions, negative_slope=0.01):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, NEURON)
        self.layer2 = nn.Linear(NEURON, NEURON)
        self.layer3 = nn.Linear(NEURON, n_actions)
        self.negative_slope = negative_slope  # LeakyReLU parameter

    def forward(self, x):
        x = F.leaky_relu(self.layer1(x), negative_slope=self.negative_slope)
        x = F.leaky_relu(self.layer2(x), negative_slope=self.negative_slope)
        return self.layer3(x)

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

# ReplayMemor
class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args)) 

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size) 

    def __len__(self):
        return len(self.memory)

In [10]:
# Optimize
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = policy_net(state_batch).gather(1, action_batch)


    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        # DQN ===========================
        # next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
        # DQN ===========================
        
        # Double DQN ====================
        next_actions = policy_net(non_final_next_states).max(1).indices.unsqueeze(1)
        next_state_values[non_final_mask] = target_net(non_final_next_states).gather(1, next_actions).squeeze(1)
        # Double DQN ====================

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max(1).indices.view(1, 1) 
    else:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long) 

In [11]:
n_actions = env.action_space.n # action: 3

state, info = env.reset()
n_observations = len(state) # state: 6

policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
# target_net.eval()

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(memory_capacity)

steps_done = 0

episode_rewards = []
score = 0.0

if torch.cuda.is_available() or torch.backends.mps.is_available():
    num_episodes = iter_epi
else:
    num_episodes = 50

In [12]:
# # 비디오 녹화 코드 예시
# base_env = gym.make("ALE/IceHockey-v5", render_mode="rgb_array")
# env = RecordVideo(base_env, video_folder="./videos", disable_logger=True)
# done = False

# obs, info = env.reset()
# t = 0
# max_steps = 200

# # Simulate an episode
# while not done:

#     # Take a random action
#     action = env.action_space.sample()
#     new_obs, reward, terminated, truncated, info = env.step(action)

#     done = terminated or truncated or t > max_steps
#     t += 1

# # Close environment
# env.close()

# # Render recording
# widgets.Video.from_file(
#     f"./videos/rl-video-episode-0.mp4", autoplay=False, loop=False, width=700
# )

In [13]:
try:
    for episode in range(num_episodes+1):

        state, info = env.reset(seed=SEED)
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        episode_reward = 0

        for t in count(): # 무한 반복
            if episode < 10:  # 초기 10 에피소드는 무조건 랜덤 행동
                action = torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
            else:
                action = select_action(state)

            next_state, reward, terminated, truncated, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)
            done = terminated or truncated

            episode_reward += reward.item()
            
            score += reward.item()

            if not done:
                next_state = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)
            else:
                next_state = None

            memory.push(state, action, next_state, reward)

            state = next_state

            # if t % 4 == 0:
            optimize_model()

            # 8 스텝마다 target 복사(업데이트)
            if t % update_cycle == 0:
              target_net_state_dict = target_net.state_dict()
              policy_net_state_dict = policy_net.state_dict()
              for key in policy_net_state_dict:
                  target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
              target_net.load_state_dict(target_net_state_dict)

            if done:
              episode_rewards.append(episode_reward)
              break

        # 100 에피소드마다 모델 저장
        # if episode % 100 == 0:
            # save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards)

        # 10번마다 그래프 업데이트
        if episode % 10 == 0 and episode != 0:
            plot_rewards()
        #     상세 결과 출력
        #     print("n_episode :{}, score : {:.1f}, n_buffer : {}".format(episode, score/10, len(memory)))
        #     episode_rewards.append(score/10)
        #     score = 0.0
            

    print('Complete')
    plot_rewards(show_result=True)
    plt.ioff()
    plt.show()
    print('Average Reward: {}'.format(sum(episode_rewards)/num_episodes))

except KeyboardInterrupt:
    print("Training interrupted. Saving the current model...")
    # save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards)
    env.close()
    print("Model saved. Exiting.")

  torch._foreach_lerp_(device_exp_avgs, device_grads, 1 - beta1)


Training interrupted. Saving the current model...
Model saved. Exiting.


In [None]:
episode_rewards

[-500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -371.0,
 -309.0,
 -500.0,
 -458.0,
 -342.0,
 -326.0,
 -414.0,
 -356.0,
 -375.0,
 -413.0,
 -391.0,
 -263.0,
 -176.0,
 -178.0,
 -201.0,
 -233.0,
 -295.0,
 -241.0,
 -169.0,
 -279.0,
 -193.0,
 -270.0,
 -154.0,
 -225.0,
 -146.0,
 -201.0,
 -179.0,
 -111.0,
 -288.0,
 -297.0,
 -217.0,
 -240.0,
 -175.0,
 -170.0,
 -204.0,
 -212.0,
 -117.0,
 -136.0,
 -149.0,
 -183.0,
 -206.0,
 -146.0,
 -119.0,
 -171.0,
 -164.0,
 -109.0,
 -208.0,
 -205.0,
 -169.0,
 -128.0,
 -143.0,
 -144.0,
 -187.0,
 -146.0,
 -166.0,
 -145.0,
 -131.0,
 -123.0,
 -164.0,
 -168.0,
 -115.0,
 -155.0,
 -106.0,
 -163.0,
 -151.0,
 -129.0,
 -143.0,
 -111.0,
 -139.0,
 -122.0,
 -500.0,
 -123.0,
 -133.0,
 -435.0,
 -112.0,
 -152.0,
 -123.0,
 -176.0,
 -153.0,
 -115.0,
 -117.0,
 -135.0,
 -157.0,
 -92.0,
 -92.0,
 -102.0,
 -123.0,
 -156.0,
 -138.0,
 -170.0,
 -122.0,
 -124.0,
 -144.0,
 -206.0,
 -199.0,
 -138.0,
 -1