In [1]:
import torch
try:
    import torch_directml
    global device
    device = torch_directml.device()
except (NameError, ModuleNotFoundError):
    import intel_npu_acceleration_library
    from intel_npu_acceleration_library import compile

    device = torch.device("cpu")
except (NameError, ModuleNotFoundError):
    device = torch.device(
        "cuda"
        if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available() else "cpu"
    )

In [2]:
import gymnasium as gym  # As a best practice, Gymnasium is usually imported as 'gym'
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import matplotlib
import matplotlib.pyplot as plt
from gymnasium.wrappers import RecordVideo
from collections import namedtuple, deque
from itertools import count
import ipywidgets as widgets
import warnings


warnings.filterwarnings('ignore')

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

env = gym.make('Acrobot-v1')
print(env.spec)

plt.ion() # matplotlib를 interactive mode로 설정 -> 그래프를 실시간으로 업데이트할 수 있도록 함
device

EnvSpec(id='Acrobot-v1', entry_point='gymnasium.envs.classic_control.acrobot:AcrobotEnv', reward_threshold=-100.0, nondeterministic=False, max_episode_steps=500, order_enforce=True, disable_env_checker=False, kwargs={}, namespace=None, name='Acrobot', version=1, additional_wrappers=(), vector_entry_point=None)


device(type='privateuseone', index=0)

In [3]:
def plot_rewards(show_result=False):
    plt.figure(1)
    rewards_t = torch.tensor(episode_rewards, dtype=torch.float)
    
    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.plot(rewards_t.numpy(), label='Episode Reward')
    
    # 100 에피소드 이동 평균 계산 및 플롯
    if len(rewards_t) >= 100:
        means = rewards_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy(), label='100 Episode Avg', linestyle='--')
    
    plt.legend()
    plt.pause(0.001)

    if is_ipython:
        display.display(plt.gcf())
        display.clear_output(wait=True)


In [4]:
class DuelingRQN(nn.Module):
    def __init__(self, n_observations, n_actions, n_sequence):
        super(DuelingRQN, self).__init__()
        # input: (N, seq, 4), output: (output(N, seq, h_out), hiddenstate(layers, 32, h_out)). seq는 일단 1로 
        # hidden state: (layers, seq, hidden_size)
        self.rnn = nn.RNN(n_observations, hidden_size=48, num_layers=3, batch_first=True) 
        # DuelingDQN 적용
        self.flatten = nn.Flatten()
        self.fc_value = nn.Linear(48, 1)
        self.fc_advantage = nn.Linear(48, n_actions)
    def forward(self, x):
        out, hidden = self.rnn(x) # (BATCH, seq, 24), (3, BATCH, 24)
        out = self.flatten(F.tanh(out))
        # 상태값과 행동 이점 계산
        value = self.fc_value(out)
        advantage = self.fc_advantage(out)

        # 최종 Q값 계산
        q_values = value + advantage
        return q_values

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

In [5]:
# 학습 파라미터
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.1
EPS_DECAY = 10000 # 크게 할수록 캄험이 더 오래 지속 -> 새로운 더 많이 탐색
TAU = 0.005
LR = 1e-4
SEED = 42

n_actions = env.action_space.n # 3

n_observations = env.observation_space._shape[0]
n_sequence = 16

# 입력 텐서 형식은 (배치 크기, 채널 수, 높이, 너비)
policy_net = DuelingRQN(n_observations, n_actions, n_sequence).to(device)
target_net = DuelingRQN(n_observations, n_actions, n_sequence).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR, amsgrad=True)
train_data: list[Transition] = []

steps_done = 0

episode_rewards = []

if torch.cuda.is_available() or torch.backends.mps.is_available():
    num_episodes = 7200
else:
    num_episodes = 600

env.action_space.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f7a481ad0b0>

In [6]:
def optimize_model(train_data, loss_fn, optimizer):
    batch = Transition(*zip(*train_data)) 
    # 다음 상태 중 None이 아닌 것만 선택
    non_final_mask = torch.tensor(
        tuple(map(lambda s: s is not None, batch.next_state)),
        device=device,
        dtype=torch.bool,
    )
    non_final_next_states = torch.stack(
        [s for s in batch.next_state if s is not None]
    ).to(device)
    # 모든 상태를 (배치 크기, 채널 수) 형식으로 연결
    state_batch = torch.cat(batch.state).to(device)  # (BATCH_SIZE, 6)
    action_batch = torch.cat(batch.action).to(device) # (BATCH_SIZE, )
    reward_batch = torch.cat(batch.reward).to(device)  # 배치 처리를 최대한 병렬로 처리

    # Q값 계산
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # 다음 상태의 Q값 계산
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        # Double DQN으로 수정
        next_actions = policy_net(non_final_next_states).max(1).indices.unsqueeze(1)
        next_state_values[non_final_mask] = (
            target_net(non_final_next_states).gather(1, next_actions).squeeze(1)
        )
    # 기대 Q값 계산
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber 손실 함수와 중요도 가중치 적용
    loss = loss_fn(state_action_values, expected_state_action_values.unsqueeze(1))  # reduction="none"
    # 모델 최적화
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
        -1.0 * steps_done / EPS_DECAY
    )
    steps_done += 1

    state = state.to(device)

    # 초기 eps_threshold는 1.0이다. steps_done이 증가함에 따라 점진적으로 감소하여 결국 Q-value가 가장 큰 행동을 선택.
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max(1).indices.view(1, 1)

    else:
        return torch.tensor(
            [[env.action_space.sample()]], device=device, dtype=torch.long
        )

In [7]:
def save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards, path="models"):
    if not os.path.exists(path):
        os.makedirs(path)
    save_path = os.path.join(path, f"policy_net_episode_{episode}.pth")
    torch.save({
        'episode': episode,
        'model_state_dict': policy_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'steps_done': steps_done,  # 탐험 상태 저장
        'episode_rewards': episode_rewards
    }, save_path)
    print(f"Model saved to {save_path}")


def load_model(policy_net, optimizer, path):
    checkpoint = torch.load(path)

    # 모델 가중치와 옵티마이저 복원
    policy_net.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # 에피소드 및 탐험상태 복원
    episode = checkpoint['episode']
    steps_done = checkpoint.get('steps_done', 0)  # 기본값 0
    
    # 에피소드별 보상 복원
    episode_rewards = checkpoint.get('episode_rewards', [])

    print(f"Model loaded from {path}, starting from episode {episode}, steps_done={steps_done}")
    return episode, steps_done, episode_rewards




In [8]:
# 저장된 모델에서 학습 재개
# checkpoint_path = "models/policy_net_episode_4.pth"
# start_episode, steps_done, episode_rewards = load_model(policy_net, optimizer, checkpoint_path)

In [9]:
try:
    loss_fn = nn.SmoothL1Loss()
    optimizer = optim.Adam(policy_net.parameters(), lr=LR)
    memory = deque(maxlen=BATCH_SIZE)
    for episode in range(num_episodes+1):
        state, info = env.reset(seed=42)
        state = torch.tensor(state, device=device).unsqueeze(0)

        episode_reward = 0
        action_counts = np.zeros(env.action_space.n)
        train_data = []

        for t in count(): # 무한 반복
            if episode < 10:  # 초기 10 에피소드는 무조건 랜덤 행동
                action = torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
            else:
                action = select_action(state)

            action_counts[action] += 1
            # 행동을 선택하면, 해당 행동이 4개의 연속된 프레임 동안 환경에 반복적으로 적용
            # 프레임 스킵 동안 발생한 보상이 모두 합산되어 반환, 마지막 프레임의 상태와 종료 여부가 반환
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)
            episode_reward += reward.item()
            done = terminated or truncated

            if not done:
                next_state = torch.tensor(next_state, device=device).unsqueeze(0)
            else:
                next_state = None

            memory.append((state, action, next_state, reward))
            state = next_state

            # policy network 학습
            if t != 0 and t % BATCH_SIZE == 0:
                optimize_model(memory, loss_fn=loss_fn, optimizer=optimizer)

            if t % (BATCH_SIZE * 4) == 0:
                # 소프트 업데이트 적용 (target network 업데이트) -> 16 프레임마다 학습이 이루어짐
                target_net_state_dict = target_net.state_dict()
                policy_net_state_dict = policy_net.state_dict()
                for key in policy_net_state_dict:
                    target_net_state_dict[key] = policy_net_state_dict[key] * TAU + target_net_state_dict[key] * (1-TAU)
                target_net.load_state_dict(target_net_state_dict)

            if done:
                episode_rewards.append(episode_reward)
                break

        if episode % 100 == 0:
            save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards)

        if episode % 5 == 0:
            # 행동 비율 출력
            action_distribution = action_counts / action_counts.sum()
            print(f"Action distribution: {action_distribution}")            
            plot_rewards()

    print('Complete')
    plot_rewards(show_result=True)
    plt.ioff()
    plt.show()

except KeyboardInterrupt:
    print("Training interrupted. Saving the current model...")
    save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards)
    env.close()
    print("Model saved. Exiting.")

<Figure size 640x480 with 0 Axes>

In [10]:
episode_rewards

[-500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -415.0,
 -409.0,
 -495.0,
 -500.0,
 -397.0,
 -500.0,
 -452.0,
 -286.0,
 -488.0,
 -500.0,
 -324.0,
 -447.0,
 -353.0,
 -426.0,
 -465.0,
 -235.0,
 -487.0,
 -316.0,
 -331.0,
 -500.0,
 -165.0,
 -277.0,
 -283.0,
 -199.0,
 -267.0,
 -238.0,
 -161.0,
 -416.0,
 -449.0,
 -258.0,
 -245.0,
 -289.0,
 -270.0,
 -195.0,
 -224.0,
 -184.0,
 -383.0,
 -222.0,
 -231.0,
 -250.0,
 -155.0,
 -155.0,
 -203.0,
 -165.0,
 -185.0,
 -132.0,
 -226.0,
 -154.0,
 -189.0,
 -109.0,
 -176.0,
 -118.0,
 -109.0,
 -179.0,
 -101.0,
 -107.0,
 -201.0,
 -151.0,
 -109.0,
 -160.0,
 -260.0,
 -169.0,
 -191.0,
 -143.0,
 -167.0,
 -154.0,
 -146.0,
 -123.0,
 -234.0,
 -183.0,
 -138.0,
 -139.0,
 -132.0,
 -160.0,
 -182.0,
 