In [17]:
import torch
torch.cuda.empty_cache()

# Pick the compute device, in order of preference:
#   1. DirectML (torch_directml), if installed
#   2. CPU alongside the Intel NPU acceleration library, if installed
#   3. CUDA, then Apple MPS, then plain CPU
# The fallbacks are nested: a second `except` clause on the same `try` is
# never reached, and an import error raised inside a handler would propagate.
try:
    import torch_directml

    device = torch_directml.device()
except (NameError, ImportError):
    try:
        import intel_npu_acceleration_library
        # NOTE(review): this import shadows the builtin `compile`; kept because
        # it is part of the names this cell has always defined.
        from intel_npu_acceleration_library import compile

        device = torch.device("cpu")
    except (NameError, ImportError):
        device = torch.device(
            "cuda"
            if torch.cuda.is_available()
            else "mps" if torch.backends.mps.is_available() else "cpu"
        )

In [14]:
import os
# Ask the PyTorch CUDA caching allocator to use expandable segments.
# NOTE(review): this cell sits after the cell that imports torch; the setting
# presumably has to be in place before CUDA is first used - confirm ordering.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

In [18]:
import gymnasium as gym  # As a best practice, Gymnasium is usually imported as 'gym'
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import matplotlib
import matplotlib.pyplot as plt
from gymnasium.wrappers import RecordVideo
from collections import namedtuple, deque
from itertools import count
import ipywidgets as widgets
import warnings
import ale_py

# Silence all Python warnings for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical
# warnings - confirm this is intended.
warnings.filterwarnings('ignore')

# set up matplotlib
# The IPython display helper is only imported when the backend name contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Create the Space Invaders environment with grayscale observations and print its spec.
env = gym.make('ALE/SpaceInvaders-v5', obs_type='grayscale')
print(env.spec)

plt.ion() # put matplotlib in interactive mode so figures can be updated live
# Bare expression: the selected device is shown as the cell output.
device

EnvSpec(id='ALE/SpaceInvaders-v5', entry_point='ale_py.env:AtariEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=None, order_enforce=True, disable_env_checker=False, kwargs={'game': 'space_invaders', 'obs_type': 'grayscale', 'repeat_action_probability': 0.25, 'full_action_space': False, 'frameskip': 4, 'max_num_frames_per_episode': 108000}, namespace='ALE', name='SpaceInvaders', version=5, additional_wrappers=(), vector_entry_point=None)


device(type='privateuseone', index=0)

In [19]:
def plot_rewards(show_result=False):
    """Plot per-episode rewards and, once available, their 100-episode mean.

    Reads the module-level ``episode_rewards`` list and draws into
    matplotlib figure 1.

    Args:
        show_result: When False (during training) the figure is cleared and
            redrawn with the title 'Training...'. When True the final figure
            is drawn with the title 'Result' and left on screen.
    """
    plt.figure(1)
    rewards_t = torch.tensor(episode_rewards, dtype=torch.float)

    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')

    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.plot(rewards_t.numpy(), label='Episode Reward')

    # 100-episode moving average; the first 99 entries are padded with zeros
    # so the curve lines up with the episode axis.
    if len(rewards_t) >= 100:
        means = rewards_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy(), label='100 Episode Avg', linestyle='--')

    plt.legend()
    plt.pause(0.001)

    if is_ipython:
        display.display(plt.gcf())
        # Only queue a clear for intermediate frames. Queuing it for the final
        # figure as well would wipe the result as soon as the next output
        # arrives.
        if not show_result:
            display.clear_output(wait=True)


In [34]:
class DuelingDQN(nn.Module):
    """Convolutional dueling Q-network.

    A shared convolutional trunk feeds two fully connected heads: a scalar
    state value V(s) and a per-action advantage A(s, a). They are combined as
    Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).

    Args:
        n_observations: Input shape without the batch dimension,
            (channels, height, width).
        n_actions: Number of discrete actions (width of the advantage head).
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.conv = nn.Sequential(
            # Conv2d arguments: input channels, output channels, ...
            nn.Conv2d(n_observations[0], 32, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=2),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )

        # Probe the trunk once and reuse the flattened size for both heads.
        n_features = self.feature_size(n_observations)

        # Dueling heads: state value ...
        self.fc_value = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

        # ... and per-action advantage (one output per action).
        self.fc_advantage = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def feature_size(self, input_shape):
        """Return the flattened size of the conv trunk output for one sample.

        The probe runs in eval mode and without autograd so that the dummy
        all-zero input does not update the BatchNorm running statistics or
        build a graph. The previous train/eval mode is restored afterwards.

        Args:
            input_shape: (channels, height, width) of a single observation.

        Returns:
            Number of features produced by ``self.conv`` for one sample (int).
        """
        probe_device = next(self.conv.parameters()).device
        was_training = self.conv.training
        self.conv.eval()
        try:
            with torch.no_grad():
                probe = torch.zeros(1, *input_shape, device=probe_device)
                return self.conv(probe).flatten().size(0)
        finally:
            self.conv.train(was_training)

    def forward(self, x):
        """Compute Q-values for a batch of observations.

        Args:
            x: Tensor of shape (batch, channels, height, width); it is divided
                by 255 here before entering the conv trunk.

        Returns:
            Tensor of shape (batch, n_actions) with the Q-values.
        """
        x = x / 255.0
        x = self.conv(x).flatten(start_dim=1)

        # State value and action advantages
        value = self.fc_value(x)
        advantage = self.fc_advantage(x)

        # Subtract the mean advantage so V and A are identifiable.
        q_values = value + advantage - advantage.mean(1, keepdim=True)
        return q_values

# One replay-buffer entry: the observed state, the action taken, the state
# that followed, and the reward received.
Transition = namedtuple('Transition', 'state action next_state reward')

class PrioritizedReplayMemory:
    """Fixed-capacity replay buffer with proportional prioritized sampling.

    Transitions and their priorities live in two parallel deques bounded by
    ``capacity``; once full, the oldest entry is dropped on every push.
    Priorities are always stored as plain Python floats.

    Args:
        capacity: Maximum number of transitions kept.
        alpha: Exponent applied to priorities when turning them into sampling
            probabilities (0 gives uniform sampling).
    """

    # Lower bound applied to every stored priority so that no transition ends
    # up with zero sampling probability.
    _MIN_PRIORITY = 1e-5

    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.memory = deque([], maxlen=capacity)
        self.priorities = deque([], maxlen=capacity)
        self.alpha = alpha

    @staticmethod
    def _as_scalar(priority):
        """Coerce a float, NumPy scalar or 1-element array to a Python float."""
        return float(np.asarray(priority).reshape(-1)[0])

    def push(self, *args):
        """Store a transition, giving it the current maximum priority.

        Args:
            *args: Fields of ``Transition`` in order
                (state, action, next_state, reward).
        """
        # New experience starts at the highest known priority (1.0 when empty).
        max_priority = self._as_scalar(max(self.priorities, default=1.0))
        self.memory.append(Transition(*args))
        self.priorities.append(max(self._MIN_PRIORITY, max_priority))

    def sample(self, batch_size, beta=0.4):
        """Draw a prioritized batch (with replacement).

        Args:
            batch_size: Number of transitions to draw.
            beta: Exponent of the importance-sampling correction.

        Returns:
            Tuple ``(samples, weights, indices)``: the list of sampled
            transitions, a float32 tensor of shape (batch_size,) with
            importance weights normalized so the largest is 1 (created on the
            module-level ``device``), and the NumPy array of sampled indices.

        Raises:
            ValueError: If the buffer is empty or all priorities sum to zero.
        """
        if len(self.memory) == 0:
            raise ValueError("Memory is empty")

        try:
            # Fast path: homogeneous scalars (or uniformly shaped 1-element arrays).
            priorities = np.array(list(self.priorities), dtype=np.float32).reshape(-1)
        except ValueError:
            # Mixed scalars and 1-element arrays, e.g. from an older checkpoint.
            priorities = np.array(
                [self._as_scalar(p) for p in self.priorities], dtype=np.float32
            )

        if priorities.sum() == 0:
            raise ValueError("All priorities are zero.")

        # Turn priorities into sampling probabilities.
        probabilities = priorities ** self.alpha
        probabilities /= probabilities.sum()

        # Sample indices according to those probabilities.
        indices = np.random.choice(len(self.memory), batch_size, p=probabilities)
        samples = [self.memory[idx] for idx in indices]

        # Importance-sampling weights, normalized by the largest weight.
        total = len(self.memory)
        weights = (total * probabilities[indices]) ** (-beta)
        weights /= weights.max()
        weights = torch.tensor(weights, dtype=torch.float32, device=device)

        return samples, weights, indices

    def update_priorities(self, indices, priorities):
        """Overwrite the priorities of previously sampled transitions.

        Args:
            indices: Buffer indices as returned by ``sample``.
            priorities: New priorities, one per index. Any array shape is
                accepted (for example (batch,) or (batch, 1)); it is flattened
                before use.
        """
        flat = np.asarray(priorities, dtype=np.float64).reshape(-1)
        for idx, priority in zip(indices, flat):
            # Store a plain float, floored at the minimum priority.
            self.priorities[idx] = max(self._MIN_PRIORITY, float(priority))

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.memory)

def optimize_model():
    """Run one Double-DQN optimization step on a prioritized batch.

    Uses the module-level ``memory``, ``policy_net``, ``target_net``,
    ``optimizer``, ``device``, ``BATCH_SIZE`` and ``GAMMA``. Returns without
    doing anything while the buffer holds fewer than ``BATCH_SIZE``
    transitions. Besides the gradient step, the priorities of the sampled
    transitions are refreshed with their absolute TD errors.
    """
    if len(memory) < BATCH_SIZE:
        return

    samples, weights, indices = memory.sample(BATCH_SIZE, beta=0.4)
    batch = Transition(*zip(*samples))

    # Terminal transitions carry next_state=None; keep only the real ones.
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state], device=device, dtype=torch.bool
    )
    non_final_next = [s for s in batch.next_state if s is not None]

    # Stack the per-transition tensors along the batch dimension.
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)

    # Q(s, a) for the actions actually taken, shape (BATCH_SIZE,).
    state_action_values = policy_net(state_batch).gather(1, action_batch).squeeze(1)

    # Value of the next state; stays 0 for terminal transitions.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next:  # torch.cat([]) would raise if every sample were terminal
        non_final_next_states = torch.cat(non_final_next).to(device)
        with torch.no_grad():
            # Double DQN: the policy net picks the action, the target net scores it.
            next_actions = policy_net(non_final_next_states).max(1).indices.unsqueeze(1)
            next_state_values[non_final_mask] = (
                target_net(non_final_next_states).gather(1, next_actions).squeeze(1)
            )

    # Bootstrapped target, shape (BATCH_SIZE,).
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # New priorities: absolute TD error per sample, floored at 1e-5.
    td_errors = (state_action_values - expected_state_action_values).abs().detach()
    td_errors = np.maximum(td_errors.cpu().numpy(), 1e-5)
    memory.update_priorities(indices, td_errors)

    # Huber loss per sample, scaled by the importance weights. Every tensor is
    # shape (BATCH_SIZE,) here so the product is element-wise; multiplying a
    # (B,) weight vector with a (B, 1) loss would broadcast to (B, B).
    criterion = nn.SmoothL1Loss(reduction='none')
    loss = criterion(state_action_values, expected_state_action_values)
    loss = (weights * loss).mean()

    # Gradient step with value clipping.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()


def select_action(state):
    """Choose an action epsilon-greedily and advance the global step counter.

    Epsilon decays exponentially from ``EPS_START`` towards ``EPS_END`` as
    ``steps_done`` grows, with ``EPS_DECAY`` as the time constant.

    Args:
        state: Observation tensor that ``policy_net`` accepts.

    Returns:
        A (1, 1) long tensor on ``device`` holding the chosen action index.
    """
    global steps_done
    roll = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    epsilon = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    state = state.to(device)

    # Explore: with probability epsilon take a uniformly random action.
    if not roll > epsilon:
        random_action = env.action_space.sample()
        return torch.tensor([[random_action]], device=device, dtype=torch.long)

    # Exploit: take the action with the largest predicted Q-value.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.max(1).indices.view(1, 1)


In [35]:
import torchvision.transforms as T

# Frame preprocessing pipeline: convert to a PIL image, resize to 84x84,
# then convert back to a tensor.
resize = T.Compose([
    T.ToPILImage(),
    T.Resize((84, 84)),  # resize
    T.ToTensor()         # convert to tensor
])

def preprocess_state(state):
    """Resize one grayscale frame to 84x84 and return it as a batched tensor.

    The frame is handed to the ``resize`` pipeline as a uint8 array.
    ``ToPILImage`` multiplies floating-point tensors by 255 before casting
    them to bytes, so passing a float tensor that already holds 0..255 pixel
    values corrupts the image. ``ToTensor`` returns values in [0, 1]; they are
    scaled back to 0..255 because ``DuelingDQN.forward`` divides its input by
    255.

    Args:
        state: 2-D array-like of pixel intensities in 0..255 (height, width).

    Returns:
        float32 CPU tensor of shape (1, 1, 84, 84) with values in 0..255.
    """
    frame = np.asarray(state, dtype=np.uint8)
    frame = resize(frame)        # (1, 84, 84), float32 in [0, 1]
    frame = frame * 255.0        # back to the 0..255 range the network expects
    return frame.unsqueeze(0)    # add the batch dimension -> (1, 1, 84, 84)

In [36]:
def save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards, path="models"):
    """Write a training checkpoint to ``<path>/policy_net_episode_<episode>.pth``.

    Args:
        episode: Episode number; stored and used in the file name.
        policy_net: Network whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        steps_done: Exploration step counter to store.
        memory: Replay buffer; its ``memory`` and ``priorities`` sequences are
            stored as lists.
        episode_rewards: Per-episode reward history to store.
        path: Target directory; created (including parents) if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)
    save_path = os.path.join(path, f"policy_net_episode_{episode}.pth")
    torch.save({
        'episode': episode,
        'model_state_dict': policy_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'steps_done': steps_done,  # exploration progress
        'memory': list(memory.memory),  # stored transitions
        'priorities': list(memory.priorities),  # their sampling priorities
        'episode_rewards': episode_rewards
    }, save_path)
    print(f"Model saved to {save_path}")


def load_model(policy_net, optimizer, path):
    """Restore a checkpoint written by ``save_model``.

    Loads the network and optimizer state in place and, when the checkpoint
    contains replay data, replaces the contents of the module-level ``memory``
    buffer.

    Args:
        policy_net: Network that receives the saved weights.
        optimizer: Optimizer that receives the saved state.
        path: Checkpoint file to read.

    Returns:
        Tuple ``(episode, steps_done, episode_rewards)`` from the checkpoint;
        ``steps_done`` defaults to 0 and ``episode_rewards`` to ``[]`` when
        the checkpoint does not contain them.
    """
    # The checkpoint holds arbitrary pickled objects (namedtuple transitions,
    # priority scalars), which the restricted weights-only loader rejects, so
    # full unpickling is requested explicitly.
    # SECURITY: full unpickling can execute arbitrary code - only load
    # checkpoint files you created yourself.
    checkpoint = torch.load(path, weights_only=False)

    # Restore model weights and optimizer state.
    policy_net.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Restore episode number and exploration progress.
    episode = checkpoint['episode']
    steps_done = checkpoint.get('steps_done', 0)

    # Restore the replay buffer (only when both parts are present and non-empty).
    memory_data = checkpoint.get('memory', [])
    priority_data = checkpoint.get('priorities', [])
    if memory_data and priority_data:
        memory.memory = deque(memory_data, maxlen=memory.capacity)
        memory.priorities = deque(priority_data, maxlen=memory.capacity)

    # Restore the per-episode reward history.
    episode_rewards = checkpoint.get('episode_rewards', [])

    print(f"Model loaded from {path}, starting from episode {episode}, steps_done={steps_done}")
    return episode, steps_done, episode_rewards




In [37]:
# Training hyperparameters
BATCH_SIZE = 64  # transitions per optimization step
GAMMA = 0.99  # discount factor used for the bootstrapped target
EPS_START = 1.0  # initial exploration rate
EPS_END = 0.1  # exploration rate the decay converges to
EPS_DECAY = 10000 # larger values keep exploration going longer -> more of the environment is explored
TAU = 0.005  # interpolation factor of the soft target-network update
LR = 1e-4  # Adam learning rate
memory_capacity = 50000 # larger values let the agent store more experience

n_actions = env.action_space.n # size of the environment's discrete action space

# (channels, height, width): 4 stacked 84x84 frames
n_observations = (4, 84, 84)

# Input tensors are laid out as (batch size, channels, height, width)
policy_net = DuelingDQN(n_observations, n_actions).to(device)
target_net = DuelingDQN(n_observations, n_actions).to(device)
# The target network starts as an exact copy of the policy network and stays in eval mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR, amsgrad=True)
memory = PrioritizedReplayMemory(memory_capacity) # capacity comes from memory_capacity above

# Number of epsilon-greedy action selections made so far (drives the epsilon decay).
steps_done = 0

# Total reward of every finished episode, in order.
episode_rewards = []

# NOTE(review): only CUDA/MPS are checked here, so any other accelerator
# (e.g. a DirectML device) falls into the 50-episode branch - confirm intended.
if torch.cuda.is_available() or torch.backends.mps.is_available():
    num_episodes = 600
else:
    num_episodes = 50

In [41]:
# Resume training from a saved checkpoint.
checkpoint_path = "models/policy_net_episode_4.pth"
start_episode, steps_done, episode_rewards = load_model(policy_net, optimizer, checkpoint_path)
# load_model only restores policy_net. Without this re-sync the target network
# would keep the weights it copied before the checkpoint was loaded.
target_net.load_state_dict(policy_net.state_dict());

Model loaded from models/policy_net_episode_4.pth, starting from episode 4, steps_done=0


In [42]:
# Continue from the checkpointed episode when the resume cell has been run
# (it defines start_episode); otherwise start at episode 0.
first_episode = globals().get("start_episode", 0)
episode = first_episode  # keeps the name bound for the interrupt handler below
try:
    for episode in range(first_episode, num_episodes + 1):
        state, info = env.reset()
        # Fill the 4-frame stack with copies of the first frame.
        state_stack = deque([preprocess_state(state)] * 4, maxlen=4)
        state = torch.cat(list(state_stack), dim=1)

        episode_reward = 0
        action_counts = np.zeros(env.action_space.n)

        for t in count():  # runs until the episode ends
            if episode < 10:  # episodes 0-9 always act randomly
                action = torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
            else:
                action = select_action(state)

            # Index with a plain int: the action tensor may live on the accelerator.
            action_idx = action.item()
            action_counts[action_idx] += 1

            # Per the printed env spec (frameskip=4) one step repeats the
            # action over 4 emulator frames.
            next_state, reward, terminated, truncated, _ = env.step(action_idx)
            reward = torch.tensor([reward * 10], device=device)
            episode_reward += reward.item()
            done = terminated or truncated

            if not done:
                next_state = preprocess_state(next_state)
                state_stack.append(next_state)
                next_state = torch.cat(list(state_stack), dim=1)
            else:
                next_state = None

            memory.push(state, action, next_state, reward)
            state = next_state

            # Train the policy network every 4 agent steps.
            if t % 4 == 0:
                optimize_model()

            # Soft-update the target network every 8 agent steps:
            # target <- TAU * policy + (1 - TAU) * target
            if t % 8 == 0:
                target_net_state_dict = target_net.state_dict()
                policy_net_state_dict = policy_net.state_dict()
                for key in policy_net_state_dict:
                    target_net_state_dict[key] = policy_net_state_dict[key] * TAU + target_net_state_dict[key] * (1-TAU)
                target_net.load_state_dict(target_net_state_dict)

            if done:
                episode_rewards.append(episode_reward)
                break

        if episode % 100 == 0:
            save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards)

        if episode % 5 == 0:
            # Print the share of each action taken in this episode.
            action_distribution = action_counts / action_counts.sum()
            print(f"Action distribution: {action_distribution}")
            plot_rewards()

    print('Complete')
    plot_rewards(show_result=True)
    plt.ioff()
    plt.show()

except KeyboardInterrupt:
    # Save progress so the run can be resumed from the interrupted episode.
    print("Training interrupted. Saving the current model...")
    save_model(episode, policy_net, optimizer, steps_done, memory, episode_rewards)
    env.close()
    print("Model saved. Exiting.")

Training interrupted. Saving the current model...
Model saved to models/policy_net_episode_5.pth
Model saved. Exiting.


<Figure size 640x480 with 0 Axes>