In [11]:
import gymnasium as gym
import ale_py
gym.register_envs(ale_py)
from IPython import display
from gymnasium.wrappers import RecordVideo
import ipywidgets as widgets
import warnings
from itertools import count
import torch

from torch import nn
from torch.nn import functional as F
from torch import optim
from collections import deque, namedtuple
import random
from torchvision.transforms import ToTensor
import math
import numpy as np
import intel_npu_acceleration_library
from intel_npu_acceleration_library import compile
try:
    # import torch_directml
    global device
    device = torch_directml.device()
except (NameError, ModuleNotFoundError):
    import intel_npu_acceleration_library
    from intel_npu_acceleration_library import compile
    device = torch.device("cpu")
except (NameError, ModuleNotFoundError):
    device = torch.device(
            "cuda" if torch.cuda.is_available() else
            "mps" if torch.backends.mps.is_available() else
            "cpu"
        )

warnings.filterwarnings("ignore")
device


device(type='cpu')

In [19]:
env = gym.make('ALE/Skiing-v5', render_mode='rgb_array', obs_type='grayscale')
video_env = RecordVideo(env, video_folder="./videos", disable_logger=True)
done = False

state, info = video_env.reset() # state: (210, 160, 3)
print(state.shape)
# Initialize the environment and get its state
while not done:
    # convert state to tensor
    observation, reward, terminated, truncated, _ = video_env.step(video_env.action_space.sample())
    done = terminated or truncated

video_env.close()
# Render recording
widgets.Video.from_file(
    f"./videos/rl-video-episode-0.mp4", autoplay=False, loop=False, width=700
)


(210, 160)


Video(value=b'\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00\x00\x00\x08free...', autoplay='False',…

- obs: array(210, 160, 3)
    + obs dataset: array(n, 210, 160, 3)
- reward: float

설계: 에피소드를 플레이해 기억 버퍼에 저장, 4개씩 임의로 뽑아 상태로 활용

In [3]:
Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))
# Transition이란 이름을 일종의 구조체로

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
class DQN(nn.Module):
    def __init__(self, n_actions):
        super().__init__()
        self.fc=nn.Sequential(
            nn.AdaptiveAvgPool2d(1), 
            nn.Flatten(),
            nn.Linear(64, n_actions)
            )
        self.conv = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(5),  # (42, 32)
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
        )
    def forward(self, x):
        conv0 = self.conv(x)
        conv0_result = self.fc(conv0)
        return conv0_result
    

In [5]:
# BATCH_SIZE is the number of transitions sampled from the replay buffer
# GAMMA is the discount factor as mentioned in the previous section
# EPS_START is the starting value of epsilon
# EPS_END is the final value of epsilon
# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay
# TAU is the update rate of the target network
# LR is the learning rate of the ``AdamW`` optimizer
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4

# Get number of actions from gym action space
n_actions = env.action_space.n
# Get the number of state observations
state, info = env.reset()
n_observations = state.shape

# try:
#     policy_net = intel_npu_acceleration_library.compile(DQN(n_actions), dtype=torch.float32)
#     target_net = intel_npu_acceleration_library.compile(DQN(n_actions), dtype=torch.float32)
# except NameError:
#     policy_net = DQN(n_actions).to(device)
#     target_net = DQN(n_actions).to(device)
policy_net = intel_npu_acceleration_library.compile(DQN(n_actions), dtype=torch.float32)
target_net = intel_npu_acceleration_library.compile(DQN(n_actions), dtype=torch.float32)
# 처음에는 파라미터가 완전히 같게 시작

target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)


steps_done = 0

In [6]:
# module test
# convert state to tensor
test_state = []
test_action = [0, 3, 1, 3]
for a in test_action:
    test_state.append(env.step(a)[0])
test_state = np.array(test_state, dtype=np.float64) # (4, 210, 160, 3)
test_state = test_state.transpose(0, 3, 1, 2) # (4, 3, 210, 160)
test_state = torch.Tensor(test_state)
test_action = torch.tensor(test_action).unsqueeze(1)
test_state_value = policy_net(test_state).gather(1, test_action)
test_state_value


tensor([[-6.9235],
        [-2.8290],
        [ 6.6807],
        [-2.8290]], grad_fn=<GatherBackward0>)

In [7]:
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
        -1.0 * steps_done / EPS_DECAY
    )
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return the largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            return policy_net(state).max(1).indices.view(1, 1)
    else:
        return torch.tensor(
            [[env.action_space.sample()]], device=device, dtype=torch.long
        )
select_action(test_state[0])


tensor([[14]])

In [8]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE) # (BATCH_SIZE, 4)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    
    batch = Transition(*zip(*transitions)) # Transition(*zip(*(t1, t2,...))) == Transition(*zip((s, a, r, s), (s, a, r, s), ...))
    # == Transition(list[s], list[a], list[r], list[s'])
    # batch.next_state = (s'1, s'2, ...) => expected shape = (BATCH_SIZE, 3, 210, 160)

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    
    non_final_mask = torch.tensor(
        tuple(map(lambda s: -1 not in s, batch.next_state)),
        device=device,
        dtype=torch.bool,
    )
    non_final_next_states = torch.cat([s for s in batch.next_state if -1 not in s])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    # (BATCH_SIZE, 18).gather(1, actions) => 경험했던 각 상태에서 행동의 가치를 (BATCH_SIZE, 1) 크기의 텐서로 반환
    
    state_action_values = policy_net(state_batch).gather(1, action_batch) 
    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1).values
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = (
            target_net(non_final_next_states).max(1).values
        )
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

In [9]:
# Initialize the environment and get its state
state, info = env.reset() # state: (210, 160, 3)
# convert state to tensor
state = np.array(state).transpose(2, 0, 1) # (3, 210, 160)
state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) #(1, 3, 210, 160)
action = select_action(state)
action

tensor([[1]])

In [None]:
import torchvision.transforms as T
num_episodes = 10


for i_episode in range(num_episodes):
    # Initialize the environment and get its state
    state, info = env.reset() # state: (210, 160, 3)
    # convert state to tensor
    transform = T.Compose(T.Grayscale(), T.Resize((84, 84)))
    transform(state.transpose(2, 0, 1))
    state = torch.tensor(state.transpose(2, 0, 1), dtype=torch.float32, device=device).unsqueeze(0) # (210, 160, 3) -> (3, 210, 160) -> (1, 3, 210, 160)
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)
        done = terminated or truncated
        if terminated:
            print("terminated")
            next_state = np.ones(state.shape) * -1
        else:
            next_state = torch.tensor(
                observation.transpose(2, 0, 1), dtype=torch.float32, device=device
            ).unsqueeze(0) #(210, 160, 3) -> (3, 210, 160) -> (1, 3, 210, 160)
        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[
                key
            ] * TAU + target_net_state_dict[key] * (1 - TAU)
        target_net.load_state_dict(target_net_state_dict)
        print(f"learning {t}th state...")
        if done:
            print(f"episode {i_episode} completed!")
            break

print("Complete")

learning 0th state...
learning 1th state...
learning 2th state...
learning 3th state...
learning 4th state...
learning 5th state...
learning 6th state...
learning 7th state...
learning 8th state...
learning 9th state...
learning 10th state...
learning 11th state...
learning 12th state...
learning 13th state...
learning 14th state...
learning 15th state...
learning 16th state...
learning 17th state...
learning 18th state...
learning 19th state...
learning 20th state...
learning 21th state...
learning 22th state...
learning 23th state...
learning 24th state...
learning 25th state...
learning 26th state...
learning 27th state...
learning 28th state...
learning 29th state...
learning 30th state...
learning 31th state...
learning 32th state...
learning 33th state...
learning 34th state...
learning 35th state...
learning 36th state...
learning 37th state...
learning 38th state...
learning 39th state...
learning 40th state...
learning 41th state...
learning 42th state...
learning 43th state..

KeyboardInterrupt: 

In [None]:
# Initialize environment
base_env = gym.make('ALE/IceHockey-v5', render_mode='rgb_array')
video_env = RecordVideo(base_env, video_folder="./videos", disable_logger=True)
done = False

# Initialize the environment and get its state
while not done:
    state, info = video_env.reset() # state: (210, 160, 3)
    # convert state to tensor
    state = torch.tensor(state.transpose(2, 0, 1), dtype=torch.float32, device=device).unsqueeze(0) # (210, 160, 3) -> (3, 210, 160) -> (1, 3, 210, 160)
    action = select_action(state)
    observation, reward, terminated, truncated, _ = video_env.step(action.item())
    done = terminated or truncated


# Render recording
widgets.Video.from_file(
    f"./videos/rl-video-episode-0.mp4", autoplay=False, loop=False, width=700
)
