In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
from gym.wrappers import FrameStack
from torchvision import transforms as T
from gym.spaces import Box
import math

In [2]:
class NoisyLinear(nn.Module):
    """Linear layer with learnable Gaussian perturbations on weights and bias.

    In training mode the effective parameters are ``mu + sigma * epsilon``,
    where ``epsilon`` is a noise buffer refreshed by :meth:`reset_noise`.
    In eval mode only the ``mu`` parameters are used.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        sigma_init: Numerator of the constant used to initialise the sigmas.
    """

    def __init__(self, in_features, out_features, sigma_init=0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigma_init = sigma_init

        weight_shape = (out_features, in_features)
        bias_shape = (out_features,)

        # Registration order (mu, sigma, epsilon) is kept stable so that
        # parameter iteration order and state_dict layout do not change.
        self.weight_mu = nn.Parameter(torch.empty(*weight_shape))
        self.weight_sigma = nn.Parameter(torch.empty(*weight_shape))
        self.register_buffer("weight_epsilon", torch.empty(*weight_shape))

        self.bias_mu = nn.Parameter(torch.empty(*bias_shape))
        self.bias_sigma = nn.Parameter(torch.empty(*bias_shape))
        self.register_buffer("bias_epsilon", torch.empty(*bias_shape))

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        """Initialise the means uniformly and the sigmas to constants."""
        bound = 1 / math.sqrt(self.in_features)
        weight_sigma_value = self.sigma_init / math.sqrt(self.in_features)
        bias_sigma_value = self.sigma_init / math.sqrt(self.out_features)
        with torch.no_grad():
            self.weight_mu.uniform_(-bound, bound)
            self.weight_sigma.fill_(weight_sigma_value)
            self.bias_mu.uniform_(-bound, bound)
            self.bias_sigma.fill_(bias_sigma_value)

    def reset_noise(self):
        """Draw fresh standard-normal noise for every weight and bias entry."""
        for noise in (self.weight_epsilon, self.bias_epsilon):
            noise.normal_()

    def forward(self, x):
        """Apply the layer: noisy parameters when training, means otherwise."""
        if not self.training:
            return F.linear(x, self.weight_mu, self.bias_mu)

        noisy_weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
        noisy_bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        return F.linear(x, noisy_weight, noisy_bias)

In [3]:
class QNet(nn.Module):
    """Dueling Q-network: a convolutional trunk feeding noisy value/advantage heads.

    The trunk takes a 4-channel input. The heads expect 3136 flattened
    features, which is what the three convolutions produce for 84x84 frames
    (84 -> 20 -> 9 -> 7 per side, 64 * 7 * 7 = 3136).

    Args:
        n_actions: Number of discrete actions, i.e. the width of the output.
    """

    def __init__(self, n_actions):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.conv1 = nn.Sequential(nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU())
        self.conv2 = nn.Sequential(nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU())
        self.conv3 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU())

        # State-value stream: one scalar per sample.
        self.fc_val = nn.Sequential(NoisyLinear(3136, 512), nn.ReLU(), NoisyLinear(512, 1))
        # Advantage stream: one value per action.
        self.fc_adv = nn.Sequential(NoisyLinear(3136, 512), nn.ReLU(), NoisyLinear(512, n_actions))

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions) for a batch of inputs."""
        features = x
        for conv_block in (self.conv1, self.conv2, self.conv3):
            features = conv_block(features)
        features = features.view(features.size(0), -1)

        value = self.fc_val(features)
        advantage = self.fc_adv(features)
        # Centre the advantages per sample before adding the state value.
        centred_advantage = advantage - advantage.mean(dim=1, keepdim=True)
        return value + centred_advantage

    def reset_noise(self):
        """Resample the noise of every NoisyLinear layer in the network."""
        noisy_layers = (m for m in self.modules() if isinstance(m, NoisyLinear))
        for layer in noisy_layers:
            layer.reset_noise()

class ReplayBuffer:
    """Prioritized replay memory holding n-step transitions.

    Raw one-step transitions are collected in a sliding window of length
    ``n_step``. Each stored entry is ``(state, action, reward, next_state,
    done)`` where ``reward`` is the discounted sum over the window and
    ``next_state`` / ``done`` come from the last step that was folded in.
    Entries are sampled with probability proportional to
    ``priority ** alpha``.

    Args:
        capacity: Maximum number of stored transitions (ring buffer size).
        n_step: Number of steps folded into one stored transition.
        gamma: Discount factor used inside the n-step return.
        alpha: Exponent applied to priorities when sampling.
    """

    def __init__(self, capacity, n_step=3, gamma=0.99, alpha=0.5):
        self.capacity = capacity
        # A list used as a ring buffer: sampling indexes arbitrary positions,
        # which is O(1) on a list but O(n) in the middle of a deque.
        self.buffer = []

        self.gamma = gamma
        self.n_step = n_step
        self.n_step_buffer = deque(maxlen=n_step)

        self.pos = 0  # next slot to write in the ring buffer
        self.priorities = np.zeros((capacity,), dtype=np.float32)
        self.alpha = alpha

    def add(self, state, action, reward, next_state, done, episode_end=False):
        """Record one environment step and store any n-step transition it completes.

        Args:
            state: Observation before the step (converted with ``np.array``).
            action: Integer action index.
            reward: Scalar reward for the step.
            next_state: Observation after the step.
            done: Truthy if the step ended the episode.
            episode_end: Set to True to flush the window even if ``done`` is
                false.
        """
        state = torch.tensor(np.array(state).copy(), dtype=torch.float32)
        next_state = torch.tensor(np.array(next_state).copy(), dtype=torch.float32)
        action = torch.tensor([action], dtype=torch.int64)
        reward = torch.tensor([reward], dtype=torch.float32)
        done = torch.tensor([done], dtype=torch.float32)

        self.n_step_buffer.append((state, action, reward, next_state, done))

        if not (episode_end or done.item() == 1):
            # Mid-episode: emit the oldest transition once the window is full.
            # The deque's maxlen drops it on the next append.
            if len(self.n_step_buffer) == self.n_step:
                self._store(self._get_n_step_info())
            return

        # Episode finished: emit every pending transition exactly once, each
        # with the (shorter) return that remains up to the final step. This
        # also covers episodes shorter than n_step.
        while self.n_step_buffer:
            self._store(self._get_n_step_info())
            self.n_step_buffer.popleft()

    def _store(self, transition):
        """Write one transition into the ring buffer with maximal priority."""
        # New entries get the current maximum priority (1.0 for the very first).
        max_priority = self.priorities.max() if self.buffer else 1.0
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition

        self.priorities[self.pos] = max_priority
        self.pos = (self.pos + 1) % self.capacity

    def _get_n_step_info(self):
        """Fold the current window into one transition starting at its oldest step.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` where reward is
            ``sum(gamma ** i * r_i)``, truncated at the first terminal step.
        """
        reward, next_state, done = 0.0, None, None
        for idx, (_, _, r, next_s, d) in enumerate(self.n_step_buffer):
            reward = reward + (self.gamma ** idx) * r
            next_state = next_s
            done = d
            if d.item() == 1:
                break
        state, action, _, _, _ = self.n_step_buffer[0]
        return state, action, reward, next_state, done

    def sample(self, batch_size, beta=0.4):
        """Draw a prioritized batch with importance-sampling weights.

        Args:
            batch_size: Number of transitions to draw (with replacement).
            beta: Exponent of the importance-sampling correction.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices,
            weights)``; the first five are stacked tensors, ``indices`` is a
            numpy array of buffer positions and ``weights`` a float32 tensor
            normalised so that its maximum is 1.

        Raises:
            ValueError: If the buffer is empty.
        """
        size = len(self.buffer)
        if size == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        # Normalise in float64 so the probabilities sum to 1 within the
        # tolerance np.random.choice enforces, even for large buffers.
        scaled = self.priorities[:size].astype(np.float64) ** self.alpha
        probs = scaled / scaled.sum()

        indices = np.random.choice(size, batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        weights = (size * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = torch.tensor(weights, dtype=torch.float32)

        states, actions, rewards, next_states, dones = map(torch.stack, zip(*samples))
        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, indices, priorities):
        """Overwrite the priorities at ``indices`` with the given values."""
        for idx, priority in zip(indices, priorities):
            self.priorities[idx] = priority

class DQNVariant:
    """Double-DQN agent with noisy dueling networks and prioritized n-step replay.

    Exploration comes from the NoisyLinear layers of the online network, so
    action selection is always greedy with respect to the current noise sample.

    Args:
        action_size: Number of discrete actions.
    """

    def __init__(self, action_size):
        self.action_size = action_size
        self.gamma = 0.99
        self.batch_size = 64
        self.learn_start = 10000  # minimum stored transitions before learning
        self.update_count = 0
        self.tau = 0.005  # soft target-update rate

        self.testing = False

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.q_net = QNet(action_size).to(self.device)
        self.target_net = QNet(action_size).to(self.device)
        # Start with the target as an exact copy of the online network.
        self.update(learning=1.0)
        for p in self.target_net.parameters():
            p.requires_grad = False

        self.optimizer = optim.Adam(self.q_net.parameters(), lr=0.00025)
        # Share the agent's discount with the buffer so both stay in agreement.
        self.replay_buffer = ReplayBuffer(100000, gamma=self.gamma)

    def get_action(self, state):
        """Return the greedy action index for ``state``.

        Outside of testing the online network's noise is resampled first.
        """
        if not self.testing:
            self.q_net.reset_noise()

        with torch.no_grad():
            obs = torch.tensor(np.array(state).copy(), dtype=torch.float32)
            obs = obs.unsqueeze(0).to(self.device)
            q_values = self.q_net(obs)
            return torch.argmax(q_values).item()

    def update(self, learning):
        """Blend online parameters into the target network.

        Args:
            learning: Interpolation weight; 1.0 copies the online network,
                smaller values perform a soft (Polyak) update.
        """
        for target_param, param in zip(self.target_net.parameters(), self.q_net.parameters()):
            target_param.data.copy_(learning * param.data + (1 - learning) * target_param.data)

    def train(self):
        """Run one optimisation step on a prioritized batch, if enough data exists."""
        if len(self.replay_buffer.buffer) < self.learn_start:
            return

        # Anneal the importance-sampling exponent from 0.4 towards 1.0.
        beta = min(0.4 + (self.update_count / 2e6), 1.0)
        batch = self.replay_buffer.sample(self.batch_size, beta)
        states, actions, rewards, next_states, dones, indices, weights = batch

        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)
        weights = weights.to(self.device)

        self.q_net.reset_noise()
        self.target_net.reset_noise()

        q_values = self.q_net(states).gather(1, actions)

        # Stored rewards already cover n_step discounted steps, so the
        # bootstrap value must be discounted by gamma ** n_step. Transitions
        # truncated by a terminal step have done == 1 and do not bootstrap.
        bootstrap_discount = self.gamma ** self.replay_buffer.n_step
        with torch.no_grad():
            # Double DQN: the online net picks the action, the target net rates it.
            next_actions = self.q_net(next_states).argmax(1, keepdim=True)
            next_q_values = self.target_net(next_states).gather(1, next_actions)
            target_q = rewards + (1 - dones) * bootstrap_discount * next_q_values

        td_errors = (q_values - target_q).detach().squeeze(1)
        loss = F.smooth_l1_loss(q_values, target_q, reduction='none').squeeze(1)
        loss = (weights * loss).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Small constant keeps every sampled transition reachable.
        self.replay_buffer.update_priorities(indices, td_errors.abs().cpu().numpy() + 1e-6)

        self.update_count += 1
        self.update(learning=self.tau)

In [4]:
class SkipFrame(gym.Wrapper):
    """Repeat each action for up to ``skip`` frames and return the summed reward."""

    def __init__(self, env, skip):
        super().__init__(env)
        self.skip = skip

    def step(self, action):
        """Apply ``action`` repeatedly, stopping early once the env reports done.

        Returns the observation, done flag and info of the last inner step,
        together with the reward accumulated over all inner steps.
        """
        reward_sum = 0.0
        for _ in range(self.skip):
            step_result = self.env.step(action)
            obs, frame_reward, done, info = step_result
            reward_sum += frame_reward
            if done:
                break
        return obs, reward_sum, done, info

# class GrayScaleObservation(gym.ObservationWrapper):
#     def __init__(self, env):
#         super().__init__(env)
#         obs_shape = self.observation_space.shape[:2]
#         self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

#     def permute_orientation(self, observation):
#         # permute [H, W, C] array to [C, H, W] tensor
#         observation = np.transpose(observation, (2, 0, 1))
#         observation = torch.tensor(observation.copy(), dtype=torch.float)
#         return observation

#     def observation(self, observation):
#         observation = self.permute_orientation(observation)
#         transform = T.Grayscale()
#         observation = transform(observation)
#         return observation


# class ResizeObservation(gym.ObservationWrapper):
#     def __init__(self, env, shape):
#         super().__init__(env)
#         if isinstance(shape, int):
#             self.shape = (shape, shape)
#         else:
#             self.shape = tuple(shape)

#         obs_shape = self.shape + self.observation_space.shape[2:]
#         self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

#     def observation(self, observation):
#         transforms = T.Compose(
#             [T.Resize(self.shape, antialias=True), T.Normalize(0, 255)]
#         )
#         observation = transforms(observation).squeeze(0)
#         return observation

class TransformObservation(gym.ObservationWrapper):
    """Convert frames to grayscale, resize them and scale them by 1/255.

    Args:
        env: Environment to wrap; its observations are indexed as [H, W, C]
            by :meth:`observation`.
        shape: Target size, either an int (square) or an ``(H, W)`` pair.
    """

    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = (shape, shape) if isinstance(shape, int) else tuple(shape)
        # Describe what observation() actually emits: one (H, W) float frame
        # (the channel axis is squeezed away) scaled by 1/255.
        self.observation_space = Box(
            low=0.0, high=1.0,
            shape=self.shape,
            dtype=np.float32
        )
        self.transform = T.Compose([
            T.Grayscale(),
            T.Resize(self.shape, antialias=True),
            T.Normalize(0, 255)  # mean 0, std 255 -> divides by 255
        ])

    def observation(self, observation):
        """Return the processed frame as a float tensor of shape ``self.shape``."""
        # Permute [H, W, C] to [C, H, W], the layout the transforms expect.
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        observation = self.transform(observation).squeeze(0)
        return observation

In [5]:
import time
import cv2
def test_agent(agent):
    """Play one rendered episode with ``agent`` and print the result.

    The agent is switched to testing/eval mode for the rollout and always
    switched back to training mode afterwards, even if the rollout is
    interrupted. The environment is always closed.

    Args:
        agent: Agent exposing ``get_action``, ``testing`` and ``q_net``.

    Prints:
        The total (unclipped) reward and the number of agent steps.
    """
    sim_env = gym_super_mario_bros.make('SuperMarioBros-v0')
    sim_env = JoypadSpace(sim_env, COMPLEX_MOVEMENT)
    sim_env = SkipFrame(sim_env, skip=4)
    sim_env = TransformObservation(sim_env, shape=84)
    sim_env = FrameStack(sim_env, num_stack=4)

    agent.testing = True
    agent.q_net.eval()

    total_reward = 0
    step = 0
    try:
        state = sim_env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            state, reward, done, _ = sim_env.step(action)

            total_reward += reward
            step += 1
            sim_env.render()

            # Slow the loop down so the rendered game is watchable.
            time.sleep(0.02)

        print(total_reward, step)
    finally:
        # Restore training mode and release the emulator window no matter how
        # the rollout ended (normal finish, error or KeyboardInterrupt).
        agent.testing = False
        agent.q_net.train()
        sim_env.close()
    return

In [None]:
def train_agent(checkpoint_path="4th/dqn_agent_1770.pth", num_episodes=3000, save_every=10):
    """Train a DQNVariant agent on SuperMarioBros-v0 and save periodic checkpoints.

    Args:
        checkpoint_path: File with ``'q_net'`` and ``'target_net'`` state
            dicts to resume from. Pass ``None`` to start from fresh weights.
        num_episodes: Number of training episodes to run.
        save_every: Interval, in episodes, between average-reward reports and
            checkpoint saves (written to ``dqn_agent_<episode>.pth``).

    Each episode ends when the environment reports done or when a life is
    lost. Rewards are clipped to [-1, 1] before being stored for learning,
    while the printed totals use the unclipped rewards.
    """
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    env = SkipFrame(env, skip=4)
    env = TransformObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)

    try:
        agent = DQNVariant(env.action_space.n)
        agent.testing = False
        agent.q_net.train()

        if checkpoint_path is not None:
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            checkpoint = torch.load(checkpoint_path, map_location=agent.device)
            agent.q_net.load_state_dict(checkpoint['q_net'])
            agent.target_net.load_state_dict(checkpoint['target_net'])

        reward_history = []
        # Losing a life is treated as terminal.
        # assumes info["life"] is 2 at the start of an episode -- TODO confirm
        start_life = 2

        for episode in range(num_episodes):
            state = env.reset()
            done = False
            total_reward = 0
            step = 0
            last_xpos = 0

            while not done:
                action = agent.get_action(state)
                next_state, reward, done, info = env.step(action)
                done = done or info["life"] < start_life
                clipped_reward = np.clip(reward, -1, 1)

                agent.replay_buffer.add(state, action, clipped_reward, next_state, done)
                agent.train()

                state = next_state
                total_reward += reward
                step += 1
                # Remember the furthest x position seen before the terminal step.
                if not done:
                    last_xpos = info["x_pos"]

            reward_history.append(total_reward)
            print(episode, total_reward, step, last_xpos)
            if (episode + 1) % save_every == 0:
                print(f"Episode {episode + 1}, Avg. Reward: {np.mean(reward_history[-10:])}")
                torch.save({
                    'q_net': agent.q_net.state_dict(),
                    'target_net': agent.target_net.state_dict(),
                    'optimizer': agent.optimizer.state_dict(),
                }, f"dqn_agent_{episode + 1}.pth")
    finally:
        # Release the emulator even when training is interrupted.
        env.close()

In [9]:
# Start the long-running training loop; it loads the checkpoint referenced in
# train_agent and prints one line per episode (index, reward, steps, x position).
train_agent()

  checkpoint = torch.load("4th/dqn_agent_1770.pth", map_location=agent.device)


0 2773.0 2391 2227
1 1112.0 279 1222
2 797.0 225 895
3 2517.0 655 1624
4 1522.0 334 1642
5 2132.0 566 1219
6 2665.0 744 1779
7 1260.0 285 290
8 1373.0 340 1496
9 1277.0 360 1402
Episode 10, Avg. Reward: 1742.8
10 1388.0 323 1509
11 2053.0 531 1128
12 1652.0 380 1785
13 2521.0 632 1621
14 1102.0 322 1222
15 2120.0 596 1220
16 2124.0 586 1213
17 2323.0 888 1384
18 987.0 381 1117
19 714.0 355 837
Episode 20, Avg. Reward: 1698.4
20 1087.0 389 1218
21 220.0 44 284
22 582.0 160 669
23 221.0 45 285
24 671.0 2005 226
25 942.0 872 2699
26 202.0 51 268
27 706.0 203 802
28 633.0 171 722
29 2053.0 2300 1495
Episode 30, Avg. Reward: 731.7
30 982.0 471 1133
31 1535.0 367 1665
32 702.0 185 796
33 1478.0 404 1613
34 669.0 2001 226
35 646.0 2004 2628
36 455.0 2002 15
37 754.0 479 898
38 676.0 278 786
39 1103.0 311 1218
Episode 40, Avg. Reward: 900.0
40 615.0 189 706
41 726.0 177 815
42 482.0 2002 44
43 976.0 294 1087
44 1292.0 315 1409
45 456.0 2001 14
46 764.0 222 865
47 219.0 45 285
48 573.0 142 654


KeyboardInterrupt: 

In [None]:
# Build an agent sized for the COMPLEX_MOVEMENT action set (12 actions),
# restore both networks from the saved checkpoint and play one rendered episode.
agent = DQNVariant(len(COMPLEX_MOVEMENT))

checkpoint = torch.load("4th/dqn_agent_1770.pth", map_location=agent.device)
for net_name in ("q_net", "target_net"):
    getattr(agent, net_name).load_state_dict(checkpoint[net_name])

test_agent(agent)

  checkpoint = torch.load("dqn_agent_10.pth", map_location=agent.device)


KeyboardInterrupt: 

In [None]:
# print(next_state.shape)
# cv2.imshow("Next State", np.array(next_state[0]))
# cv2.waitKey(10000)
# with open("dump.txt", "w+") as f:
#     torch.set_printoptions(threshold=float('inf'))
#     print(next_state[0], file=f)
# exit()

In [51]:
import cProfile
import pstats
import io

# Profile train_agent() and report the 20 most expensive calls by cumulative
# time. A manual interrupt still produces a report for the work done so far.
profiler = cProfile.Profile()
interrupted = False
profiler.enable()
try:
    train_agent()  # long-running code
except KeyboardInterrupt:
    interrupted = True
finally:
    # Always stop the profiler, including when train_agent raises an
    # unexpected error, so it does not keep instrumenting later cells.
    profiler.disable()

if interrupted:
    print("Interrupted! Profiling results up to this point:")

s = io.StringIO()
ps = pstats.Stats(profiler, stream=s).sort_stats('cumtime')
ps.print_stats(20)
print(s.getvalue())

Total Trainable Parameters: 1,690,284
0 446.0 0.9958805064246858 4128
Interrupted! Profiling results up to this point:
         3390943 function calls (3267985 primitive calls) in 40.783 seconds

   Ordered by: cumulative time
   List reduced from 536 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.080    0.080   40.783   40.783 C:\Users\Danniel\AppData\Local\Temp\ipykernel_23600\1265275117.py:1(train_agent)
     4257    0.024    0.000   27.975    0.007 c:\Users\Danniel\anaconda3\envs\drl-hw3\lib\site-packages\gym\wrappers\frame_stack.py:116(step)
     4257    0.018    0.000   27.912    0.007 c:\Users\Danniel\anaconda3\envs\drl-hw3\lib\site-packages\gym\core.py:313(step)
     4257    0.039    0.000   24.259    0.006 C:\Users\Danniel\AppData\Local\Temp\ipykernel_23600\629983895.py:6(step)
    17027    0.020    0.000   24.221    0.001 c:\Users\Danniel\anaconda3\envs\drl-hw3\lib\site-packages\nes_py\wrappers\joypad_spac