# Neural networks
Classes for DQN and Dueling DQN.

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from random import random, randrange


"""
DEEP Q-NETWORK
"""
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by a two-layer head.

    Args:
        obs_shape: Observation shape, unpacked as ``(ch, w, h)``. The
            convolution stack is probed with a ``(1, ch, w, h)`` tensor.
        n_outputs: Number of discrete actions (width of the output layer).
        device: Device onto which ``get_action`` moves observations. The
            constructor does not move the module itself.
    """

    def __init__(self, obs_shape, n_outputs, device):
        super().__init__()

        self.ch, self.w, self.h = obs_shape
        self.n_outputs = n_outputs
        self.device = device

        # Convolution layers
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=self.ch, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(self.conv_out, 512),
            nn.ReLU(),
            nn.Linear(512, n_outputs),
        )

    @property
    def conv_out(self):
        """Number of features produced by ``self.conv`` for one observation.

        Determined by pushing a zero tensor through the convolution stack.
        The dummy input is created on the same device and dtype as the
        convolution weights so the property keeps working after the module
        has been moved (e.g. to a GPU), and the pass runs without autograd
        because only the resulting size is needed.
        """
        reference = next(self.conv.parameters())
        dummy = torch.zeros(
            1, self.ch, self.w, self.h,
            device=reference.device, dtype=reference.dtype,
        )
        with torch.no_grad():
            features = self.conv(dummy)
        return features.view(1, -1).size(1)

    def forward(self, x):
        """Return one Q-value per action for every observation in batch ``x``."""
        x = self.conv(x)
        # Flatten everything but the batch dimension for the linear layers.
        x = x.view(x.shape[0], -1)
        return self.fc(x)

    def get_action(self, obs, eps=0):
        """Choose an action for a single observation with an epsilon-greedy policy.

        Args:
            obs: One observation (without batch dimension); it is converted
                to a float tensor and given a leading batch dimension.
            eps: Probability of returning a uniformly random action.

        Returns:
            The index of the selected action as a Python int.
        """
        if random() < eps:
            return randrange(self.n_outputs)

        obs = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        # Action selection never needs gradients.
        with torch.no_grad():
            q_val = self.forward(obs)
        return q_val.max(1)[1].item()

    def loss(self, action_values, target_values):
        """Return the Huber (smooth L1) loss between predictions and targets."""
        return F.smooth_l1_loss(action_values, target_values)


"""
DUELING NETWORK
"""
class DuelingDQN(nn.Module):
    """Dueling Q-network: shared conv trunk with separate value and advantage heads.

    The Q-values are assembled in ``forward`` as
    ``V(s) + A(s, a) - mean_a A(s, a)``.

    Args:
        obs_shape: Observation shape, unpacked as ``(ch, w, h)``. The
            convolution stack is probed with a ``(1, ch, w, h)`` tensor.
        n_outputs: Number of discrete actions (width of the advantage head).
        device: Device onto which ``get_action`` moves observations. The
            constructor does not move the module itself.
    """

    def __init__(self, obs_shape, n_outputs, device):
        super().__init__()

        self.ch, self.w, self.h = obs_shape
        self.n_outputs = n_outputs
        self.device = device

        # Convolution layers
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=self.ch, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # Probe the trunk once; both heads share the same input width.
        n_features = self.conv_out

        # State value function
        self.fc_V = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

        # Action advantage function
        self.fc_A = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_outputs),
        )

    @property
    def conv_out(self):
        """Number of features produced by ``self.conv`` for one observation.

        Determined by pushing a zero tensor through the convolution stack.
        The dummy input is created on the same device and dtype as the
        convolution weights so the property keeps working after the module
        has been moved (e.g. to a GPU), and the pass runs without autograd
        because only the resulting size is needed.
        """
        reference = next(self.conv.parameters())
        dummy = torch.zeros(
            1, self.ch, self.w, self.h,
            device=reference.device, dtype=reference.dtype,
        )
        with torch.no_grad():
            features = self.conv(dummy)
        return features.view(1, -1).size(1)

    def forward(self, x):
        """Return one Q-value per action for every observation in batch ``x``."""
        x = self.conv(x)
        # Flatten everything but the batch dimension for the linear layers.
        x = x.view(x.shape[0], -1)
        adv = self.fc_A(x)   # (batch, n_outputs)
        val = self.fc_V(x)   # (batch, 1), broadcast over the action dimension
        # Subtracting the mean advantage makes the V/A decomposition unique.
        return val + adv - adv.mean(dim=1, keepdim=True)

    def get_action(self, obs, eps=0):
        """Choose an action for a single observation with an epsilon-greedy policy.

        Args:
            obs: One observation (without batch dimension); it is converted
                to a float tensor and given a leading batch dimension.
            eps: Probability of returning a uniformly random action.

        Returns:
            The index of the selected action as a Python int.
        """
        if random() < eps:
            return randrange(self.n_outputs)

        obs = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        # Action selection never needs gradients; skipping autograd avoids
        # building a graph on every environment step (consistent with DQN).
        with torch.no_grad():
            q_val = self.forward(obs)
        return q_val.max(1)[1].item()

    def loss(self, action_values, target_values):
        """Return the Huber (smooth L1) loss between predictions and targets."""
        return F.smooth_l1_loss(action_values, target_values)


# Experience replay
Classes for regular experience replay and prioritised experience replay.

In [0]:
from collections import deque
import numpy as np


"""
EXPERIENCE REPLAY
"""
class ExperienceReplay:
    """Fixed-capacity replay memory that samples transitions uniformly.

    Transitions are kept in a ``deque`` bounded by ``buffer_size``, so the
    oldest entries are discarded once the capacity is reached.
    """

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def __len__(self):
        return len(self.buffer)

    def add(self, obs, reward, action, obs2, done):
        """Append one ``(obs, reward, action, obs2, done)`` transition."""
        transition = (obs, reward, action, obs2, done)
        self.buffer.append(transition)

    def sample_minibatch(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(obs, reward, action, obs2, done)`` of NumPy arrays, one
            row per sampled transition; ``reward`` is float32 and ``done``
            is uint8.
        """
        chosen = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        transitions = [self.buffer[idx] for idx in chosen]

        # Transpose the list of transitions into one sequence per field.
        states, rewards, actions, next_states, dones = zip(*transitions)

        rewards = np.array(rewards, dtype=np.float32)
        dones = np.array(dones, dtype=np.uint8)
        return np.array(states), rewards, np.array(actions), np.array(next_states), dones


"""
PRIORITIZED EXPERIENCE REPLAY
"""
class PER:
    """Prioritized experience replay backed by two parallel bounded deques.

    ``buffer`` holds the transitions and ``priorities`` holds one priority
    per transition at the same index. Both share the same ``maxlen`` so they
    evict in lockstep.
    """

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)

    def add(self, obs, reward, action, obs2, done):
        """Append a transition with the current maximum priority.

        New transitions get the highest priority present in the memory
        (1 when it is empty) so they are likely to be sampled at least once.
        """
        self.buffer.append((obs, reward, action, obs2, done))
        self.priorities.append(max(self.priorities, default=1))

    def get_probabilities(self, priority_scale):
        """Return the sampling probability of every stored transition.

        Each priority is raised to ``priority_scale`` and the result is
        normalised to sum to one.
        """
        # float64 keeps the normalised vector close enough to 1.0 for
        # np.random.choice, and ndarray.sum avoids a Python-level loop over
        # the whole memory on every training step.
        scaled = np.array(self.priorities, dtype=np.float64) ** priority_scale
        return scaled / scaled.sum()

    def get_importance(self, probabilities):
        """Return importance-sampling weights normalised so the largest is 1.

        Args:
            probabilities: Sampling probabilities of the drawn transitions.
        """
        probabilities = np.asarray(probabilities)
        importance = 1 / len(self.buffer) * 1 / probabilities
        return importance / importance.max()

    def sample_minibatch(self, batch_size, priority_scale=1):
        """Draw ``batch_size`` distinct transitions according to their priority.

        Returns:
            Tuple ``(obs, reward, action, obs2, done, importance, idxs)``.
            The first five are NumPy arrays with one row per transition
            (``reward`` float32, ``done`` uint8), ``importance`` holds the
            normalised importance weights and ``idxs`` the buffer positions
            to pass back to ``set_priorities``.
        """
        probabilities = self.get_probabilities(priority_scale)

        mb_idxs = np.random.choice(
            len(self.buffer), size=batch_size, replace=False, p=probabilities
        )
        importance = self.get_importance(probabilities[mb_idxs])

        obs, reward, action, obs2, done = zip(*[self.buffer[i] for i in mb_idxs])
        return (
            np.array(obs),
            np.array(reward, dtype=np.float32),
            np.array(action),
            np.array(obs2),
            np.array(done, dtype=np.uint8),
            importance,
            mb_idxs,
        )

    def set_priorities(self, idxs, errors, offset=0.1):
        """Set the priority of each indexed transition to ``|error| + offset``.

        ``offset`` keeps every priority strictly positive so no transition
        ends up with zero sampling probability.
        """
        for idx, err in zip(idxs, errors):
            # Store plain Python floats so the deque does not fill up with
            # NumPy scalars of mixed precision.
            self.priorities[int(idx)] = float(abs(err)) + offset

    def __len__(self):
        return len(self.buffer)


# DQN
To view the TensorBoard graphs, run


```
tensorboard --logdir=runs
```
from the directory in which the `runs` folder was created, then open http://localhost:6006 in your web browser.


In [12]:
import gym
import torch
import torch.optim as optim

from gym.wrappers import AtariPreprocessing, FrameStack
from torch import FloatTensor, LongTensor
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime


def optimize(eps):
    """Run one gradient step on ``policy_net`` from a replay minibatch.

    Reads the module-level ``memory``, ``policy_net``, ``target_net``,
    ``optimizer``, ``device`` and the hyperparameter constants.

    Args:
        eps: Current exploration rate. With prioritized replay the
            importance weights are raised to ``1 - eps``, so the correction
            grows stronger as exploration decays.

    Returns:
        The scalar loss of this step as a Python float.
    """
    # Sample minibatch
    if PRIORITIZED_REPLAY:
        mb_obs, mb_reward, mb_action, mb_obs2, mb_done, importance, idxs = memory.sample_minibatch(BATCH_SIZE, 0.7)
    else:
        mb_obs, mb_reward, mb_action, mb_obs2, mb_done = memory.sample_minibatch(BATCH_SIZE)
    mb_obs = FloatTensor(mb_obs).to(device)
    mb_reward = FloatTensor(mb_reward).to(device)
    mb_action = LongTensor(mb_action).to(device)
    mb_obs2 = FloatTensor(mb_obs2).to(device)
    mb_done = FloatTensor(mb_done).to(device)

    # Q-values of the actions actually taken. squeeze(1) (rather than a bare
    # squeeze) keeps the batch dimension even when the batch holds one item.
    qv = policy_net(mb_obs).gather(1, mb_action.unsqueeze(1)).squeeze(1)

    # Targets are constants for the regression: computing them without
    # autograd avoids backpropagating into (and accumulating gradients on)
    # the target network.
    with torch.no_grad():
        next_qv = target_net(mb_obs2)
        if DDQN:
            # Double DQN: policy net picks the action, target net scores it.
            best_action = policy_net(mb_obs2).max(1)[1].unsqueeze(1)
            mb_target_qv = next_qv.gather(1, best_action).squeeze(1)
        else:
            mb_target_qv = next_qv.max(1)[0]

        # Terminal transitions (done == 1) do not bootstrap.
        e_qv = mb_reward + DISCOUNT * mb_target_qv * (1 - mb_done)

    if PRIORITIZED_REPLAY:
        # TD-errors become the new priorities of the sampled transitions.
        errors = (e_qv - qv).detach().cpu().numpy()
        memory.set_priorities(idxs, errors)

        # The importance weights must scale each sample's own loss, so the
        # Huber loss is evaluated element-wise here; policy_net.loss returns
        # an already averaged scalar and would reduce the weights to a
        # single global factor.
        weights = FloatTensor(importance ** (1 - eps)).to(device)
        per_sample_loss = torch.nn.functional.smooth_l1_loss(qv, e_qv, reduction='none')
        loss = (weights * per_sample_loss).mean()
    else:
        loss = policy_net.loss(qv, e_qv)

    # Update gradient
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        # Parameters that did not take part in the loss have no gradient.
        if param.grad is not None:
            param.grad.clamp_(-1, 1)
    optimizer.step()

    return loss.item()


def train():
    """Train ``policy_net`` for ``EPISODES`` episodes on the global ``env``.

    Every environment step stores the transition in ``memory``, decays
    epsilon linearly and, once the memory holds more than
    ``MIN_MEMORY_SIZE`` transitions, performs one optimisation step. The
    target network is synchronised every ``UPDATE_TARGET`` steps. Every
    ``TEST_FREQUENCY`` episodes the target network is evaluated with
    ``test()`` and its parameters are saved under ``log_path``. Losses and
    rewards are written to the global TensorBoard ``writer``.
    """
    step_count = 0
    running_loss = 0
    eps = EPSILON_START
    # Linear schedule: epsilon reaches EPSILON_END after EPSILON_DECAY steps.
    decay = (EPSILON_START - EPSILON_END) / EPSILON_DECAY

    # Run episodes
    for episode in range(EPISODES):
        print('\rEpisode', episode, end='', flush=True)

        obs = env.reset()
        game_reward = 0
        done = False

        while not done:
            # Retrieve action from epsilon-greedy policy
            action = policy_net.get_action(obs, eps)
            new_obs, reward, done, _ = env.step(action)

            # Store transition in replay memory
            memory.add(obs, reward, action, new_obs, done)

            obs = new_obs
            game_reward += reward
            step_count += 1

            # Epsilon decay, clamped so the final step cannot undershoot
            # the configured floor.
            if eps > EPSILON_END:
                eps = max(EPSILON_END, eps - decay)

            # Optimize policy network
            if len(memory) > MIN_MEMORY_SIZE:
                running_loss += optimize(eps)

                # Update TensorBoard loss graph
                if step_count % 1000 == 0:
                    writer.add_scalar('training loss', running_loss/1000, step_count)
                    running_loss = 0

                # Update target network
                if step_count % UPDATE_TARGET == 0:
                    target_net.load_state_dict(policy_net.state_dict())

        # Update TensorBoard train reward graph
        writer.add_scalar('training reward', game_reward, step_count)

        # Test target network
        if episode % TEST_FREQUENCY == 0:
            mean_test_reward = test()
            print(f'\n = | Episode: {episode:4d} | Step: {step_count:7d} | Reward: {mean_test_reward:2.2f} |\n')

            # Update Tensorboard test reward graph
            writer.add_scalar('test reward', mean_test_reward, step_count)

            # Save target network parameters to file
            torch.save(target_net.state_dict(), f'{log_path}/target_net_params.pt')


def test(n_episodes=10):
    """Evaluate ``target_net`` greedily on the global ``env``.

    Args:
        n_episodes: Number of complete episodes to play (default 10).

    Returns:
        The mean total reward per episode.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        raise ValueError('n_episodes must be at least 1')

    print('\n\nTesting target network')

    games_reward = 0

    for episode in range(n_episodes):
        print('\r > Test episode', episode, end='', flush=True)

        obs = env.reset()
        done = False

        while not done:
            # get_action defaults to eps=0, i.e. a purely greedy policy.
            action = target_net.get_action(obs)
            obs, reward, done, _ = env.step(action)
            games_reward += reward

    return games_reward / n_episodes


if __name__ == '__main__':
    # Script entry point: configures the run, builds the environment,
    # networks, optimizer and replay memory as module-level globals (they
    # are read by optimize(), train() and test()), then starts training.

    # Hyperparameters
    ENV_NAME            =   'PongNoFrameskip-v4'
    RUN_TITLE           =   'PER_Dueling_DDQN'
    LEARNING_RATE       =   1e-4
    EPISODES            =   1000
    DISCOUNT            =   0.99
    BATCH_SIZE          =   32
    MEMORY_SIZE         =   int(1e5)
    MIN_MEMORY_SIZE     =   int(1e4)
    UPDATE_TARGET       =   int(1e3)
    FRAMES_NUMBER       =   4
    TEST_FREQUENCY      =   10
    EPSILON_START       =   1
    EPSILON_END         =   0.1
    EPSILON_DECAY       =   int(5e5)
    DDQN                =   True
    DUELING_DQN         =   True
    PRIORITIZED_REPLAY  =   True

    # GPU availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Path to log files. Formatting the timestamp explicitly (instead of
    # slicing the microseconds off isoformat()) stays correct even when the
    # microsecond field happens to be zero and is omitted by isoformat().
    init_time = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
    log_path = f'runs/{ENV_NAME}_{RUN_TITLE}_{init_time}'

    # TensorBoard summary writer
    writer = SummaryWriter(log_path)

    # Hyperparameter log file
    with open(f'{log_path}/hyperparams.txt', 'w') as file:
        file.write(f"{RUN_TITLE} Hyperparameters\n\n")
        file.write(f"Learning rate:\t\t {LEARNING_RATE}\n")
        file.write(f"Episodes:\t\t\t {EPISODES}\n")
        file.write(f"Discount:\t\t\t {DISCOUNT}\n")
        file.write(f"Batch size:\t\t\t {BATCH_SIZE}\n")
        file.write(f"Memory size:\t\t {MEMORY_SIZE}\n")
        file.write(f"Min. memory size:\t {MIN_MEMORY_SIZE}\n")
        file.write(f"Update target:\t\t {UPDATE_TARGET}\n")
        file.write(f"Frames num:\t\t\t {FRAMES_NUMBER}\n")
        file.write(f"Test Frequenzy:\t\t {TEST_FREQUENCY}\n")
        file.write(f"Start exploration:\t {EPSILON_START}\n")
        file.write(f"End exploration:\t {EPSILON_END}\n")
        file.write(f"Exploration steps:\t {EPSILON_DECAY}\n")
        file.write(f"Double DQN:\t\t\t {DDQN}\n")
        file.write(f"Dueling DQN:\t\t {DUELING_DQN}\n")
        file.write(f"Prioritized Replay:\t {PRIORITIZED_REPLAY}")

    # Environment
    env = FrameStack(AtariPreprocessing(gym.make(ENV_NAME)), FRAMES_NUMBER)

    try:
        # Dimensions of observations
        obs_dim = env.observation_space.shape

        # Amount of actions
        n_outputs = env.action_space.n

        # Neural networks: the target net starts as a copy of the policy net.
        network_cls = DuelingDQN if DUELING_DQN else DQN
        policy_net = network_cls(obs_dim, n_outputs, device).to(device)
        target_net = network_cls(obs_dim, n_outputs, device).to(device)
        target_net.load_state_dict(policy_net.state_dict())

        # Optimizer
        optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

        # Replay memory
        memory = PER(MEMORY_SIZE) if PRIORITIZED_REPLAY \
            else ExperienceReplay(MEMORY_SIZE)

        # Train policy network
        train()
    finally:
        # Runs on normal completion and on interruption (e.g. Ctrl-C), so
        # pending TensorBoard events are flushed and the emulator is released.
        writer.close()
        env.close()


Episode 0

Testing target network
 > Test episode 9
 = | Episode:    0 | Step:     908 | Reward: -21.00 |

Episode 10

Testing target network
 > Test episode 9
 = | Episode:   10 | Step:    9649 | Reward: -21.00 |

Episode 20

Testing target network
 > Test episode 9
 = | Episode:   20 | Step:   18445 | Reward: -21.00 |

Episode 21

KeyboardInterrupt: ignored