<h4><font color='red'>Work in Progress</font></h4>

# DQN - Breakout

#### Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions
import torchvision
from torch.autograd import Variable
from collections import deque, namedtuple
import random
import pickle
import gym
import os
from gym import wrappers
import io
import base64
from IPython.display import HTML

#### Global parameters

In [2]:
# Detect GPU availability once; the tensor aliases below are chosen from it.
CUDA = torch.cuda.is_available()
print('CUDA has been enabled.' if CUDA is True else 'CUDA has been disabled.')

# Number of transitions sampled from replay memory per optimisation step.
BATCH_SIZE = 32

# Device-specific tensor constructors, so later cells can create tensors on the
# active device without re-checking CUDA.
FloatTensor = torch.cuda.FloatTensor if CUDA else torch.FloatTensor
IntTensor   = torch.cuda.IntTensor if CUDA else torch.IntTensor
LongTensor  = torch.cuda.LongTensor if CUDA else torch.LongTensor
ByteTensor  = torch.cuda.ByteTensor if CUDA else torch.ByteTensor
Tensor      = FloatTensor

# One environment step as stored in replay memory.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

ENV = 'BreakoutDeterministic-v4'
print(f'\nRunning the {ENV} environment...')

# Module-level environment, used here to read the sizes of the spaces
# (a later cell also wraps it in a Monitor).
env = gym.make(ENV)
# NOTE(review): shape[0] is only the first axis of the observation shape (210 in
# the recorded output), so the "N-dimensional vector" wording in the message below
# is presumably a leftover from a vector-observation task — confirm.
N_OBS_SPACE = env.observation_space.shape[0]
N_ACT_SPACE = env.action_space.n
print(f'State represented by {N_OBS_SPACE}-dimensional vector and {N_ACT_SPACE} actions available.')
#del env

CUDA has been enabled.

Running the BreakoutDeterministic-v4 environment...
State represented by 210-dimensional vector and 4 actions available.


#### Save/Load

In [3]:
def save_agent(agent, filename):
    '''
    Saves an agent, together with its test statistics, under ./agents.

    The agent is pickled to `<cwd>/agents/<filename>.agent`; its `test_n`
    and `test_r` histories are written next to it through `save_stats`.
    '''
    path = os.path.join(os.getcwd(), 'agents')
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, filename + '.agent'), 'wb') as f:
        pickle.dump(agent, f)
    save_stats(agent.test_n, agent.test_r, filename)
    

def load_agent(filename):
    '''
    Unpickles and returns the agent stored as `<cwd>/agents/<filename>.agent`.
    '''
    agent_path = ''.join([os.getcwd(), '/agents', '/', filename, '.agent'])
    with open(agent_path, 'rb') as handle:
        agent = pickle.load(handle)
    return agent
    
    
def save_stats(n, r, filename):
    '''
    Saves test statistics to `<cwd>/agents/<filename>.stats`.

    `n` and `r` are pickled together as the tuple `(n, r)`; the directory
    is created when it does not exist yet.
    '''
    path = os.path.join(os.getcwd(), 'agents')
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, filename + '.stats'), 'wb') as f:
        pickle.dump((n, r), f)

        
def load_stats(filename):
    '''
    Unpickles and returns the stats stored as `<cwd>/agents/<filename>.stats`.
    '''
    stats_path = ''.join([os.getcwd(), '/agents', '/', filename, '.stats'])
    with open(stats_path, 'rb') as handle:
        stats = pickle.load(handle)
    return stats

    
def load_memory(filename='human'):
    '''
    Unpickles and returns the memory stored as `<cwd>/memory/<filename>.memory`
    (the human play memory by default).
    '''
    memory_path = ''.join([os.getcwd(), '/memory/', filename, '.memory'])
    with open(memory_path, 'rb') as handle:
        memory = pickle.load(handle)
    return memory

#### Agent base

In [4]:
class ReplayMemory(object):
    """
    Fixed-capacity store of Transition tuples that overwrites its oldest
    slot once full and supports uniform random sampling.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Stores Transition(*args) at the current write position."""
        slots = self.memory
        # Grow by one placeholder slot until capacity is reached.
        if len(slots) < self.capacity:
            slots.append(None)
        slots[self.position] = Transition(*args)
        # Advance the write cursor, wrapping around at capacity.
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Returns batch_size distinct stored items chosen at random."""
        chosen = random.sample(self.memory, batch_size)
        return chosen

    def __len__(self):
        """Number of items currently stored."""
        stored = len(self.memory)
        return stored

    
class Agent(object):
    """
    Base agent: owns the environment, the policy network and its optimizer,
    and provides episode roll-outs, testing and replay recording.

    Subclasses may add `epsilon`/`epsilon_min`/`epsilon_max`/`epsilon_steps`,
    `memory`, `step_num`, `update_target`, `target` and `replay()`; when
    present they are used by `play_episode` during training.
    """
    def __init__(self,
                 policy=None,
                 env=None,
                 num_episodes=1000,
                 discount_factor=0.99,
                 lr=1e-3,
                 test_freq=200,
                 test_num=10,
                 min_reward=-250,
                 max_reward=200):
        """
        policy:          network used to pick actions (moved to GPU when CUDA is set).
        env:             gym environment; a fresh `gym.make(ENV)` when omitted.
        num_episodes:    number of training episodes.
        discount_factor: discount applied to future rewards.
        lr:              Adam learning rate.
        test_freq:       stored for subclasses' training loops.
        test_num:        episodes played per call to `test`.
        min_reward:      training episodes stop once the return drops below this.
        max_reward:      a return above this sets `achieved_max_reward`.
        """
        super(Agent, self).__init__()

        # Create the default environment lazily: a `gym.make(ENV)` default argument
        # is evaluated once at class-definition time and shared by every instance.
        if env is None:
            env = gym.make(ENV)

        self.env = env
        self.num_episodes = num_episodes
        self.discount_factor = discount_factor
        self.lr = lr
        self.test_freq = test_freq
        self.test_num = test_num
        self.min_reward = min_reward
        self.max_reward = max_reward
        self.achieved_max_reward = False
        self.rollout_limit = env.spec.timestep_limit

        if policy is not None:
            self.policy = policy.cuda() if CUDA else policy
            self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        self.test_n, self.test_r = [], []
        self.losses = []


    def reset_env(self, env=None):
        """
        Resets the current environment using a constant
        seed to make sure environment is deterministic.
        Returns the preprocessed initial observation.
        """
        if env is None: env = self.env
        env.seed(0)
        return self.preprocess(env.reset())


    def select_action(self, s):
        """
        Selects an action according to the current policy.
        Returns the sampled action and its log-probability.
        """
        # NOTE(review): this treats the policy output as action probabilities — confirm
        # for any policy used with the base class.
        s = Variable(Tensor(s))
        action_probs = self.policy(s)
        log_probs = action_probs.log()
        action = torch.distributions.Categorical(action_probs).sample()
        return action.data.cpu().numpy(), log_probs[action]


    def take_action(self,state):
        """
        Preprocesses a raw observation and returns the selected action only.
        """
        preprocessedstate = self.preprocess(state)
        action = self.select_action(preprocessedstate)
        return action[0]


    def preprocess(self, x):
        """
        Turns a raw (height, width, channels) frame into network input:
        keeps every second pixel along both spatial axes, averages the
        channels, scales by 1/255 and reshapes to (-1, 1, 105, 80).
        """
        x = torch.tensor(x).permute([2, 0, 1]).data.numpy()
        x = np.mean(x[:, ::2, ::2], axis=0) / 255
        return x.reshape(-1, 1, 105, 80)


    def play_episode(self, env=None, replay=False, render=False):
        """
        Plays a single episode and returns SAR rollout.
        The logarithm of the action probabilities is also
        included in the rollout (for computing the loss).

        env:    environment to play in. When omitted the agent's own
                environment is used and the episode counts as training
                (epsilon decay, memory pushes, target sync, replay).
        replay: call `self.replay()` after every training step.
        render: call `env.render()` before every step.

        Returns an object array of shape (steps, 4) whose columns are
        state, action, reward and log-probability.
        """
        train = env is None
        if train:
            env = self.env

        s = self.reset_env(env)
        rollout, eps_r = [], 0

        for i in range(self.rollout_limit):
            if render:
                env.render()

            a, log_probs = self.select_action(s)
            s1, r, done, _ = env.step(a)
            s1 = self.preprocess(s1)
            eps_r += r

            # The stored reward of the final step is replaced by a penalty;
            # eps_r keeps the environment's own reward.
            if done:
                r = -1

            rollout.append((s, a, r, log_probs))

            if train:
                # Exploration schedule only exists on agents that define epsilon.
                if hasattr(self, 'epsilon') and self.epsilon > self.epsilon_min:
                    self.epsilon -= (self.epsilon_max - self.epsilon_min) / self.epsilon_steps

                if hasattr(self, 'memory'):
                    # Terminal transitions carry no next state, so the None-based
                    # non-final mask used when replaying does not bootstrap from them.
                    next_state = None if done else Tensor(s1)
                    self.memory.push(Tensor(s), a, next_state, r)
                    self.step_num += 1
                    if self.step_num % self.update_target == 0:
                        self.step_num = 0
                        # load_state_dict copies values into the target's existing
                        # parameters, so the target stays on its current device.
                        self.target.load_state_dict(self.policy.state_dict())

                if replay: self.replay()
                # `env` was rebound to self.env above, so an `env is None` test here
                # could never be true; being inside the training branch is enough.
                if eps_r < self.min_reward: break

            if done: break

            s = s1

        if eps_r > self.max_reward:
            print('Achieved maximum reward:', eps_r)
            self.achieved_max_reward = True

        # Fill the object array cell by cell: letting np.array infer a layout from
        # ragged (array, scalar, scalar, object) tuples is rejected by newer NumPy.
        result = np.empty((len(rollout), 4), dtype=object)
        for row, step in enumerate(rollout):
            for col, item in enumerate(step):
                result[row, col] = item
        return result


    def test(self):
        """
        Runs a number of tests and computes the
        mean episode length and mean reward.
        The per-episode values are appended to test_n / test_r.
        """
        # NOTE(review): play_episode() is called without an env, so these episodes
        # run in training mode (epsilon decays, memory is filled) — confirm intended.
        n, r = [], []

        for e in range(self.test_num):
            rollout = self.play_episode()
            rewards = np.array(rollout[:,2], dtype=float)
            n.append(len(rollout))
            r.append(sum(rewards))

        self.test_n.append(n)
        self.test_r.append(r)
        return np.mean(n), np.mean(r)


    def get_replay(self):
        """
        Renders an episode replay using the current policy.
        The episode is recorded through a Monitor wrapper into ./gym-results.
        """
        env = wrappers.Monitor(self.env, "./gym-results", force=True)
        try:
            state = env.reset()
            while True:
                env.render()
                # Use this agent, not whatever global happens to be named `agent`.
                action = self.take_action(state)
                state_next, reward, terminal, info = env.step(action)
                state = state_next
                if terminal: break
        finally:
            env.close()


    def render(self):
        """
        Plays one rendered, non-training episode; exploration is switched
        off for its duration when the agent has an epsilon.
        """
        if hasattr(self, 'epsilon'):
            eps = self.epsilon
            self.epsilon = 0
            try:
                self.play_episode(self.env, render=True)
            finally:
                # Restore exploration even if the episode raises.
                self.epsilon = eps
        else:
            self.play_episode(self.env, render=True)
        

#### DQN

In [5]:
class Conv(nn.Module):
    """
    Convolution block: Conv2d followed by BatchNorm2d and ReLU.

    With `padding` truthy the convolution is padded by half the kernel
    size (rounded down); otherwise no padding is applied.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1, stride=1, padding=False):
        super().__init__()

        if padding:
            pad = int((kernel_size - 1) / 2)
        else:
            pad = 0

        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                      dilation=dilation, stride=stride, padding=pad),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Applies the three layers in sequence."""
        activated = self.conv(x)
        return activated


class PolicyConvNet(nn.Module):
    """
    Small convolutional network mapping single-channel 105x80 frames to
    `n_out` values per frame: two Conv blocks followed by two linear layers.
    """

    def __init__(self, n_out):
        super().__init__()

        self.conv1 = Conv(1, 16, kernel_size=8, stride=4)
        self.conv2 = Conv(16, 32, kernel_size=4, stride=2)

        # 2816 = 32 channels x 11 x 8, the conv2 output size for a 105x80 input.
        self.fc1 = nn.Linear(2816, 128)
        self.out = nn.Linear(128, n_out)


    def forward(self, x):
        """Reshapes the input to (-1, 1, 105, 80) and returns the n_out outputs."""
        frames = x.reshape(-1, 1, 105, 80)
        features = self.conv2(self.conv1(frames))

        flat = features.reshape(-1, 2816)
        hidden = F.relu(self.fc1(flat))
        return self.out(hidden)


class DQN(Agent):
    """
    Deep Q-Network agent: a PolicyConvNet policy, a periodically synced
    target network, a replay memory and epsilon-greedy exploration.
    """
    def __init__(self,
                 env_arg='BreakoutDeterministic-v4',
                 env=None,
                 num_episodes=1000,
                 discount_factor=0.99,
                 lr=3e-4,
                 test_freq=50,
                 test_num=10,
                 update_target=2500,
                 epsilon_max=1,
                 epsilon_min=0.1,
                 epsilon_steps=1000000//4,
                 name=None):
        """
        env_arg:       gym id used to create the environment when `env` is omitted.
        env:           ready-made environment to use instead of `gym.make(env_arg)`.
        update_target: number of stored steps between target-network syncs.
        epsilon_max:   initial exploration rate.
        epsilon_min:   exploration rate at which the decay stops.
        epsilon_steps: number of steps over which epsilon is decayed.
        name:          currently unused.
        The remaining arguments are forwarded to Agent.
        """
        # Create the environment at construction time rather than in a default
        # argument (evaluated once when the class is defined), and use that same
        # environment both for sizing the networks and for playing.
        if env is None:
            env = gym.make(env_arg)

        super(DQN, self).__init__(
            env=env,
            policy=PolicyConvNet(n_out=env.action_space.n),
            num_episodes=num_episodes,
            discount_factor=discount_factor,
            lr=lr,
            test_freq=test_freq,
            test_num=test_num)

        self.inputsize = env.observation_space.shape[0]
        self.outputsize = env.action_space.n

        if CUDA:
            self.policy = self.policy.cuda()

        # The target network starts as a frozen copy of the policy network.
        self.target = PolicyConvNet(n_out=env.action_space.n)
        self.target.load_state_dict(self.policy.state_dict())
        self.target.eval()

        if CUDA:
            self.target = self.target.cuda()

        self.loss = nn.MSELoss()
        self.memory = ReplayMemory(100000)
        self.update_target = update_target
        self.epsilon_max = epsilon_max
        self.epsilon = epsilon_max
        self.epsilon_min = epsilon_min
        self.epsilon_steps = epsilon_steps
        self.step_num = 0


    def select_action(self, s):
        """
        Selects an action epsilon-greedily: the policy's argmax with
        probability 1 - epsilon, otherwise a uniformly random action.
        Returns (action, None) to match the base-class return shape.
        """
        if random.random() > self.epsilon:
            with torch.no_grad():
                return int(self.policy(Tensor(s)).argmax().item()), None
        else:
            return np.random.randint(self.outputsize), None


    def update_weights(self, transitions):
        """
        Updates the network's weights according to a
        number of given transitions.

        Transitions whose next_state is None are treated as terminal and
        contribute only their reward to the target value.
        """
        minibatch_size = len(transitions)

        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = LongTensor(batch.action).reshape(minibatch_size, -1)
        reward_batch = Tensor(batch.reward)

        state_action_values = self.policy(state_batch).gather(1, action_batch)

        # Terminal transitions keep a next-state value of zero.
        next_state_values = torch.zeros(minibatch_size, device=reward_batch.device)

        non_final_next_states = [Tensor(s) for s in batch.next_state if s is not None]
        # Skip the target pass when every sampled transition is terminal;
        # torch.cat cannot concatenate an empty list.
        if non_final_next_states:
            # Prefer a boolean mask where torch provides it; fall back to uint8.
            mask_dtype = getattr(torch, 'bool', torch.uint8)
            non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                          dtype=mask_dtype, device=reward_batch.device)
            next_state_values[non_final_mask] = \
                self.target(torch.cat(non_final_next_states)).max(1)[0].detach()

        expected_state_action_values = ((next_state_values * self.discount_factor) + reward_batch)

        loss = self.loss(state_action_values, expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        # Clip each gradient element to [-1, 1] before the optimizer step.
        for param in self.policy.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()


    def replay(self):
        """
        Samples a number of transitions from memory and replays
        the experiences. Does not do anything until memory contains
        enough samples for a full minibatch.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        self.update_weights(transitions)


    def pretrain(self, batches=10000, batch_size=32):
        """
        Replaces the replay memory with the stored human-play memory and
        runs `batches` weight updates of `batch_size` transitions each,
        printing progress roughly every 20%.
        """
        print('Pretraining from memory...')

        # NOTE: load_memory unpickles a file; only use memory files you trust.
        self.memory = load_memory()

        # Guard against a zero modulus when fewer than five batches are requested.
        report_every = max(1, batches // 5)

        # Inclusive upper bound so that exactly `batches` updates are performed.
        for batch in range(1, batches + 1):
            transitions = self.memory.sample(batch_size)
            self.update_weights(transitions)

            if batch % report_every == 0:
                print(str(int(batch*100//batches)) + '%')

        print('Completed pretraining!\n')


    def fill_memory(self, episodes=300):
        """
        Fills the replay memory with transitions from `episodes` episodes
        of uniformly random play.
        """
        env = self.env

        for episode in range(episodes):
            # Reset at the start of every episode; stepping an environment
            # that has already reported done is not valid.
            s = self.reset_env(env)

            for i in range(self.rollout_limit):
                a = np.random.randint(self.outputsize)
                s1, r, done, _ = env.step(a)
                s1 = self.preprocess(s1)

                if done:
                    r = -1

                # Terminal transitions carry no next state (see update_weights).
                self.memory.push(Tensor(s), a, None if done else Tensor(s1), r)
                if done: break
                s = s1


    def train(self):
        """
        Runs a full training for a defined number of episodes.
        Every `test_freq` episodes the agent is tested and the mean reward,
        mean length and current epsilon are printed. Training stops early
        once the maximum reward has been achieved.
        """
        print('Filling memory...')
        self.fill_memory()
        print('Started training...')

        for e in range(1, self.num_episodes+1):
            self.play_episode(replay=True)

            if e % self.test_freq == 0:
                n, r = self.test()
                print('{:5d},  Reward: {:6.2f},  Length: {:4.2f}, Epsilon: {:4.2f}'.format(e, r, n, self.epsilon))

            if self.achieved_max_reward: break

        print('Completed training!')
        

# Build a fresh agent (swap in the commented load_agent call to resume a saved one),
# train it, then persist it together with its test statistics.
agent = DQN() #load_agent('ConvDQN-solved')
#agent.pretrain()
agent.train()
save_agent(agent, 'ConvDQN-solved')

Filling memory...
Started training...


KeyboardInterrupt: 

In [None]:
# Record one episode played by the agent (written to ./gym-results).
agent.get_replay()

#### Record an episode replay

In [None]:
# Record one episode played by the agent into ./gym-results.
# The wrapper gets its own name so that re-running this cell does not
# wrap an already wrapped `env` again.
monitor_env = wrappers.Monitor(env, "./gym-results", force=True)
try:
    state = monitor_env.reset()
    while True:
        monitor_env.render()
        # Let the agent pick the action for the current frame.
        action = agent.take_action(state)
        state_next, reward, terminal, info = monitor_env.step(action)
        state = state_next
        if terminal:
            break
finally:
    # Close the monitor even if the episode is interrupted.
    monitor_env.close()

In [None]:
# Query the agent's action for the last frame left in `state` by the replay cell above.
print(agent.take_action(state))

#### Load human memory

In [None]:
# NOTE: load_memory unpickles a file; only load memory files you trust.
mem = load_memory()

print('Human memory of length', len(mem.memory))

# Show the first stored state as a 105x80 image.
# matplotlib.pyplot is already imported as plt in the imports cell.
plt.imshow(mem.memory[0].state.reshape(105, 80).cpu().numpy())