In [1]:
import gym
import matplotlib.pyplot as plt
import torch
from wrappers import wrap_deepmind

In [2]:
from collections import namedtuple
# One environment step as stored in the replay buffer. Field order matters:
# records are built positionally from (state, action, next_state, reward, ended).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward', 'ended'],
)

class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    The backing list grows until it holds ``capacity`` entries; after that each
    new record overwrites the slot at ``position``, which wraps around modulo
    ``capacity``.
    """

    def __init__(self, capacity):
        # Next slot to write, the backing list, and the size limit.
        self.position = 0
        self.memory = []
        self.capacity = capacity

    def push(self, *args):
        """Store one transition; ``args`` are the ``Transition`` fields in order."""
        buffer = self.memory
        if len(buffer) < self.capacity:
            # Still filling up: reserve the slot that is written just below.
            buffer.append(None)
        slot = self.position
        buffer[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored entries chosen with ``random.sample``."""
        stored = self.memory
        return random.sample(stored, batch_size)

    def __len__(self):
        """Number of entries currently held."""
        return len(self.memory)
    
# Module-level replay buffer limited to 5000 transitions. play_game() pushes
# every step into it and train_batch() samples minibatches from it.
memory = ReplayMemory(5000)
# Disabled debugging aid: display the first three leading-axis slices of the
# first stored state (presumably image channels -- TODO confirm). It can only
# work once the buffer holds at least one transition.
# plt.imshow(memory.memory[0].state[:3,].permute(1,2,0))

In [3]:
# Compute device shared by the helpers below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def n2t(vec):
    """Convert a NumPy array to a torch tensor placed on the module-level ``device``.

    :param vec: NumPy array accepted by ``torch.from_numpy``
    :return: tensor with the same values, on ``device``
    """
    as_tensor = torch.from_numpy(vec)
    return as_tensor.to(device)
    
def t2n(tensor):
    """Convert a torch tensor to a NumPy array on the host.

    The tensor is detached first so that tensors which are part of an autograd
    graph (``requires_grad=True``) can be converted as well; calling
    ``.numpy()`` on such a tensor directly raises a ``RuntimeError``.

    :param tensor: torch tensor on any device
    :return: NumPy array with the tensor's values
    """
    return tensor.detach().cpu().numpy()

In [4]:
def play_game(env = None, agent = None, th = 0, maxstep = 5000, render = False):
    """Play one episode and record every transition in the global replay ``memory``.

    :param env: environment exposing ``reset()``, ``step(action)`` and, when
        ``render`` is set, ``render(mode="rgb_array")``. When omitted, a wrapped
        "Pong-v0" environment is created for this call and closed afterwards.
        (It used to be built once as a default argument, i.e. at definition time.)
    :param agent: callable ``agent(state, th=...)`` returning the action to take
    :param th: threshold forwarded to ``agent`` unchanged
    :param maxstep: upper bound on the number of environment steps
    :param render: when true, every third step's rendered frame is collected
    :return: dict with ``'cum_reward'`` (float sum of rewards), ``'steps'``
        (number of environment steps actually taken) and, if ``render`` is set
        and at least one frame was captured, ``'frames'``: the frames
        concatenated, permuted with ``(3, 0, 1, 2)`` and given a leading axis
    :raises TypeError: if no ``agent`` is supplied
    """
    if agent is None:
        raise TypeError("play_game() requires a callable `agent`")

    owns_env = env is None
    if owns_env:
        # Created lazily so that defining this function has no side effects.
        env = wrap_deepmind(gym.make("Pong-v0"), frame_stack = True)

    try:
        cum_reward = 0.0
        steps = 0  # explicit counter: the loop index is one short and unbound if maxstep <= 0
        render_frames = []
        state = env.reset()

        for i in range(maxstep):
            # take action:
            action = agent(state, th = th)
            next_state, reward, ended, info = env.step(action)
            steps += 1

            cum_reward += float(reward)

            # push to replay buffer:
            memory.push(state, action, next_state, reward, ended)
            state = next_state

            # Keep only every third frame of the rendered episode.
            if render and i % 3 == 0:
                frame = torch.from_numpy(env.render(mode="rgb_array"))
                render_frames.append(frame.unsqueeze(0))
            if ended:
                break
    finally:
        if owns_env:
            env.close()

    out = {'cum_reward' : cum_reward, 'steps' : steps}
    if render and render_frames:
        out['frames'] = torch.cat(render_frames).permute(3,0,1,2).unsqueeze(0)
    return out

In [5]:
# Run configuration shared by the training cells.
param = {
    'env': 'Pong-v0',      # id handed to gym.make
    'batch_size': 16,      # transitions sampled per training step
    'GAMMA': 0.7,          # factor applied to the next-state value in the target
}

## Train model

### Agents

In [6]:
import random
def random_agent(state, th = None):
    """Pick an action uniformly at random from the global ``env``'s action space.

    ``state`` and ``th`` are accepted for signature compatibility with the other
    agents and are ignored.
    """
    n_actions = env.action_space.n
    return random.randint(0, n_actions - 1)

def dqn_epsilon_agent(state, th = 0.05):
    """Epsilon-greedy policy on top of the global ``model``.

    With probability ``th`` (approximately: whenever ``random.random() <= th``)
    a random action is sampled from the global ``env``'s action space; otherwise
    the action with the highest model output for ``state`` is returned.

    :param state: a single observation, as accepted by ``default_states_preprocessor``
    :param th: exploration threshold compared against ``random.random()``
    :return: the chosen action (a Python ``int`` on the greedy path)
    """
    if random.random() > th:
        # Action selection never needs gradients; skip building the autograd graph.
        with torch.no_grad():
            yhat = model(default_states_preprocessor(state))
        return int(yhat.argmax().item())
    return env.action_space.sample()

### Model

In [7]:
from torch import nn
import torch.nn.functional as F
from importlib import reload 
import model
from torch import optim
import numpy as np
reload(model)

<module 'model' from '/Users/simeide/Sync/edm/RL/atari/model.py'>

### Train script

In [8]:
def default_states_preprocessor(states):
    """
    Convert one state or a list of states into a float batch tensor for the model.

    A non-list argument is treated as a single state and becomes a batch of one.
    Each state is converted with ``np.asarray`` and the results are stacked along
    a new leading axis; the stacked array is then permuted with ``(0, 3, 1, 2)``
    (presumably channels-last -> channels-first; confirm against the env wrapper),
    cast to float and moved to the global ``device``.

    ``np.asarray`` replaces ``np.array(..., copy=False)``: since NumPy 2.0 the
    latter raises ``ValueError`` whenever a copy is unavoidable, which is always
    the case when a Python list of frames is assembled into one array.

    :param states: a single array-like state or a list of them (equal shapes)
    :return: float tensor on ``device`` with the batch axis first

    Adapted from https://github.com/Shmuma/ptan/blob/master/ptan/agent.py
    """

    if not isinstance(states, list):
        states = [states]

    np_states = np.stack([np.asarray(s) for s in states])
    return torch.tensor(np_states).permute(0,3,1,2).float().to(device)


def train_batch(param):
    """Run one optimisation step on a minibatch sampled from the global ``memory``.

    Uses the module-level ``memory``, ``model``, ``optimizer``, ``device`` and
    ``default_states_preprocessor``.

    :param param: configuration dict; reads ``'batch_size'`` and ``'GAMMA'``
    :return: ``0`` when the buffer holds fewer than ``batch_size`` transitions,
        otherwise the detached scalar loss tensor of this step

    Fixes relative to the previous version:
      * gradients are clamped to [-1, 1] before the optimizer step; previously
        the *weights* themselves were clamped via ``param.data``
      * the clamp loop no longer rebinds the ``param`` argument
      * rewards, actions and the terminal mask are created on ``device`` with
        explicit dtypes (float32 / long / bool)
      * a batch consisting only of terminal transitions no longer calls the
        model on an empty tensor
      * ``squeeze(1)`` keeps the batch axis even for ``batch_size == 1``
    """
    batch_size = param['batch_size']
    if len(memory) < batch_size:
        return 0

    batch = memory.sample(batch_size)
    batch_states = default_states_preprocessor([m.state for m in batch])
    batch_next_states = default_states_preprocessor([m.next_state for m in batch])
    batch_rewards = torch.tensor([float(m.reward) for m in batch],
                                 dtype=torch.float32, device=device)
    batch_actions = torch.tensor([int(m.action) for m in batch],
                                 dtype=torch.long, device=device)
    # True where the episode continued after the transition.
    not_ended_batch = torch.tensor([not bool(m.ended) for m in batch],
                                   dtype=torch.bool, device=device)

    ## Calculate expected reward:
    with torch.no_grad():
        # Terminal transitions keep a next-state value of zero.
        next_state_values = torch.zeros(batch_size, device=device)
        if not_ended_batch.any():
            reward_hat = model(batch_next_states[not_ended_batch])
            next_state_values[not_ended_batch] = reward_hat.max(1)[0]
        expected_state_action_values = next_state_values*param['GAMMA'] + batch_rewards

    # Predict value function:
    yhat = model(batch_states)
    state_action_values = yhat.gather(1, batch_actions.unsqueeze(1)).squeeze(1)

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
    optimizer.zero_grad()
    loss.backward()
    # Element-wise gradient clipping.
    for weight in model.parameters():
        if weight.grad is not None:
            weight.grad.clamp_(-1, 1)
    optimizer.step()
    return loss.detach()

In [9]:
from tensorboardX import SummaryWriter
import datetime

# Run label: "key:val" pairs of the configuration followed by the current
# timestamp cut to minute precision; it doubles as the TensorBoard log directory name.
param_summary = ", ".join("{}:{}".format(key, val) for key, val in param.items())
timestamp = str(datetime.datetime.now())[:16]
version = param_summary + " " + timestamp
writer = SummaryWriter(log_dir = "tensorboard/" + version)

In [None]:
env = wrap_deepmind(gym.make(param['env']), frame_stack = True)
# Import the class by name instead of going through the `model` variable:
# the assignment below rebinds `model` from the imported module to the network,
# so `model.DQN(...)` would fail as soon as this cell is executed a second time.
from model import DQN
model = DQN(num_actions = env.action_space.n).to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.0001) # , weight_decay = 0.001

# Warmup buffer: play a few exploratory games so the replay memory is not empty.
for _ in range(5):
    game = play_game(env, agent = dqn_epsilon_agent, th = 0.5)

step = 0  # cumulative environment steps over training games; used as the logging x-axis
loss, rewards, episode_steps = {}, {}, {}
for episode in range(10000):

    ## PLAY GAME
    game = play_game(env, agent = dqn_epsilon_agent, th = 0.5)
    rewards['run_reward'], episode_steps['run_episode_steps'] = game['cum_reward'], game['steps']
    step += episode_steps['run_episode_steps']

    ## TRAIN
    # One optimisation step per `batch_size` environment steps just played.
    for _ in range(episode_steps['run_episode_steps']//param['batch_size']):
        loss['run_loss'] = train_batch(param)


    # Test agent:
    # NOTE(review): play_game always pushes to the replay memory, so these
    # evaluation games (and the rendered ones below) also feed training data.
    if episode % 10 == 0:
        game = play_game(env, agent = dqn_epsilon_agent, th = 0.05)
        rewards['test_reward'], episode_steps['test_episode_steps'] = game['cum_reward'], game['steps']


    # REPORTING
    if episode % 5 == 0:
        writer.add_scalars("loss", tag_scalar_dict=loss, global_step= step)
        writer.add_scalars("rewards", rewards, step)
        writer.add_scalar("episode", episode, global_step = step)

    # Animate agent:
    if episode % 100 == 0:
        game = play_game(env, agent = dqn_epsilon_agent, th = 0.02, render = True)
        writer.add_video("test_game", game['frames'], global_step = step)


[MoviePy] Building file /var/folders/qq/j7fg85k93z59k9lkx9867qm4yy4xsx/T/tmp8z53m2ar.gif with imageio


100%|█████████▉| 339/340 [00:00<00:00, 347.87it/s]
