In [1]:
# https://github.com/higgsfield/RL-Adventure/

In [2]:
%matplotlib inline

In [3]:
import yaml
import datetime
import os 
from IPython.display import clear_output
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [4]:
from torch.utils.tensorboard import SummaryWriter
# %tensorboard --port=9706 --logdir ./runs
from torchsummary import summary

In [5]:
seed_value = 324267 # hard-coded seed used for every RNG below; the original note `sys.argv[1]` suggests it was meant to come from the command line (TODO confirm)

In [6]:
experiment = 'base'
# FROM CONFIG FILE
config_path = './configs/' + experiment + '.yaml' # sys.argv[2]
# Read the YAML through a context manager so the file handle is closed
# as soon as parsing finishes instead of being left to the garbage collector.
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

# Unique run name: experiment settings + seed + day/hour/minute/second stamp.
# Every piece is joined with '_' explicitly (the previous expression relied on
# implicit string-literal concatenation after 'freq' because a '+' was missing).
log_name = '_'.join([
    config['EXP_NAME'],
    str(config['HIDDEN_LAYER_WIDTH']),
    config['REPLAY_BUFFER'],
    str(config['REPLAY_BUFFER_SIZE']),
    'freq',
    str(config['TARGET_UPDATE_FREQ']),
    str(seed_value),
    datetime.datetime.now().strftime("%d%H%M%S"),
])

log_with_tensorboard = True
if log_with_tensorboard:
    # Events are written under ./runs/<MODEL_NAME>/<log_name>
    WRITER_DIR = os.path.join(os.curdir,'runs',config['MODEL_NAME'],log_name)
    writer = SummaryWriter(log_dir=WRITER_DIR)
    print("EXPERIMENT: ", experiment, "\tSEED: ", seed_value, "\nLOG PATH: ", (WRITER_DIR))

EXPERIMENT:  base 	SEED:  324267 
LOG PATH:  ./runs/DuDQN/base_256_Naive_1000_freq_100_324267_12172415


In [7]:
# Print values in configuration YAML file
# One "KEY: value" line per top-level entry of the loaded configuration.
config_lines = [param_name + ': ' + str(config[param_name]) for param_name in config]
print("\n".join(config_lines))

EXP_NAME: base
USE_GPU: True
EPSILON_START: 1.0
EPSILON_FINAL: 0.01
EPSILON_DECAY: 500
HIDDEN_LAYER_WIDTH: 256
BATCH_SIZE: 32
TIMESTEPS: 100000
REPLAY_BUFFER: Naive
REPLAY_BUFFER_SIZE: 1000
TARGET_UPDATE_FREQ: 100
GAMMA: 0.99
MODEL_NAME: DuDQN
OPTIMIZER: Adam
LEARNING_RATE: 0.001
CRITERION: MSE


In [8]:
import math
import random 
import numpy as np 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F

In [9]:
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm intent.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed Python, NumPy and PyTorch (in that order) with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed_value)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [10]:
import gym
# CartPole-v0 Environment
# Build the Gym environment by id and seed it with the same value used for
# the other RNGs in this notebook.
env_id = "CartPole-v0"
env = gym.make(env_id)
# Trailing ';' keeps the notebook from echoing the return value of env.seed().
env.seed(seed_value);

In [11]:
USE_GPU = config['USE_GPU']

# Use CUDA only when it is both available and requested in the config
USE_CUDA = torch.cuda.is_available() and USE_GPU

if USE_CUDA:
    # Seed the CUDA RNG as well when running on GPU
    torch.cuda.manual_seed(seed_value)

device = torch.device('cuda' if USE_CUDA else 'cpu')

In [12]:
# Report library version, CUDA availability and the device chosen above.
for label, value in (
    ("PYTORCH: ", torch.__version__),
    ("CUDA: ", torch.cuda.is_available()),
    ("DEVICE : ", device),
):
    print(label, value)

PYTORCH:  1.2.0
CUDA:  True
DEVICE :  cuda


In [13]:
# REPLAY BUFFER

from collections import deque

class ReplayBuffer(object):
    """Experience replay with uniform random sampling.

    ``buffer`` holds at most ``capacity`` transitions (oldest dropped first);
    ``omnibuffer`` keeps every transition ever pushed, without a size limit.
    """

    def __init__(self, capacity):
        # Bounded deque: appending to a full deque evicts the oldest entry.
        self.buffer = deque(maxlen=capacity)
        # Unbounded history of all transitions.
        self.omnibuffer = deque()

    def push(self, state, action, reward, next_state, done):
        """Record one (state, action, reward, next_state, done) transition."""
        # A leading length-1 axis is added to both states so that sampled
        # states can later be concatenated into a single batch array.
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)
        self.omnibuffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly from ``buffer``.

        Returns the stacked states, a tuple of actions, a tuple of rewards,
        the stacked next states and a tuple of done flags.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

    def __len__(self):
        """Number of transitions currently held in the bounded buffer."""
        return len(self.buffer)

class NaivePrioritizedBuffer(object):
    """Prioritized experience replay backed by a flat priority array.

    Transitions live in a ring buffer of at most ``capacity`` entries;
    ``priorities[i]`` is the priority of ``buffer[i]``. ``omnibuffer`` keeps
    every transition ever pushed, without a size limit.
    """

    def __init__(self, capacity, prob_alpha=0.6):
        self.capacity   = capacity
        self.prob_alpha = prob_alpha   # exponent applied to priorities when sampling
        self.pos        = 0            # next slot to write in the ring buffer
        self.buffer     = []
        self.omnibuffer = []           # unbounded history of all transitions
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def push(self, state, action, reward, next_state, done):
        """Store one transition, giving it the current maximum priority."""
        assert state.ndim == next_state.ndim

        # Leading length-1 axis so sampled states can be concatenated into a batch.
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )

        # Must be read before the buffer is touched: an empty buffer means
        # there is no priority yet, so 1.0 is used.
        if self.buffer:
            max_prio = self.priorities.max()
        else:
            max_prio = 1.0

        self.omnibuffer.append(transition)
        if len(self.buffer) >= self.capacity:
            # Full: overwrite the slot at the write cursor.
            self.buffer[self.pos] = transition
        else:
            self.buffer.append(transition)

        # New experiences always start at the maximum priority.
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Sample ``batch_size`` transitions with probability ~ priority**alpha.

        Returns ``(states, actions, rewards, next_states, dones, indices,
        weights)`` where ``indices`` are the buffer slots drawn and ``weights``
        are float32 importance-sampling weights scaled so the largest is 1.
        """
        stored = len(self.buffer)
        if stored == self.capacity:
            prios = self.priorities
        else:
            # Only the first ``pos`` slots have been written so far.
            prios = self.priorities[:self.pos]

        # Normalise priority**alpha into a probability distribution.
        probs  = prios ** self.prob_alpha
        probs /= probs.sum()

        # Draw slot indices (with replacement) according to ``probs``.
        indices = np.random.choice(stored, size=batch_size, p=probs)
        samples = [self.buffer[slot] for slot in indices]

        # Importance-sampling weights, normalised by their maximum.
        weights  = (stored * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights  = np.array(weights, dtype=np.float32)

        # zip(*samples) transposes the list of transitions into one tuple per
        # field: the * unpacks the list so each transition is its own argument.
        states, actions, rewards, next_states, dones = zip(*samples)

        return (np.concatenate(states), actions, rewards,
                np.concatenate(next_states), dones, indices, weights)

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priority of each listed slot with its new value."""
        for slot, priority in zip(batch_indices, batch_priorities):
            self.priorities[slot] = priority

    def __len__(self):
        """Number of transitions currently held in the ring buffer."""
        return len(self.buffer)

In [14]:
beta_start = 0.4
beta_timesteps = 1000


def beta_by_timestep(timestep_idx):
    """Linearly anneal beta from ``beta_start`` up to 1.0 over ``beta_timesteps`` steps, then hold at 1.0."""
    annealed = beta_start + timestep_idx * (1.0 - beta_start) / beta_timesteps
    return min(1.0, annealed)

In [15]:
# plt.plot([beta_by_timestep(i) for i in range(10000)])

In [16]:
class DQN(nn.Module): #base model
    """Fully connected Q-network: two hidden ReLU layers followed by a linear
    head with one output per action."""

    def __init__(self, num_inputs, num_actions, HIDDEN_LAYER_WIDTH):
        super().__init__()

        # Read by act() to draw a random action index.
        self.action_dim = num_actions

        width = HIDDEN_LAYER_WIDTH
        stack = [
            nn.Linear(num_inputs, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, num_actions),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for ``x`` (one value per action)."""
        q_values = self.layers(x)
        return q_values

In [17]:
class DuelingDQN(nn.Module):
    """Dueling Q-network.

    A shared feature layer feeds two heads: ``value`` (one scalar per state)
    and ``advantage`` (one scalar per action). They are combined as
    ``Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')``.

    Args:
        num_inputs: size of the observation vector.
        num_actions: number of discrete actions (also stored as ``action_dim``).
        HIDDEN_LAYER_WIDTH: width of every hidden layer.
    """

    def __init__(self, num_inputs, num_actions, HIDDEN_LAYER_WIDTH):
        super(DuelingDQN, self).__init__()
        
        # Read by act() to draw a random action index.
        self.action_dim = num_actions
        
        # Shared trunk
        self.feature = nn.Sequential(
            nn.Linear(num_inputs, HIDDEN_LAYER_WIDTH),
            nn.ReLU()
        )
        
        # Advantage head: one output per action
        self.advantage = nn.Sequential(
            nn.Linear(HIDDEN_LAYER_WIDTH, HIDDEN_LAYER_WIDTH),
            nn.ReLU(),
            nn.Linear(HIDDEN_LAYER_WIDTH, num_actions)
        )
        
        # Value head: a single output
        self.value = nn.Sequential(
            nn.Linear(HIDDEN_LAYER_WIDTH, HIDDEN_LAYER_WIDTH),
            nn.ReLU(),
            nn.Linear(HIDDEN_LAYER_WIDTH, 1)
        )
        
    def forward(self, x):
        """Return Q-values with the same leading shape as ``x`` and one entry per action."""
        x = self.feature(x)
        advantage = self.advantage(x)
        value     = self.value(x)
        # Centre the advantages per sample, i.e. over the action axis only.
        # A bare ``advantage.mean()`` would average over the whole batch and
        # make each sample's Q-values depend on the other samples in the batch.
        return value + advantage - advantage.mean(dim=-1, keepdim=True)
    

In [18]:
def act(policy_net, device, state, epsilon):
    """Pick an action epsilon-greedily.

    A uniform random number is drawn first; if it exceeds ``epsilon`` the
    action with the largest output of ``policy_net`` for ``state`` is returned,
    otherwise a random index below ``policy_net.action_dim``. The network is
    put in eval mode for the call and switched to train mode before returning.
    """
    policy_net.eval()
    with torch.no_grad():
        greedy = random.random() > epsilon
        if not greedy:
            action = random.randrange(policy_net.action_dim)
        else:
            # Add a batch axis of size 1 before feeding the network.
            batch = torch.FloatTensor(state).unsqueeze(dim=0).to(device)
            best_index = policy_net(batch).max(dim=1)[1]
            action = best_index.item()
    policy_net.train()
    return action

In [19]:
# e-greedy exploration

epsilon_start = config['EPSILON_START']
epsilon_final = config['EPSILON_FINAL']
epsilon_decay = config['EPSILON_DECAY']


def epsilon_by_timestep(timestep_idx):
    """Exponentially decay epsilon from ``epsilon_start`` towards ``epsilon_final`` with time constant ``epsilon_decay``."""
    decay_factor = math.exp(-1. * timestep_idx / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay_factor

In [20]:
# plt.plot([epsilon_by_timestep(i) for i in range(10000)])

In [21]:
# MODEL
# Dueling variants use DuelingDQN; every other name (including unknown ones,
# which default to D1QN) uses the plain DQN architecture.
model_name = config['MODEL_NAME']
net_class = DuelingDQN if model_name in ('DuDQN', 'DuD2QN') else DQN
net_args = (env.observation_space.shape[0],
            env.action_space.n,
            config['HIDDEN_LAYER_WIDTH'])

policy_net = net_class(*net_args)
policy_net = policy_net.to(device)

# D1QN (and the default) estimate Q-values with a single network; the other
# variants additionally keep a target network of the same architecture.
if model_name in ('DQN', 'D2QN', 'DuDQN', 'DuD2QN'):
    target_net = net_class(*net_args)
    target_net = target_net.to(device)


# print(policy_net)
# summary(policy_net, 
#         input_size=(env.observation_space.shape[0],),
#         batch_size=config['BATCH_SIZE'], 
#         device='cuda' if USE_CUDA else 'cpu' )

In [22]:
# OPTIMIZER
# 'SGD' selects SGD; 'Adam' and any unrecognised value select Adam.
optimizer_class = optim.SGD if config['OPTIMIZER'] == 'SGD' else optim.Adam
optimizer = optimizer_class(policy_net.parameters(),
                            lr=config['LEARNING_RATE'])

In [23]:
# CRITERION
# 'HUBER' selects SmoothL1Loss; 'MSE' and any unrecognised value select MSELoss.
criterion = nn.SmoothL1Loss() if config['CRITERION'] == 'HUBER' else nn.MSELoss()

In [24]:
# REPLAY BUFFER
# 'NaivePER' selects the prioritized buffer; 'Naive' and any unrecognised
# value select the uniform-sampling buffer.
buffer_class = NaivePrioritizedBuffer if config['REPLAY_BUFFER'] == 'NaivePER' else ReplayBuffer
replay_buffer = buffer_class(capacity=config['REPLAY_BUFFER_SIZE'])

In [25]:
def update_target(policy_net, target_net):
    """Copy all parameters/buffers of ``policy_net`` into ``target_net`` and put the target in eval mode."""
    current_weights = policy_net.state_dict()
    target_net.load_state_dict(current_weights)
    target_net.eval()
    
def compute_td_loss(batch_size, beta=1.0):
    """Sample a batch from ``replay_buffer`` and take one optimizer step on the TD loss.

    Relies on the notebook globals ``config``, ``replay_buffer``, ``policy_net``,
    ``target_net`` (for the target-network variants), ``optimizer``, ``gamma``
    and ``device``.

    Args:
        batch_size: number of transitions to sample.
        beta: importance-sampling exponent; only passed on when the replay
            buffer is 'NaivePER'.

    Returns:
        The scalar loss tensor (mean over the batch) that was back-propagated.
    """
    use_per = config['REPLAY_BUFFER'] == 'NaivePER'
    model_name = config['MODEL_NAME']

    if use_per:
        state, action, reward, next_state, done, indices, weights = replay_buffer.sample(batch_size, beta)
    else:
        state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Move the batch to tensors on the training device.
    state      = torch.FloatTensor(np.float32(state)).to(device)
    next_state = torch.FloatTensor(np.float32(next_state)).to(device)
    action     = torch.LongTensor(action).to(device)
    reward     = torch.FloatTensor(reward).to(device)
    done       = torch.FloatTensor(done).to(device)
    if use_per:
        weights = torch.FloatTensor(weights).to(device)

    # Q(s, a) for the actions that were actually taken.
    policy_net.train()
    taken = action.unsqueeze(dim=1)
    q_value = policy_net(state).gather(dim=1, index=taken).squeeze(dim=1)

    # Bootstrap value of the next state; no gradients flow through it.
    with torch.no_grad():
        if model_name in ('DQN', 'DuDQN'):
            # Max over actions of the target network's estimate.
            target_net.eval()
            next_q_value = target_net(next_state).max(dim=1)[0]

        elif model_name in ('D2QN', 'DuD2QN'):
            # Double DQN: action chosen greedily by the policy network,
            # evaluated by the target network.
            target_net.eval()
            greedy_action = policy_net(next_state).max(dim=1)[1]
            next_q_target_values = target_net(next_state)
            next_q_value = next_q_target_values.gather(
                dim=1, index=greedy_action.unsqueeze(dim=1)).squeeze(dim=1)

        else:
            # 'D1QN' and any unrecognised name: bootstrap from the policy network itself.
            next_q_value = policy_net(next_state).max(dim=1)[0]

    # Terminal transitions (done == 1) contribute only their reward.
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    # Per-sample loss (no reduction yet); 'HUBER' -> smooth L1, otherwise MSE.
    loss_fn = F.smooth_l1_loss if config['CRITERION'] == 'HUBER' else F.mse_loss
    loss = loss_fn(q_value, expected_q_value.detach(), reduction='none')

    if use_per:
        # Importance-sampling correction; the weighted per-sample loss
        # (plus a small constant) becomes the new priority.
        loss  = loss * weights
        prios = loss + 1e-5

    loss = loss.mean()
    optimizer.zero_grad()
    loss.backward()
    if use_per:
        replay_buffer.update_priorities(indices, prios.detach().cpu().numpy())
    optimizer.step()

    return loss

In [26]:
# Variants that keep a target network start with it synchronised to the policy network.
if config['MODEL_NAME'] in ('DQN', 'D2QN', 'DuDQN', 'DuD2QN'):
    update_target(policy_net, target_net)

In [27]:
# Training
num_timesteps = config['TIMESTEPS']
batch_size = config['BATCH_SIZE']
gamma      = config['GAMMA']   # read as a global by compute_td_loss

# Settings that do not change while training runs.
has_target_net = config['MODEL_NAME'] in ('DQN', 'D2QN', 'DuDQN', 'DuD2QN')
buffer_kind = config['REPLAY_BUFFER']

episode_len = 0
state = env.reset()
reward = 0
for timestep_idx in range(1, num_timesteps + 1):
    # Epsilon-greedy action for the current state.
    epsilon = epsilon_by_timestep(timestep_idx)
    action = act(policy_net, device, state, epsilon)

    next_state, x_reward, done, _ = env.step(action)
    # The environment's own reward is only used to accumulate the episode length.
    episode_len += x_reward

    # Stored reward is sparse: 1 only on the final step of an episode whose
    # accumulated length reached 195, otherwise 0.
    reward = 1 if (done and episode_len >= 195) else 0
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state

    if done:
        if log_with_tensorboard:
            writer.add_scalar('episode_len', episode_len, global_step=timestep_idx)
        state = env.reset()
        episode_len = 0

    # Start learning once the buffer holds more than one batch.
    if len(replay_buffer) > batch_size:

        if buffer_kind == 'NaivePER':
            beta = beta_by_timestep(timestep_idx)
            loss = compute_td_loss(batch_size, beta)

        elif buffer_kind == 'Naive':
            beta = 1
            loss = compute_td_loss(batch_size, beta)

        else:
            loss = compute_td_loss(batch_size)

        if log_with_tensorboard:
            writer.add_scalar('loss', loss.item(), global_step=timestep_idx)

#     if log_with_tensorboard:
#         if timestep_idx % 1000 == 0:
#             for name, param in policy_net.named_parameters():
#                 if param.requires_grad:
#                     writer.add_histogram('policy_net_'+ name, param.data, global_step=timestep_idx)

#                 if (config['MODEL_NAME']=='DQN' or 
#                     config['MODEL_NAME']=='D2QN' or 
#                     config['MODEL_NAME']=='DuDQN' or
#                     config['MODEL_NAME']=='DuD2QN'):
#                     for name, param in target_net.named_parameters():
#                         if param.requires_grad:
#                             writer.add_histogram('target_net_'+ name, param.data, global_step=timestep_idx)        

    # Periodically copy the policy weights into the target network.
    if has_target_net and timestep_idx % config['TARGET_UPDATE_FREQ'] == 0:
        update_target(policy_net, target_net)

In [28]:
import pickle

# Save Memories
# Every transition ever pushed (replay_buffer.omnibuffer) is pickled to
# ./memories/<MODEL_NAME>/<log_name>.mpk
MEM_DIR = os.path.join(os.curdir,'memories',config['MODEL_NAME'])
# exist_ok=True creates the directory if needed without the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(MEM_DIR, exist_ok=True)
MEM_FILE = os.path.join(MEM_DIR, log_name + '.mpk')
with open(MEM_FILE, 'wb') as fpw:
    pickle.dump(replay_buffer.omnibuffer, fpw)

# # Load Memories
# NOTE: pickle.load can execute arbitrary code; only load files you wrote yourself.
# with open(MEM_FILE, 'rb') as fpr:
#     memories = pickle.load(fpr)
            
# Save Learned Model Parameters
# The policy network's state_dict is written to ./models/<MODEL_NAME>/<log_name>.pth
MODEL_DIR = os.path.join(os.curdir,'models',config['MODEL_NAME'])
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_FILE = os.path.join(MODEL_DIR, log_name + '.pth')
torch.save(policy_net.state_dict(), MODEL_FILE)

# # Load Learned Model Parameters
# model = TheModelClass(*args, **kwargs)
# model.load_state_dict(torch.load(MODEL_FILE))
# model.eval()

# Flush events to Tensorboard
if log_with_tensorboard:
    writer.flush()
    writer.close()