In [22]:
import random
import datetime
import os

import numpy as np

import torch
from torch import nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter


In [23]:
# Seed every random number generator this notebook draws from (NumPy,
# Python's `random`, PyTorch CPU and PyTorch CUDA) with one shared value.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Enable the cuDNN auto-tuner.
cudnn.benchmark = True

In [24]:
class Episode_Experience():
    """Ordered list of the transitions collected during one episode.

    Each entry of `memory` is a 6-tuple
    `(state, action, reward, next_state, done, goal)`.
    """

    def __init__(self):
        self.memory = list()

    def add(self, state, action, reward, next_state, done, goal):
        """Append one transition to the end of the episode."""
        transition = (state, action, reward, next_state, done, goal)
        self.memory.append(transition)

    def clear(self):
        """Start a new, empty episode.

        A fresh list is bound instead of emptying the old one in place, so
        references to the previous list keep their contents.
        """
        self.memory = list()

In [25]:
class BitFlip():
    """Bit-flipping environment: flip single bits of `state` until it equals `goal`.

    Attributes:
        bit_length: number of bits in the state and in the goal.
        max_steps: stored for callers; no method of this class consults it,
            so the environment itself never ends an episode on a step budget.
        steps: number of `_step` calls since the last `_reset`.
        state: current bit vector.
        goal: target bit vector, guaranteed to differ from the initial state.
    """

    def __init__(self, bit_length=50, max_steps=50):
        """Create the environment and draw the first state/goal pair.

        Raises:
            ValueError: if `bit_length` is smaller than 1.
        """
        self.bit_length = bit_length
        self.max_steps = max_steps
        self._reset()

    def _terminate(self):
        """Return 1 when the state matches the goal, otherwise 0."""
        return 1 if np.array_equal(self.state, self.goal) else 0

    def _reward(self):
        """Return 0 when the state matches the goal, otherwise -1."""
        return 0 if np.array_equal(self.state, self.goal) else -1

    def _step(self, action):
        """Flip the bit at index `action`.

        A new array is bound to `self.state` on every flip, so arrays
        returned by earlier calls are never modified. Entries that are
        neither 0 nor 1 are left untouched.

        Returns:
            Tuple `(state, reward, done)` evaluated after the flip.
        """
        self.steps += 1

        # One-hot float vector; adding/subtracting it promotes the state to
        # a float array after the first flip.
        change = np.zeros(len(self.state))
        change[action] = 1
        bit = self.state[action]
        if bit == 0:
            self.state = self.state + change
        elif bit == 1:
            self.state = self.state - change

        return self.state, self._reward(), self._terminate()

    def _reset(self):
        """Start a new episode with a random state and a different random goal.

        Returns:
            Tuple `(state, goal)`.

        Raises:
            ValueError: if `bit_length` is smaller than 1. Two empty bit
                vectors are always equal, so a goal that differs from the
                state could never be drawn and the loop below would not end.
        """
        if self.bit_length < 1:
            raise ValueError(
                'bit_length must be at least 1, got {}'.format(self.bit_length))

        self.steps = 0
        self.state = np.random.randint(2, size=(self.bit_length))

        # Redraw the goal until it differs from the initial state.
        self.goal = self.state
        while np.array_equal(self.state, self.goal):
            self.goal = np.random.randint(2, size=(self.bit_length))

        return self.state, self.goal


In [26]:
class DQN(nn.Module):
    """Fully connected Q-network with one ReLU hidden layer.

    Both linear layers use Kaiming-normal weights and zero biases.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Layers are created and initialised in this order on purpose: each
        # step draws from the torch RNG, so the order fixes the weights
        # obtained for a given seed.
        self.hidden = nn.Linear(input_size, hidden_size)
        nn.init.kaiming_normal_(self.hidden.weight)
        nn.init.zeros_(self.hidden.bias)

        self.output = nn.Linear(hidden_size, output_size)
        nn.init.kaiming_normal_(self.output.weight)
        nn.init.zeros_(self.output.bias)

    def forward(self, x):
        """Return the output layer applied to the ReLU-activated hidden layer."""
        activated = F.relu(self.hidden(x))
        return self.output(activated)

In [27]:
class Agent():
    """Goal-conditioned DQN agent with a uniform replay buffer.

    The online network (`DQN_eval`) is trained by `replay`; the target
    network (`DQN_target`) is refreshed and checkpointed by
    `update_target_net`. Both networks take the concatenation of a state
    and a goal as input and output one value per action.
    """

    def __init__(self, state_size, action_size, goal_size, model_out_path, clip_target_value=True):
        """Set hyperparameters, open the TensorBoard writer and build the networks.

        Args:
            state_size: length of a state vector.
            action_size: number of discrete actions (network output size).
            goal_size: length of a goal vector.
            model_out_path: directory that receives `tensorboard` logs and
                `checkpoint/` files.
            clip_target_value: when true, Q-values are clamped to
                `[-1 / (1 - gamma), 0]` in `replay` once `epoch` reaches
                `clip_start_epoch`.
        """
        self.state_size = state_size
        self.goal_size = goal_size
        self.action_size = action_size
        self.clip_target_value = clip_target_value
        self.memory = []

        # Hyperparameters for the agent and HER; the original author states
        # they follow Appendix A of https://arxiv.org/pdf/1707.01495.pdf
        self.epsilon = 0.2  # exploration probability
        self.epsilon_min = 0.05  # floor applied when epsilon is decayed
        self.epsilon_decay = 0.001  # amount subtracted per decayed update
        # Weight of the ONLINE network in the EMA target update.
        # NOTE(review): with 0.95 the target moves 95% of the way towards the
        # online weights on every update; confirm this direction is intended.
        self.tau = 0.95
        self.gamma = 0.98
        self.batch_size = 512
        self.learning_rate = 1e-3
        self.buffer_size = int(1e6)
        self.hidden_size = 256
        self.clip_start_epoch = 16  # first epoch at which Q clipping is applied

        self.model_out_path = model_out_path
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.criterion = None
        self.optimizer = None
        self.writer = SummaryWriter(self.model_out_path + '/tensorboard')
        self.build_model()

    def build_model(self):
        """Create the online/target networks, the loss and the optimizer.

        The target network starts as an exact copy of the online network;
        only the online network's parameters are handed to the optimizer.
        """
        print('     Building double DQN')

        input_size = self.state_size + self.goal_size
        self.DQN_eval = DQN(input_size=input_size, hidden_size=self.hidden_size, output_size=self.action_size).to(self.device).train()
        self.DQN_target = DQN(input_size=input_size, hidden_size=self.hidden_size, output_size=self.action_size).to(self.device).eval()
        self.DQN_target.load_state_dict(self.DQN_eval.state_dict())

        self.criterion = nn.MSELoss().to(self.device)
        self.optimizer = optim.Adam(self.DQN_eval.parameters(), lr=self.learning_rate)

    def choose_action(self, state, goal):
        """Pick an action epsilon-greedily for the given state/goal pair.

        Returns:
            A uniformly random action index with probability `epsilon`,
            otherwise the index of the largest online-network output.
        """
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)

        net_input = torch.as_tensor(
            np.concatenate((state, goal)), dtype=torch.float32).to(self.device)
        # Pure inference: no autograd graph is needed for action selection.
        with torch.no_grad():
            act_values = self.DQN_eval(net_input)
        return torch.argmax(act_values).item()

    def remember(self, ep_experience):
        """Append an episode's transitions, keeping only the newest `buffer_size`."""
        print('     Storing experience')
        self.memory += ep_experience.memory
        if len(self.memory) > self.buffer_size:
            self.memory = self.memory[-self.buffer_size:]  # drop the oldest transitions

    def replay(self, epoch, optimization_steps):
        """Run `optimization_steps` minibatch updates of the online network.

        Logs the mean loss and the learning rate to TensorBoard under
        `epoch`.

        Returns:
            The mean MSE loss over the optimisation steps, or 0 when no
            update was performed (fewer than `batch_size` stored transitions,
            or `optimization_steps` not positive).
        """
        print('\n===> Replaying experience')

        if len(self.memory) < self.batch_size:  # not enough transitions yet
            print('\n===> data in memory is too small')
            return 0
        if optimization_steps <= 0:  # nothing to do; also avoids dividing by zero below
            return 0

        clip_now = self.clip_target_value and epoch >= self.clip_start_epoch
        q_floor = -1 / (1 - self.gamma) if clip_now else None

        total_loss = 0.0
        for _ in range(optimization_steps):
            minibatch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, next_states, dones, goals = zip(*minibatch)

            # The reshape calls double as shape checks. Goals are sized with
            # goal_size, which may differ from state_size.
            states = self._float_batch(states).reshape(self.batch_size, self.state_size)
            next_states = self._float_batch(next_states).reshape(self.batch_size, self.state_size)
            goals = self._float_batch(goals).reshape(self.batch_size, self.goal_size)
            rewards = self._float_batch(rewards)
            dones = self._float_batch(dones)
            actions = torch.as_tensor(
                np.asarray(actions, dtype=np.int64)).to(self.device).unsqueeze(1)

            # Q_eval(s_t, a_t)
            q_eval = self.DQN_eval(torch.cat((states, goals), dim=1))
            pred = q_eval.gather(index=actions, dim=1)
            if clip_now:
                # NOTE(review): clamping the prediction gives zero gradient
                # for out-of-range values; confirm this is intended in
                # addition to clamping the target.
                pred = torch.clamp(pred, min=q_floor, max=0)
            pred = pred.squeeze(1)

            # r + gamma * (1 - done) * max_a Q_target(s_{t+1}, a). The target
            # is a constant for this update, so no graph is built for it and
            # no gradients accumulate on the target network.
            with torch.no_grad():
                next_q = self.DQN_target(torch.cat((next_states, goals), dim=1))
                next_q = torch.max(next_q, dim=1)[0]
                target = rewards + self.gamma * (1 - dones) * next_q
                if clip_now:
                    target = torch.clamp(target, min=q_floor, max=0)

            # train the online network
            self.optimizer.zero_grad()
            mse_loss = self.criterion(pred, target)
            mse_loss.backward()
            self.optimizer.step()
            total_loss += mse_loss.item()

        mean_loss = total_loss / optimization_steps
        self.writer.add_scalar(tag="loss", scalar_value=mean_loss, global_step=epoch)
        self.writer.add_scalar(tag="lr", scalar_value=self.optimizer.param_groups[0]['lr'], global_step=epoch)
        return mean_loss

    def _float_batch(self, values):
        """Stack a sequence of per-transition values into a float32 tensor on `self.device`."""
        return torch.as_tensor(np.asarray(values, dtype=np.float32)).to(self.device)

    def update_target_net(self, epoch, mode='RESET', decay=True):
        """Refresh the target network, save a checkpoint and optionally decay epsilon.

        Args:
            epoch: used in the checkpoint file name and stored inside it.
            mode: 'EMA' blends `tau * online + (1 - tau) * target`; 'RESET'
                copies the online weights. Any other value leaves the target
                network unchanged (the checkpoint is still written).
            decay: when true, subtract `epsilon_decay` from `epsilon`, never
                going below `epsilon_min`.
        """
        if mode == 'EMA':
            print('\n===> Updating target DQN by EMA')
            eval_state = self.DQN_eval.state_dict()
            new_dict = {
                key: self.tau * eval_state[key] + (1 - self.tau) * target_value
                for key, target_value in self.DQN_target.state_dict().items()
            }
            self.DQN_target.load_state_dict(new_dict)
        elif mode == 'RESET':
            print('\n===> Updating target DQN by RESET')
            self.DQN_target.load_state_dict(self.DQN_eval.state_dict())

        print('\n===> Saving target DQN')
        checkpoint = {
            'epoch': epoch,
            'net_state_dict': self.DQN_target.state_dict(),
            'opt_state_dict': self.optimizer.state_dict(),
        }
        checkpoint_out_path = self.model_out_path + '/checkpoint/'
        # Do not rely on the caller having created the directory beforehand.
        os.makedirs(checkpoint_out_path, exist_ok=True)
        torch.save(checkpoint, checkpoint_out_path + str(epoch) + '_checkpoint.pkl')

        if decay:
            self.epsilon = max(self.epsilon - self.epsilon_decay, self.epsilon_min)

In [28]:
if __name__ == "__main__":

    # Output layout for this run: ./result/<timestamp>/{checkpoint/, tensorboard}
    current_path = os.getcwd()
    result_path = current_path + '/result/'
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    model_out_path = result_path + now
    checkpoints_out_path = model_out_path + '/checkpoint/'
    writer_out_path = model_out_path + '/tensorboard'
    # Parents come first so each single-level mkdir can succeed.
    for directory in (result_path, model_out_path, checkpoints_out_path, writer_out_path):
        if not os.path.exists(directory):
            os.mkdir(directory)

    # hyperparameters for the training loop and HER
    n = 50                     # bit length; also used as state/action/goal size
    num_epochs = 200
    num_cycles = 50
    num_episodes = 16          # episodes collected before each training round
    num_explorations = 10000   # upper bound on environment steps per episode
    optimization_steps = 40    # minibatch updates per training round
    K = 1                      # relabelled copies stored per real transition

    ep_experience = Episode_Experience()
    ep_experience_her = Episode_Experience()
    env = BitFlip(bit_length=n, max_steps=n)
    agent = Agent(state_size=n, action_size=n, goal_size=n, model_out_path=model_out_path, clip_target_value=True)

    losses = []
    success_rate = []

    total_epochs = num_epochs * num_cycles
    for epoch in range(1, total_epochs + 1):
        print(f'\n===> Epoch {epoch} starts')
        successes, distances = 0, 0

        # step 1: collect transitions
        for episode in range(1, num_episodes + 1):
            print(f'\n===> Episode {episode} starts')
            state, goal = env._reset()
            done = 0

            # step 1.1: roll out the current policy against the real goal
            print('     Getting the original dataset')
            for _ in range(num_explorations):
                action = agent.choose_action(state, goal)
                next_state, reward, done = env._step(action)
                # Number of mismatching bits, accumulated as a progress metric.
                distances += np.abs(next_state - goal).sum()
                ep_experience.add(state, action, reward, next_state, done, goal)
                state = next_state
                if np.array_equal(state, goal):
                    successes += 1
                    break

            # step 1.2: relabel each transition with its own next_state as the
            # goal (the "next" strategy; the "future" strategy from HER would
            # pick the next_state of a random later transition instead)
            print('     Getting the HER dataset')
            for her_state, her_action, _, her_next_state, _, _ in ep_experience.memory:
                for _ in range(K):
                    her_goal = her_next_state
                    her_done = int(np.array_equal(her_next_state, her_goal))
                    her_reward = 0 if her_done else -1
                    ep_experience_her.add(her_state, her_action, her_reward, her_next_state, her_done, her_goal)

            # step 1.3: move both datasets into the agent's replay buffer
            print('     Transferring the datasets to the agent')
            agent.remember(ep_experience)
            agent.remember(ep_experience_her)
            ep_experience.clear()
            ep_experience_her.clear()

        # step 2: train the online DQN
        agent.replay(epoch=epoch, optimization_steps=optimization_steps)

        # step 3: update and checkpoint the target DQN
        agent.update_target_net(epoch=epoch, mode='EMA', decay=False)

        # step 4: log per-epoch metrics
        agent.writer.add_scalar(tag="distance", scalar_value=distances / num_episodes / num_explorations, global_step=epoch)
        agent.writer.add_scalar(tag="success rate", scalar_value=successes / num_episodes, global_step=epoch)
        agent.writer.add_scalar(tag="epsilon", scalar_value=agent.epsilon, global_step=epoch)

     Building double DQN

===> Epoch 1 starts

===> Episode 1 starts
     Getting the original dataset
     Getting the HER dataset
     Transferring the datasets to the agent
     Storing experience
     Storing experience

===> Episode 2 starts
     Getting the original dataset
     Getting the HER dataset
     Transferring the datasets to the agent
     Storing experience
     Storing experience

===> Episode 3 starts
     Getting the original dataset
     Getting the HER dataset
     Transferring the datasets to the agent
     Storing experience
     Storing experience

===> Episode 4 starts
     Getting the original dataset
     Getting the HER dataset
     Transferring the datasets to the agent
     Storing experience
     Storing experience

===> Episode 5 starts
     Getting the original dataset
     Getting the HER dataset
     Transferring the datasets to the agent
     Storing experience
     Storing experience

===> Episode 6 starts
     Getting the original dataset
     Ge