In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
import random
import math
from torch.utils.tensorboard import SummaryWriter
from collections import deque, namedtuple
import time
import gym
import logging

def weight_init(layers):
    for layer in layers:
        torch.nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')

In [36]:
import numpy as np
import gym
from gym import spaces
import random


class BitFlip(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
    }

    def __init__(self, bit_length=16, max_steps=None, mean_zero=False):
        super(BitFlip, self).__init__()
        if bit_length < 1:
            raise ValueError('bit_length must be >= 1, found {}'.format(bit_length))
        self.bit_length = bit_length
        self.mean_zero = mean_zero

        if max_steps is None:
            # default to bit_length
            self.max_steps = bit_length
        elif max_steps == 0:
            self.max_steps = None
        else:
            self.max_steps = max_steps

        # spaces documentation: https://gym.openai.com/docs/
        self.action_space = spaces.Discrete(bit_length)
        self.observation_space = spaces.Dict({
            'state': spaces.Box(low=0, high=1, shape=(bit_length, )),
            'goal': spaces.Box(low=0, high=1, shape=(bit_length, )),
        })

        self._reset()

    def _terminate(self):
        return (self.state == self.goal).all() or self.steps >= self.max_steps

    def _reward(self):
        return -1 if (self.state != self.goal).any() else 0

    def _step(self, action):
        # action is an int in the range [0, self.bit_length)
        self.state[action] = int(not self.state[action])
        self.steps += 1

        return (self._get_obs(), self._reward(), self._terminate(), {})

    def _reset(self):
        self.steps = 0

        self.state = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])

        # make sure goal is not the initial state
        self.goal = self.state
        while (self.goal == self.state).all():
            self.goal = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])

        return self._get_obs()

    def _mean_zero(self, x):
        if self.mean_zero:
            return (x - 0.5) / 0.5
        else:
            return x


    def _get_obs(self):
        return {
            'state': self._mean_zero(self.state),
            'goal': self._mean_zero(self.goal),
        }

    def _render(self, mode='human', close=False):
        pass

In [37]:
class DDQN(nn.Module):
    def __init__(self, state_size, action_size,layer_size, n_step, seed, layer_type="ff"):
        super(DDQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size

        self.head_1 = nn.Linear(self.input_shape[0]*2, layer_size)
        self.ff_1 = nn.Linear(layer_size, layer_size)
        self.ff_2 = nn.Linear(layer_size, action_size)
        weight_init([self.head_1, self.ff_1])
    
    def forward(self, input):
        """
        
        """
        x = torch.relu(self.head_1(input))
        x = torch.relu(self.ff_1(x))
        out = self.ff_2(x)
        
        return out

In [44]:
class HER_ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size, device, seed, gamma, n_step=1):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.device = device
        self.HER_samples = 4
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.current_episode = []
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.n_step = n_step
        self.n_step_buffer = deque(maxlen=self.n_step)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        
        self.current_episode.append([state,action,reward,next_state, done])
        if done == 1:
            # AUGMENT SAMPLES WITH HER
            for idx, exp in enumerate(self.current_episode):
                state, action, reward, next_state, done = exp
                # Save experience in replay memory 
                self.n_step_buffer.append((state, action, reward, next_state, done))
                if len(self.n_step_buffer) == self.n_step:
                    nstep_s, nstep_a, nstep_r, nstep_ns, nstep_d = self.calc_multistep_return(self.n_step_buffer)
                    e = self.experience(nstep_s, nstep_a, nstep_r, nstep_ns, nstep_d)
                    self.memory.append(e)
                
                # sample additional goals for HER
                new_goals = self.sample_goals(idx,self.HER_samples)
                
                for new_goal in new_goals:     
                    # calculate reward for the new goal state 
                    r_ = self.reward_function(state[:BIT_LENGTH], new_goal)
                    # concatenate state and next_state with new goal_state
                    next_state = np.concatenate((next_state[:BIT_LENGTH],new_goal))
                    state = np.concatenate((state[:BIT_LENGTH], new_goal))

                    if (next_state[:BIT_LENGTH] == new_goal).all():
                        d = 1
                    else:
                        d = 0
                    # add new goal with nstep bootstrapping
                    if len(self.n_step_buffer) == self.n_step:
                        previous_steps = list(self.n_step_buffer)[:-1]
                        previous_steps.append((state, action, r_, next_state, d))
                        nstep_s, nstep_a, nstep_r, nstep_ns, nstep_d = self.calc_multistep_return(previous_steps)
                        e = self.experience(nstep_s, nstep_a, nstep_r, nstep_ns, nstep_d)
                        self.memory.append(e)
            
            self.current_episode = []
        
        
    
    def reward_function(self, state, goal):
        return 0 if (state == goal).all() else -1
                
    def sample_goals(self,idx, n):
        new_goals = []
        for _ in range(n):
            transition = random.choice(self.current_episode[idx:])
            new_goal = transition[0][:BIT_LENGTH]
            new_goals.append(new_goal)
        return new_goals
    
    def calc_multistep_return(self, n_step_buffer):
        Return = 0
        for idx in range(self.n_step):
            Return += self.gamma**idx * n_step_buffer[idx][2]
        
        return n_step_buffer[0][0], n_step_buffer[0][1], Return, n_step_buffer[-1][3], n_step_buffer[-1][4]
        
    
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.stack([e.state for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.stack([e.next_state for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
  
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [45]:
class DQN_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 Network,
                 layer_size,
                 n_step,
                 BATCH_SIZE,
                 BUFFER_SIZE,
                 LR,
                 TAU,
                 GAMMA,
                 UPDATE_EVERY,
                 device,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            Network (str): dqn network type
            layer_size (int): size of the hidden layer
            BATCH_SIZE (int): size of the training batch
            BUFFER_SIZE (int): size of the replay memory
            LR (float): learning rate
            TAU (float): tau for soft updating the network weights
            GAMMA (float): discount factor
            UPDATE_EVERY (int): update frequency
            device (str): device that is used for the compute
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.TAU = TAU
        self.GAMMA = GAMMA
        self.UPDATE_EVERY = UPDATE_EVERY
        self.BATCH_SIZE = BATCH_SIZE
        self.Q_updates = 0
        self.n_step = n_step
        self.current_episode = []
        self.action_step = 4
        self.last_action = None

        # Q-Network
        self.qnetwork_local = DDQN(state_size, action_size,layer_size, n_step, seed).to(device)
        self.qnetwork_target = DDQN(state_size, action_size,layer_size, n_step, seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        print(self.qnetwork_local)
        
        # Replay memory
        self.memory = HER_ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device, seed, self.GAMMA, n_step)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done, writer, current_goal):

        # Save experience in replay memory 
        self.memory.add(state, action, reward, next_state, done)

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            loss = self.learn(experiences)
            self.Q_updates += 1
            writer.add_scalar("Q_loss", loss, self.Q_updates)

    
    def reward_function(self, state, goal):
        return 0 if (state == goal).all() else -1
                
    def sample_goals(self,idx, n):
        new_goals = []
        for _ in range(n):
            transition = random.choice(self.current_episode[idx:])
            new_goal = transition[0][:BIT_LENGTH]
            new_goals.append(new_goal)
        return new_goals
                
                
    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy. Acting only every 4 frames!
        
        Params
        ======
            frame: to adjust epsilon
            state (array_like): current state
            
        """
        state = np.array(state)
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps: # select greedy action if random number is higher than epsilon or noisy network is used!
            action = np.argmax(action_values.cpu().data.numpy())
            self.last_action = action
            return action
        else:
            action = random.choice(np.arange(self.action_size))
            self.last_action = action 
            return action



    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        self.optimizer.zero_grad()
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (self.GAMMA**self.n_step * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets) #mse_loss
        # Minimize the loss
        loss.backward()
        clip_grad_norm_(self.qnetwork_local.parameters(),1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)
        return loss.detach().cpu().numpy()            

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.TAU*local_param.data + (1.0-self.TAU)*target_param.data)


In [50]:
def run(frames=1000, eps_fixed=False, eps_frames=1e6, min_eps=0.01):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    output_history = []
    frame = 0
    if eps_fixed:
        eps = 0
    else:
        eps = 1
    eps_start = 1
    i_episode = 1
    dic = env._reset()
    state = dic["state"]
    goal_state = dic["goal"]
    state = np.concatenate((state, goal_state))
    score = 0                  
    for frame in range(1, frames+1):

        action = agent.act(state, eps)
        SandG, reward, done, _ = env._step(action)
        next_state = SandG["state"]
        next_state = np.concatenate((next_state, goal_state))
        
        agent.step(state, action, reward, next_state, done, writer, SandG["goal"])
        state = next_state
        score += reward
        
        
        if eps_fixed == False:
            if frame < eps_frames:
                eps = max(eps_start - (frame*(1/eps_frames)), min_eps)
            else:
                eps = max(min_eps - min_eps*((frame-eps_frames)/(frames-eps_frames)), 0.001)
        if done:
            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            writer.add_scalar("Epsilon", eps, i_episode)
            writer.add_scalar("Reward", score, i_episode)
            writer.add_scalar("Average100", np.mean(scores_window), i_episode)
            output_history.append(np.mean(scores_window))
            print('\rEpisode {}\tFrame {} \tAverage Score: {:.2f}'.format(i_episode, frame, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tFrame {}\tAverage Score: {:.2f}'.format(i_episode,frame, np.mean(scores_window)))
            i_episode +=1 
            dic = env._reset()
            state = dic["state"]
            goal_state = dic["goal"]
            state = np.concatenate((state, goal_state))
            score = 0              

    return output_history


if __name__ == "__main__":
    
    writer = SummaryWriter("runs/"+"BF_HER_4_")
    
    FORMAT = "%(levelname)s %(asctime)s - %(message)s"
    logging.basicConfig(filename="Her_Log.log", level=logging.DEBUG,format=FORMAT,filemode="w")
    logger = logging.getLogger()
    
    seed = 3
    BUFFER_SIZE = 100000
    BATCH_SIZE = 128
    GAMMA = 0.98
    TAU = 1e-2
    LR = 1e-3
    UPDATE_EVERY = 1
    n_step = 1
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using ", device)

    BIT_LENGTH = 25
    np.random.seed(seed)
    env = BitFlip(bit_length=BIT_LENGTH)

    env.seed(seed)
    action_size     = env.action_space.n
    state_size = env.observation_space["state"].shape

    agent = DQN_Agent(state_size=state_size,    
                        action_size=action_size,
                        Network="DDQN",
                        layer_size=256,
                        n_step=n_step,
                        BATCH_SIZE=BATCH_SIZE, 
                        BUFFER_SIZE=BUFFER_SIZE, 
                        LR=LR, 
                        TAU=TAU, 
                        GAMMA=GAMMA, 
                        UPDATE_EVERY=UPDATE_EVERY, 
                        device=device, 
                        seed=seed)



    # set epsilon frames to 0 so no epsilon exploration
    eps_fixed = False

    t0 = time.time()
    final_average100 = run(frames = 35000, eps_fixed=eps_fixed, eps_frames=8000, min_eps=0.025)
    t1 = time.time()
    
    print("Training time: {}min".format(round((t1-t0)/60,2)))
    torch.save(agent.qnetwork_local.state_dict(), "BF_DQN_HER"+".pth")


Using  cpu
DDQN(
  (head_1): Linear(in_features=50, out_features=256, bias=True)
  (ff_1): Linear(in_features=256, out_features=256, bias=True)
  (ff_2): Linear(in_features=256, out_features=25, bias=True)
)
Episode 100	Frame 2500	Average Score: -25.000
Episode 200	Frame 5000	Average Score: -25.000
Episode 300	Frame 7500	Average Score: -25.000
Episode 400	Frame 10000	Average Score: -25.000
Episode 500	Frame 12500	Average Score: -25.000
Episode 600	Frame 15000	Average Score: -25.000
Episode 700	Frame 17500	Average Score: -25.000
Episode 800	Frame 20000	Average Score: -25.000
Episode 900	Frame 22483	Average Score: -24.811
Episode 1000	Frame 24938	Average Score: -24.522
Episode 1100	Frame 27403	Average Score: -24.622
Episode 1200	Frame 29861	Average Score: -24.555
Episode 1300	Frame 32227	Average Score: -23.544
Episode 1400	Frame 34542	Average Score: -22.988
Episode 1422	Frame 34999 	Average Score: -22.35Training time: 3.99min
