In [3]:
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
from pyvirtualdisplay import Display
from IPython import display as disp

In [None]:
# Start a virtual display (visible=0) before the environment is created.
display = Display(visible=0, size=(600, 600))
display.start()

# Every network and tensor in this notebook is placed on this device.
device = torch.device("cpu")

In [None]:
# Reporting cadence.
plot_interval = 10   # redraw the reward plot every N episodes
video_every = 100    # rendering video is slow, so only record every N episodes


def _should_record(ep_id):
    """Return True for the episodes whose video should be recorded."""
    return ep_id % video_every == 0


# To attempt the harder task (only once BipedalWalker-v3 is solved), swap the
# environment id below for "BipedalWalkerHardcore-v3".
env = gym.wrappers.Monitor(
    gym.make("BipedalWalker-v3"),
    "./video",
    video_callable=_should_record,
    force=True,
)

In [None]:
# Sizes of the observation/action vectors and the action bound, read from the
# environment's spaces. The first entry of `high` is used as the bound for
# every action component.
_observation_space = env.observation_space
_action_space = env.action_space

obs_dim = _observation_space.shape[0]
act_dim = _action_space.shape[0]
max_action = float(_action_space.high[0])

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: maps a batch of states to bounded actions.

    Three fully connected layers; the two hidden layers use ReLU and the
    output is squashed by tanh and scaled by ``max_action``, so every action
    component lies in ``[-max_action, max_action]``.
    """

    def __init__(self, state_dim, action_dim, max_action, net_width=256):
        super().__init__()
        self.max_action = max_action

        # Layers are created in this order so seeded initialisation and
        # state_dict keys (l1, l2, l3) stay stable.
        self.l1 = nn.Linear(state_dim, net_width)
        self.l2 = nn.Linear(net_width, net_width)
        self.l3 = nn.Linear(net_width, action_dim)

    def forward(self, state):
        """Return the action for each row of ``state``."""
        hidden = state
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return squashed * self.max_action

In [None]:
class Critic(nn.Module):
    """Pair of independent Q-networks that score (state, action) batches.

    Layers ``l1``-``l3`` form the first Q-function and ``l4``-``l6`` the
    second. Both take the state and action concatenated along dimension 1
    and return one value per row.
    """

    def __init__(self, state_dim, action_dim, net_width=256):
        super().__init__()
        input_dim = state_dim + action_dim

        # First Q-function.
        self.l1 = nn.Linear(input_dim, net_width)
        self.l2 = nn.Linear(net_width, net_width)
        self.l3 = nn.Linear(net_width, 1)

        # Second Q-function.
        self.l4 = nn.Linear(input_dim, net_width)
        self.l5 = nn.Linear(net_width, net_width)
        self.l6 = nn.Linear(net_width, 1)

    def Q1(self, state, action):
        """Return only the first Q-function's estimate."""
        joint = torch.cat((state, action), dim=1)
        hidden = F.relu(self.l2(F.relu(self.l1(joint))))
        return self.l3(hidden)

    def forward(self, state, action):
        """Return the estimates of both Q-functions as ``(q1, q2)``."""
        joint = torch.cat((state, action), dim=1)

        first_hidden = F.relu(self.l2(F.relu(self.l1(joint))))
        first_q = self.l3(first_hidden)

        second_hidden = F.relu(self.l5(F.relu(self.l4(joint))))
        second_q = self.l6(second_hidden)

        return first_q, second_q

In [None]:
class ReplayBuffer(object):
    '''
    Fixed-capacity FIFO store of (state, action, reward, next_state, done)
    transitions with uniform random sampling (with replacement).

    Based on: https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
    '''
    def __init__(self, max_size=1e6):
        '''
        max_size: maximum number of transitions kept. Accepts floats such as
            1e6; the value is truncated to an int because it is used to
            compute a list index.
        Raises ValueError if the resulting capacity is smaller than 1.
        '''
        capacity = int(max_size)
        if capacity < 1:
            raise ValueError("max_size must be at least 1")
        self._storage = []
        self._maxsize = capacity
        self._next_idx = 0

    def __len__(self):
        '''Number of transitions currently stored.'''
        return len(self._storage)

    def add(self, state, action, reward, next_state, done):
        '''Store one transition, overwriting the oldest one once full.'''
        data = (state, action, reward, next_state, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        # _maxsize is an int, so the write cursor stays a valid list index
        # after it wraps around.
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def sample(self, batch_size):
        '''
        Draw batch_size transitions uniformly at random, with replacement.

        Returns a tuple of arrays (states, actions, rewards, next_states,
        dones); rewards and dones are reshaped to column vectors.
        Raises ValueError if the buffer is empty.
        '''
        if not self._storage:
            raise ValueError("cannot sample from an empty replay buffer")

        last = len(self._storage) - 1
        idxes = [random.randint(0, last) for _ in range(batch_size)]

        states, actions, rewards, next_states, dones = [], [], [], [], []
        columns = (states, actions, rewards, next_states, dones)
        for i in idxes:
            # np.asarray avoids a copy when possible and, unlike
            # np.array(..., copy=False) on NumPy 2, still accepts Python
            # scalars such as the reward and done flag.
            for column, value in zip(columns, self._storage[i]):
                column.append(np.asarray(value))

        return (
            np.array(states),
            np.array(actions),
            np.array(rewards).reshape(-1, 1),
            np.array(next_states),
            np.array(dones).reshape(-1, 1),
        )


In [None]:

# Hyperparameters read by Agent.train.
batch_size = 100      # transitions sampled per gradient step
gamma = 0.99          # discount factor
tau = 0.005           # interpolation factor for the target-network soft update

policy_noise = 0.2    # std of the noise added to the target action
noise_clip = 0.5      # that noise is clamped to [-noise_clip, noise_clip]
policy_freq = 2       # period of the actor / target-network update

In [None]:
class Agent():
    """TD3-style agent: one actor and a twin critic, each with a target copy.

    Network sizes, the action bound and the device come from the module-level
    ``obs_dim``, ``act_dim``, ``max_action`` and ``device``. Training reads the
    module-level hyperparameters ``batch_size``, ``gamma``, ``tau``,
    ``policy_noise``, ``noise_clip`` and ``policy_freq``.
    """

    def __init__(self):
        self.actor = Actor(obs_dim, act_dim, max_action, 256).to(device)
        self.actor_target = Actor(obs_dim, act_dim, max_action, 256).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.001)

        self.critic = Critic(obs_dim, act_dim, 256).to(device)
        self.critic_target = Critic(obs_dim, act_dim, 256).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=0.001)

        self.max_action = max_action

        # Critic updates performed so far, across all train() calls. The
        # delayed actor update is keyed on this rather than on the per-call
        # loop index, so calling train() one iteration at a time still
        # updates the actor only every `policy_freq` critic updates.
        self.total_it = 0

    def select_action(self, state):
        """Return the actor's (noise-free) action for one state as a flat numpy array."""
        with torch.no_grad():
            state = torch.FloatTensor(state.reshape(1, -1)).to(device)
            return self.actor(state).cpu().numpy().flatten()

    @staticmethod
    def _soft_update(source, target):
        """Move ``target``'s parameters a fraction ``tau`` towards ``source``'s."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, replay_buffer, iterations=1):
        """Run ``iterations`` gradient steps on batches sampled from ``replay_buffer``.

        Every step updates the critic. Every ``policy_freq``-th step also
        updates the actor and soft-updates both target networks.
        """
        for _ in range(iterations):
            self.total_it += 1

            state, action, reward, next_state, done = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(state).to(device)
            action = torch.FloatTensor(action).to(device)
            reward = torch.FloatTensor(reward).to(device)
            next_state = torch.FloatTensor(next_state).to(device)
            done = torch.FloatTensor(done).to(device)

            # The bootstrap target is a constant for the critic loss, so it is
            # built without tracking gradients.
            with torch.no_grad():
                # Target policy smoothing: clipped noise on the target action.
                noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
                next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

                # Take the smaller of the two target critics' estimates.
                target_q1, target_q2 = self.critic_target(next_state, next_action)
                target_q = reward + (1.0 - done) * gamma * torch.min(target_q1, target_q2)

            current_q1, current_q2 = self.critic(state, action)
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            if self.total_it % policy_freq == 0:
                # Actor ascends the first critic's value of its own actions.
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Soft-update both targets. The actor pair must be
                # (actor, actor_target); pairing the actor with itself leaves
                # actor_target frozen at its initial weights.
                self._soft_update(self.critic, self.critic_target)
                self._soft_update(self.actor, self.actor_target)




In [None]:
# in the submission please use seed 42 for verification
seed = 42
# Seed every source of randomness used below, in a fixed order.
for _seed_fn in (torch.manual_seed, env.seed, random.seed, np.random.seed, env.action_space.seed):
    _seed_fn(seed)

# logging variables
ep_reward = 0       # running return of the current episode
reward_list = []    # episode returns since the last plot update
plot_data = []      # [episode, mean return, std of return] per plot update
log_f = open("agent-log.txt","w+")

# training length
max_episodes = 1000
max_timesteps = 2000

# initialise agent (after seeding, so network initialisation is reproducible)
agent = Agent()
replay_buffer = ReplayBuffer()

In [None]:
# Main training loop: one environment episode per iteration.
# train for longer for BP-Hardcore (~1500 episodes?)
for episode in range(1, max_episodes+1):
    state = env.reset()
    for t in range(max_timesteps):
        if episode < 20:
            # Warm-up: fill the replay buffer with uniformly random actions.
            action = env.action_space.sample()
        else:
            action = agent.select_action(state)
            # add noise to action
            action = (action + np.random.normal(0, max_action * 0.1, size=act_dim)).clip(-max_action, max_action)

        next_state, reward, done, _ = env.step(action)
        # The logged episode reward uses the environment's unmodified reward.
        ep_reward += reward

        # decrease fall down reward
        if reward == -100:
            reward = -1
        # store data in replay buffer
        replay_buffer.add(state, action, reward, next_state, done)
        state = next_state

        # One gradient step per environment step once a full batch is stored.
        # The iteration count is passed explicitly because Agent.train
        # requires it.
        if len(replay_buffer._storage) >= batch_size:
            agent.train(replay_buffer, 1)

        if done or t == max_timesteps - 1:
            # Extra training pass after a fall or a low-scoring episode.
            if reward == -1 or ep_reward <= 230:
                agent.train(replay_buffer, t)
            # End the episode here; stepping an environment that has already
            # reported done is invalid and the Monitor wrapper rejects it.
            break

    # append the episode reward to the reward list
    reward_list.append(ep_reward)

    # do NOT change this logging code - it is used for automated marking!
    log_f.write('episode: {}, reward: {}\n'.format(episode, ep_reward))
    log_f.flush()
    ep_reward = 0
    
    # print reward data every so often - add a graph like this in your report
    if episode % plot_interval == 0:
        plot_data.append([episode, np.array(reward_list).mean(), np.array(reward_list).std()])
        reward_list = []
        # plt.rcParams['figure.dpi'] = 100
        plt.plot([x[0] for x in plot_data], [x[1] for x in plot_data], '-', color='tab:grey')
        plt.fill_between([x[0] for x in plot_data], [x[1]-x[2] for x in plot_data], [x[1]+x[2] for x in plot_data], alpha=0.2, color='tab:grey')
        plt.xlabel('Episode number')
        plt.ylabel('Episode reward')
        plt.show()
        disp.clear_output(wait=True)
      