## 1. Setting up the environment

In [1]:
import numpy as np
import gym

#pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
# setting manual seed
torch.manual_seed(0)

from unityagents import UnityEnvironment

#matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# imports for rendering outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
env = UnityEnvironment(file_name='unity_envs/Crawler_Linux_NoVis/Crawler.x86_64')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


In [3]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 12
Size of each action: 20
There are 12 agents. Each observes a state with length: 129
The state for the first agent looks like: [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.25000000e+00
  1.00000000e+00  0.00000000e+00  1.78813934e-07  0.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  6.06093168e-01 -1.42857209e-01 -6.06078804e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.33339906e+00 -1.42857209e-01
 -1.33341408e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -6.0609

In [5]:
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents) # initialize the score (for each agent)
step=0
while True:
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    step+=1
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: 0.5538510533903415


## 2. Defining the policy

In [6]:
# defining the device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print ("using",device)

using cuda:0


In [7]:
state_size = state_size
action_size = action_size

# define actor critic network
class ActorCritic(nn.Module):
    
    def __init__(self,state_size,action_size,action_high,action_low,hidden_size=32):
        super(ActorCritic, self).__init__()
        
        # action range
        self.action_high = torch.tensor(action_high).to(device)
        self.action_low = torch.tensor(action_low).to(device)
        
        self.std = nn.Parameter(torch.zeros(action_size))
        
        # common network
        self.fc1 = nn.Linear(state_size,1024)
        
        # actor network
        self.fc2_actor = nn.Linear(1024,256)
        self.fc3_action = nn.Linear(256,action_size)
        #self.fc3_std = nn.Linear(64,action_size)
        
        # critic network
        self.fc2_critic = nn.Linear(1024,256)
        self.fc3_critic = nn.Linear(256,1)
    
    def forward(self,state):
        # common network
        x = F.relu(self.fc1(state))
        
        # actor network
        x_actor = F.relu(self.fc2_actor(x))
        action_mean = F.sigmoid(self.fc3_action(x_actor))
        ## rescale action mean
        action_mean_ = (self.action_high-self.action_low)*action_mean + self.action_low
        #action_std = F.sigmoid(self.fc3_std(x_actor))
        
        # critic network
        x_critic = F.relu(self.fc2_critic(x))
        v = self.fc3_critic(x_critic)
        return action_mean_,v
    
    def act(self,state):
        # converting state from numpy array to pytorch tensor on the "device"
        state = torch.from_numpy(state).float().to(device)
        action_mean,v = self.forward(state)
        prob_dist = Normal(action_mean,F.softplus(self.std))
        action = prob_dist.sample()
        log_prob = prob_dist.log_prob(action)
        return action.cpu().numpy(),torch.sum(log_prob,dim=1),v.squeeze()

## 3. Defining the RL agent

In [8]:
from collections import deque
from itertools import accumulate

def compute_future_rewards(rewards,gamma):
    future_rewards = np.zeros_like(rewards)
    discounted_rewards = np.zeros(rewards.shape[0])
    
    for time_step in range(future_rewards.shape[1]-1,-1,-1):
        future_rewards[:,time_step] = rewards[:,time_step] + gamma*discounted_rewards
        discounted_rewards = future_rewards[:,time_step]
    return future_rewards

class Agent:
    
    def __init__(self,env,learning_rate=1e-3):
        self.env = env
        nS = state_size
        nA = action_size
        action_low = -1
        action_high = 1
        self.policy = ActorCritic(state_size=nS,hidden_size=128,action_size=nA,
                             action_low=action_low,action_high=action_high).to(device)
        self.optimizer = optim.RMSprop(self.policy.parameters(), lr=learning_rate)
        
    def train(self,max_opt_steps=1000,num_trajectories=12,horizon=1000,gamma=.99,target_score= -250,
              PRINT_EVERY=100):
        # store eps scores
        scores = []
        scores_window = deque(maxlen=100)
        
        for opt_step in range(1,max_opt_steps+1):
            rewards = np.zeros([num_trajectories,horizon])
            log_probs = torch.zeros([num_trajectories,horizon],dtype=torch.double,device=device)
            value_estimate = torch.zeros([num_trajectories,horizon],dtype=torch.double,device=device)
            
            for traj_count in range(1):
                # reset state
                env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
                state = env_info.vector_observations                   # get the current state (for each agent)
            
                # play an episode
                for t in range(horizon): 
                    action,log_prob,v = self.policy.act(state)
                    env_info = env.step(action)[brain_name]           # send all actions to tne environment
                    next_state = env_info.vector_observations         # get next state (for each agent)
                    reward = env_info.rewards                         # get reward (for each agent)
                    done = env_info.local_done                        # see if episode finished

                    # update state
                    state = next_state
                    log_probs[:,t] = log_prob
                    rewards[:,t] = reward
                    value_estimate[:,t] = v
                    
                    # break if done
                    if np.any(done):
                        break
            
            # compute advantage estimate to reduce variance
            future_rewards = compute_future_rewards(rewards,gamma)
            future_rewards = torch.from_numpy(future_rewards).double().to(device)
            # b = future_rewards.mean(axis=0)
            # A = (future_rewards - b)/future_rewards.std(axis=0)
            # A = torch.from_numpy(A).double().to(device)
            
            A = future_rewards-value_estimate
            
            # compute loss and applying gradient
            actor_loss = torch.sum(-log_probs*A)/(num_trajectories*horizon)
            
            undiscounted_future_rewards = compute_future_rewards(rewards,gamma=1.0)
            undiscounted_future_rewards = torch.from_numpy(undiscounted_future_rewards).double().to(device)
            critic_loss = torch.sum((undiscounted_future_rewards-value_estimate)**2)/(num_trajectories*horizon)
            
            # total loss
            loss = actor_loss + critic_loss
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            trajectory_total_rewards = rewards.sum(axis=1)
            # update scores and score_window
            scores.append(np.mean(trajectory_total_rewards))
            scores_window.append(np.mean(trajectory_total_rewards))
            
            #printing progress
            if opt_step % PRINT_EVERY == 0:
                print ("Episode: {}\t Avg reward: {:.2f}\t std: {}".format(opt_step,np.mean(scores_window),
                                                                             self.policy.std))
                # save the policy
                torch.save(agent.policy, 'REINFORCE-crawler.policy')
            
            if np.mean(scores_window)>= target_score:
                print ("Environment solved in {} optimization steps! ... Avg reward : {:.2f}".format(opt_step-100,
                                                                                          np.mean(scores_window)))
                # save the policy
                torch.save(agent.policy, 'REINFORCE-crawler.policy')
                break
                
        return scores

## 4. Training the agent!

In [9]:
# lets define and train our agent
agent = Agent(env=env,learning_rate=1e-4)

In [10]:
scores = agent.train(max_opt_steps=20000,horizon=400,gamma=0.98,target_score=500,PRINT_EVERY=100)



Episode: 100	 Avg reward: 2.87	 std: Parameter containing:
tensor([ 0.0035,  0.0037,  0.0024,  0.0014,  0.0063,  0.0022,  0.0012, -0.0014,
         0.0030,  0.0040, -0.0046, -0.0009, -0.0005,  0.0017, -0.0031, -0.0005,
         0.0011, -0.0023, -0.0025, -0.0059], device='cuda:0',
       requires_grad=True)


  "type " + obj.__name__ + ". It won't be checked "


Episode: 200	 Avg reward: 3.04	 std: Parameter containing:
tensor([ 0.0035,  0.0031,  0.0037,  0.0017,  0.0070,  0.0027,  0.0004, -0.0005,
         0.0036,  0.0042, -0.0047,  0.0003, -0.0006,  0.0013, -0.0025, -0.0003,
         0.0002, -0.0015, -0.0042, -0.0072], device='cuda:0',
       requires_grad=True)
Episode: 300	 Avg reward: 2.88	 std: Parameter containing:
tensor([ 3.4038e-03,  2.7820e-03,  5.2124e-03,  1.2722e-03,  6.7563e-03,
         3.3875e-03,  1.4979e-03, -6.0926e-04,  1.5758e-03,  4.4547e-03,
        -5.0728e-03,  9.8704e-04, -2.6833e-04,  2.3593e-03, -1.4206e-03,
        -1.1209e-03,  5.1197e-05, -8.4204e-04, -4.9734e-03, -7.4102e-03],
       device='cuda:0', requires_grad=True)
Episode: 400	 Avg reward: 3.22	 std: Parameter containing:
tensor([ 4.7329e-03,  2.0595e-03,  2.6435e-03,  2.0144e-03,  6.1986e-03,
         3.1612e-03,  1.1746e-03,  3.2273e-04,  2.9159e-03,  2.7203e-03,
        -5.2430e-03,  2.0272e-03, -3.7592e-04,  1.0826e-03, -1.8982e-05,
        -4.2974e-0

Episode: 2600	 Avg reward: 3.43	 std: Parameter containing:
tensor([-0.0052,  0.0096,  0.0065, -0.0037, -0.0014,  0.0068, -0.0037,  0.0117,
         0.0077, -0.0031, -0.0109,  0.0070,  0.0085,  0.0012, -0.0030,  0.0037,
        -0.0072,  0.0004, -0.0077, -0.0058], device='cuda:0',
       requires_grad=True)
Episode: 2700	 Avg reward: 3.72	 std: Parameter containing:
tensor([-0.0055,  0.0086,  0.0053, -0.0045, -0.0010,  0.0074, -0.0029,  0.0103,
         0.0084, -0.0022, -0.0113,  0.0075,  0.0089,  0.0011, -0.0037,  0.0042,
        -0.0060,  0.0029, -0.0081, -0.0039], device='cuda:0',
       requires_grad=True)
Episode: 2800	 Avg reward: 4.07	 std: Parameter containing:
tensor([-0.0050,  0.0058,  0.0054, -0.0046,  0.0004,  0.0081, -0.0034,  0.0099,
         0.0079, -0.0020, -0.0094,  0.0063,  0.0091,  0.0003, -0.0041,  0.0038,
        -0.0060,  0.0038, -0.0078, -0.0042], device='cuda:0',
       requires_grad=True)
Episode: 2900	 Avg reward: 4.01	 std: Parameter containing:
tensor([-0.00

Episode: 5300	 Avg reward: 4.21	 std: Parameter containing:
tensor([-0.0047,  0.0041, -0.0087,  0.0035,  0.0125,  0.0138, -0.0117,  0.0079,
         0.0119, -0.0011, -0.0120,  0.0058,  0.0177, -0.0003, -0.0005,  0.0060,
        -0.0117,  0.0005, -0.0169, -0.0097], device='cuda:0',
       requires_grad=True)
Episode: 5400	 Avg reward: 3.73	 std: Parameter containing:
tensor([-0.0047,  0.0063, -0.0088,  0.0046,  0.0114,  0.0151, -0.0115,  0.0081,
         0.0122, -0.0022, -0.0120,  0.0080,  0.0185,  0.0013, -0.0017,  0.0085,
        -0.0108,  0.0015, -0.0188, -0.0101], device='cuda:0',
       requires_grad=True)
Episode: 5500	 Avg reward: 5.13	 std: Parameter containing:
tensor([-0.0028,  0.0051, -0.0087,  0.0047,  0.0128,  0.0143, -0.0113,  0.0067,
         0.0118,  0.0003, -0.0108,  0.0083,  0.0160,  0.0022, -0.0027,  0.0075,
        -0.0093,  0.0025, -0.0169, -0.0112], device='cuda:0',
       requires_grad=True)
Episode: 5600	 Avg reward: 5.41	 std: Parameter containing:
tensor([-0.00

Episode: 7900	 Avg reward: 3.61	 std: Parameter containing:
tensor([-0.0020, -0.0105, -0.0054,  0.0133,  0.0087,  0.0130, -0.0135,  0.0081,
         0.0087, -0.0017, -0.0184,  0.0124,  0.0190,  0.0080, -0.0097,  0.0118,
        -0.0055,  0.0022, -0.0203, -0.0082], device='cuda:0',
       requires_grad=True)
Episode: 8000	 Avg reward: 1.94	 std: Parameter containing:
tensor([-0.0025, -0.0108, -0.0060,  0.0142,  0.0092,  0.0150, -0.0138,  0.0068,
         0.0097, -0.0014, -0.0174,  0.0121,  0.0188,  0.0081, -0.0095,  0.0117,
        -0.0066,  0.0027, -0.0216, -0.0087], device='cuda:0',
       requires_grad=True)
Episode: 8100	 Avg reward: 2.30	 std: Parameter containing:
tensor([-0.0023, -0.0118, -0.0065,  0.0129,  0.0109,  0.0176, -0.0126,  0.0071,
         0.0085, -0.0032, -0.0175,  0.0114,  0.0181,  0.0074, -0.0094,  0.0109,
        -0.0072,  0.0025, -0.0213, -0.0106], device='cuda:0',
       requires_grad=True)


KeyboardInterrupt: 

In [None]:
# plot reward curve over episodes
plt.figure()
plt.plot(scores)
plt.xlabel('Episode #')
plt.ylabel('Total Reward')
plt.show()

In [None]:
scores = agent.train(max_opt_steps=20000,horizon=200,gamma=0.98,target_score=-200,PRINT_EVERY=100)

In [None]:
# plot reward curve over episodes
plt.figure()
plt.plot(scores)
plt.xlabel('Episode #')
plt.ylabel('Total Reward')
plt.show()

In [None]:
scores = agent.train(max_opt_steps=20000,horizon=200,gamma=0.98,target_score=-180,PRINT_EVERY=100)

In [None]:
# plot reward curve over episodes
plt.figure()
plt.plot(scores)
plt.xlabel('Episode #')
plt.ylabel('Total Reward')
plt.show()

In [None]:
scores = agent.train(max_opt_steps=20000,horizon=200,gamma=0.98,target_score=-150,PRINT_EVERY=100)

In [None]:
# plot reward curve over episodes
plt.figure()
plt.plot(scores)
plt.xlabel('Episode #')
plt.ylabel('Total Reward')
plt.show()

In [None]:
scores = agent.train(max_opt_steps=20000,horizon=200,gamma=0.98,target_score=-150,PRINT_EVERY=100)

## 5. Watch the smart agent!

In [None]:
# uncomment this cell to load the trained policy for Pendulum-v0
# load policy
policy =  torch.load('REINFORCE-Pendulum.policy',map_location='cpu')
agent = Agent(env_name='Pendulum-v0')
agent.policy = policy

In [None]:
# function to animate a list of frames
def animate_frames(frames):
    plt.figure(dpi = 72)
    plt.axis('off')

    # color option for plotting
    # use Greys for greyscale
    cmap = None if len(frames[0].shape)==3 else 'Greys'
    patch = plt.imshow(frames[0], cmap=cmap)  

    fanim = animation.FuncAnimation(plt.gcf(), \
        lambda x: patch.set_data(frames[x]), frames = len(frames), interval=30)
    
    display(display_animation(fanim, default_mode='once'))

In [None]:
frames = []
total_reward = 0
state = env.reset()
value = []
r = []
for t in range(2000):
    action, _,v = agent.policy.act(state[np.newaxis,:])
    #frames.append(env.render(mode='rgb_array')) 
    next_state, reward, done, _ = env.step(action[0])
    value.append(v.squeeze())
    r.append(reward)
    state=next_state
    total_reward+= reward
    if done:
        break

print ("Total reward:",total_reward)
env.close()
#animate_frames(frames)

In [None]:
r_ = compute_future_rewards(np.array(r)[np.newaxis,:],gamma=1.0)
plt.plot(r_[0])
plt.plot(value)

In [None]:
agent.policy.std