In [1]:
# -------------------------------- #
# 1. Import the Necessary Packages #
# -------------------------------- #
#import gym
from unityagents import UnityEnvironment
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

#from ddpg_agent import Agent

In [2]:
# ------------------------------ #
# 2. Instantiate the Environment #
# ------------------------------ #
# environment with 1 Agent
env1 = UnityEnvironment(file_name='./Reacher_1.app')
# get the default brain for env1
brain1_name = env1.brain_names[0]
brain1 = env1.brains[brain1_name]

# environment with 20 Agent
#env2 = UnityEnvironment(file_name='./Reacher_20.app')
# get the default brain for env20
#brain20_name = env20.brain_names[0]
#brain20 = env20.brains[brain20_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# -------------------------------------- #
# 3. Examine the State and Action Spaces #
# -------------------------------------- #
# reset the environment
env1_info = env1.reset(train_mode=True)[brain1_name]

# number of agents
num_agent1 = len(env1_info.agents)
print('Number of agent:', num_agent1)

# size of each action
action_size1 = brain1.vector_action_space_size
print('Size of each action:', action_size1)

# examine the state space 
states1 = env1_info.vector_observations
state_size1 = states1.shape[1]
print('There are {} agent. It observes a state with length: {}'.format(states1.shape[0], state_size1))
print('The state for the first agent looks like:', states1[0])

Number of agent: 1
Size of each action: 4
There are 1 agent. It observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [12]:
agent1 = env1_info.agents

In [10]:
def ddpg_train(env, brain_name,
               n_episodes=1000, max_t=300, print_every=100):
    """Train Agent(s) with Deep Deterministic Policy Gradients.
    
    Params
    ======
        env: the environment to train agent(s)
        brain_name: the default brain name of the environment
        n_episodes (int): total episodes
        max_t (int): maximum time steps for each episode
        print_every (int): episode interval to print the scores outcome
    """
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    n_agents = len(env_info.agents)
    n_scores_deque = [deque(maxlen=print_every) for i in range(len)]
    n_scores = []

    action_size = env.brains[brain_name].vector_action_space_size
    
    for i_episode in range(1, n_episodes+1):
        # get the current state (for each agent)
        states = env_info.vector_observations
        # initialize the score (for each agent)
        scores = np.zeros(num_agents)
        for t in range(max_t):
            # select an action (for each agent)
            actions = np.random.randn(len(env_info.agents), action_size)
            # send all actions to the environment
            env_info = env.step(actions)[brain_name]
            # get next state (for each agent)
            next_states = env_info.vector_observations
            # get reward (for each agent)
            rewards = env_info.rewards
            # see if episode finished
            dones = env_info.done
            # update the score (for each agent)
            scores += env_info.rewards
            # roll over states to next time step
            states = next_states
            # exit loop if episode finished
            if done:
                break 
        # reset the environment 
        env_info = env.reset(train_mode=False)[brain_name]
        # reset all agents 
        agents.reset()
                
        n_scores_deque.append(scores)
        n_scores.append(scores)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        torch.save(agents.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agents.critic_local.state_dict(), 'checkpoint_critic.pth')
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        
    return scores

In [11]:
scores1 = ddpg(env=env1, agent=agent1,
               n_episodes=100, max_t=20, print_every=10)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

TypeError: expected np.ndarray (got dict)

**Report Catalog**

- 1. Learning Algorithm
    - 1.1 Basic Concepts
    - 1.2 Policy Gradient Methods
    - 1.3 PPO
    - 1.4 A3C
    - 1.5 DDPG
    - 1.6 D4PG
- 2. Plot of Rewards
- 3. Ideas of Future Work


# 1. Learning Algorithm
Try Policy-Based method from simple to complex.

## 1.1 Basic Concepts 
Define the __policy neural network__.
In Policy-Based Methods, the agent use a neural network to aproximate either a stochastic or a deterministic policy.
Here're some basic concepts of Policy:

### On-Policy & Off-Policy

- __On-Policy: π(collecting experience) = π(trained)__
    
    The policy π(collecting experience) used for interacting with the environment is ALSO the policy π(trained).


- __Off-Policy: π(collecting experience) ≠ π(trained)__

    The policy π(collecting experience) used for interacting with the environment is NOT the policy π(trained). 
        - π(collecting experience) is ε-Greedy policy 
        - while π(trained)is the optimal policy.
    
### Stochastic Policy & Deterministic Policy

- __Stochastic Policy__ wants to learn the Probability Distribution over the actions.
- __Deterministic Policy__ beleives that the best action every single time when we inquire the Actor (Policy) nerual network. Thus it always outputs the best believed action for any given state.

### Value-Based and Policy-Based

- __Value-Based Methods__: the agent uses its experience with the environment to maintain an estimate of the optimal action-value function. The optimal policy is then obtained from the optimal action-value function estimate.
```
class Value_Based_Network():
    def __init__():
        ...    
    def forward():
        ...
```
- __Policy-Based Methods__: directly learn the optimal policy, without having to maintain a separate value function estimate.
```
class Policy_Based_Network():
    def __init__():
        ...   
    def forward():
        ...    
    def act():
        ...
```

## 1.2 Policy Gradient Methods

### 1.2 .1 Hill Climbing 
Pseudocode
<img src="CC_imgs/HillClimbing.png">

NOTE: __Episode Return G__ vs. __Expected Return J__

Due to randomness in the environment(and the policy, if it is stochastic), it is highly like that if we collect a second episode with the same values for θ, we'll likely get a different value for the return G.
The (sampled) return **G** is not a perfect but good enough __estimate for the expected return J__.

### 1.2.2 REINFORCE
<img src="./readme_imgs/REINFORCE.png">

## 1.3 [PPO](https://arxiv.org/pdf/1707.06347.pdf)(Proximal Policy Optimization Algorithms)

### 1.3.1 The Surrogate Function

The __importance sampling__ below tells us we can use old trajectories for computing averages for new policy, as long as we add this extra re-weighting factor, that takes into account how under or over–represented each trajectory is under the new policy compared to the old one.
<img src="./readme_imgs/importance_sampling.png">

Expanding the __re-weighting factor__:
<img src="./readme_imgs/re-weighting_factor.png">

The approximate form of the gradient, we can think of it as the gradient of a new object, called the __surrogate function__
<img src="./readme_imgs/surrogate_function.png">
So using this new gradient, we can perform gradient ascent to update our policy -- which can be thought as directly maximize the surrogate function.

### 1.3.2 Clipping Policy Updates

implement the PPO algorithm as well, and the scalar function is given by
$\frac{1}{T}\sum^T_t \min\left\{R_{t}^{\rm future}\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)},R_{t}^{\rm future}{\rm clip}_{\epsilon}\!\left(\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)}\right)\right\}$

the ${\rm clip}_\epsilon$ function is implemented in pytorch as ```torch.clamp(ratio, 1-epsilon, 1+epsilon)```

### 1.3.3 PPO Summary

PPO(Proximal Policy Optimization) algorithm:
<img src="./readme_imgs/PPO_summary.png">

## 1.4 [A3C](https://arxiv.org/pdf/1602.01783.pdf)(Asynchronous Advantage Actor-critic)


## 1.5 DDPG(Deep Deterministic Policy Gradient)

## 1.6 [D4PG](https://openreview.net/pdf?id=SyZipzbCb)(Distributed Distributional Deterministic Policy)

[Reference](https://github.com/ShangtongZhang/DeepRL)

# 2. Plot of Rewards
A plot of rewards per episode is included to illustrate that either:

- __version 1__ the agent receives an average reward (over 100 episodes) of at least +30, or
![1Agent_Plot]<img src="./readme_imgs/1Agent_Plot.png">

- __version 2__ the agent is able to receive an average reward (over 100 episodes, and over all 20 agents) of at least +30.
![20Agents_Plot]<img src="./readme_imgs/20Agents_Plot.png">


# 3. Ideas of Future Work

In [10]:
env.close()