# Deep Deterministic Policy Gradients (DDPG)
---
In this notebook, we train DDPG with OpenAI Gym's BipedalWalker-v2 environment.

### 1. Import the Necessary Packages

In [14]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from ddpg_agent import Agent

# imports for rendering outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

### 2. Instantiate the Environment and Agent

In [15]:
# Create the BipedalWalker environment and seed it.
env = gym.make('BipedalWalker-v2')
env.seed(10)

# Size the agent from the environment's observation and action vectors.
agent = Agent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.shape[0],
    random_seed=10,
)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [16]:
# Show the action and observation spaces exposed by the environment.
print(
    "action space: {} ... state space: {}".format(
        env.action_space, env.observation_space
    )
)

action space: Box(4,) ... state space: Box(24,)


In [17]:
# Play one episode with randomly sampled actions, logging every transition.
# Only the first four entries of the state vector are printed.
state = env.reset()
done = False
step = 0
while not done:
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    print(
        "step: {} ... state: {} ... action: {} ... reward: {:.2f} ... done: {}\n".format(
            step, state[:4], action, reward, done
        )
    )
    state = next_state
    step += 1

step: 0 ... state: [ 2.74737482e-03 -1.79915968e-05  1.39956169e-03 -1.59998775e-02] ... action: [ 0.40595362 -0.3622771   0.6946293   0.97321165] ... reward: -0.12 ... done: False

step: 1 ... state: [-0.01247088 -0.03697738 -0.01355848  0.03191381] ... action: [0.593582   0.21886176 0.9732597  0.78010404] ... reward: -0.39 ... done: False

step: 2 ... state: [-0.0618443  -0.084644   -0.04185171  0.0156803 ] ... action: [ 0.6312251   0.4589462  -0.86984366  0.8536051 ] ... reward: -0.30 ... done: False

step: 3 ... state: [-0.10017376 -0.07562135 -0.03607878  0.00321356] ... action: [-0.9024214   0.83865124 -0.03279166  0.9275307 ] ... reward: -0.20 ... done: False

step: 4 ... state: [-0.1219469  -0.04162761 -0.0187642  -0.00818219] ... action: [-0.87667    -0.09966582  0.7502817  -0.28533426] ... reward: -0.18 ... done: False

step: 5 ... state: [-0.14174175 -0.03500372 -0.02818677  0.03679745] ... action: [ 0.88483864 -0.9434231  -0.2572696  -0.9368853 ] ... reward: -0.25 ... done:

### 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch. Alternatively, you can skip ahead to Section 4 to load the pre-trained weights from file.

In [18]:
def ddpg(n_episodes=2000, max_t=700):
    """Train the module-level ``agent`` on the module-level ``env``.

    Args:
        n_episodes (int): number of training episodes to run.
        max_t (int): maximum number of environment steps per episode.

    Returns:
        list: the summed reward of each episode, in the order they were played.

    Side effects:
        Prints a progress line after every episode. Every 100 episodes the
        local actor and critic weights are written to 'checkpoint_actor.pth'
        and 'checkpoint_critic.pth' (overwriting any previous checkpoint).
    """
    scores_deque = deque(maxlen=100)  # window for the running 100-episode average
    scores = []
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        # '\r' with end="" keeps rewriting the same output line between checkpoints.
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            # This print ends with a newline, so each 100-episode summary stays visible.
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
    return scores

# Train with the default settings (2000 episodes, at most 700 steps each) and keep the per-episode scores.
scores = ddpg()



Episode 100	Average Score: -90.63	Score: -52.264
Episode 200	Average Score: -80.32	Score: -51.931
Episode 300	Average Score: -93.46	Score: -102.60
Episode 379	Average Score: -91.77	Score: -101.87

KeyboardInterrupt: 

In [None]:
# Plot the score of every training episode against its 1-based episode number.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

### 4. Watch a Smart Agent!

In the next code cells, you will define a helper for animating rendered frames and then load the trained weights from file to watch a smart agent!

In [3]:
def animate_frames(frames):
    """Display a list of image frames as an animation in the notebook.

    Args:
        frames: non-empty sequence of image arrays accepted by ``plt.imshow``;
            each must expose ``.shape``. The first frame decides the colormap.
    """
    fig = plt.figure(dpi=72)
    plt.axis('off')

    # Frames with three dimensions use imshow's default colour handling;
    # anything else is drawn with the 'Greys' colormap.
    if len(frames[0].shape) == 3:
        cmap = None
    else:
        cmap = 'Greys'
    patch = plt.imshow(frames[0], cmap=cmap)

    def show_frame(index):
        # Swap the displayed image data for the frame at this index.
        return patch.set_data(frames[index])

    fanim = animation.FuncAnimation(fig, show_frame, frames=len(frames), interval=30)

    display(display_animation(fanim, default_mode='once'))

In [4]:
# Load the saved actor/critic weights. map_location='cpu' lets a checkpoint
# written on a GPU machine load on a CPU-only one; load_state_dict then copies
# the values into the networks' existing parameters.
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth', map_location='cpu'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth', map_location='cpu'))

# Roll out one episode (capped at 1000 steps), capturing a frame before each step.
frames = []
state = env.reset()
agent.reset()
try:
    for t in range(1000):
        action = agent.act(state)
        frames.append(env.render(mode='rgb_array'))
        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break
finally:
    # Close the environment even when render()/step() raises, so the
    # renderer is not left open after a failed run.
    env.close()

animate_frames(frames)



ValueError: invalid literal for int() with base 10: ''