In [None]:
import datetime
import os
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import torch
from unityagents import UnityEnvironment

from ddpg_agent_bi import Agent, OUNoise, ReplayBuffer
from model_bi import Actor, Critic

In [None]:
# Location of the Reacher build. Set the REACHER_ENV_PATH environment variable
# to point at a different build; the fallback is the original hard-coded
# location so existing setups keep working unchanged.
REACHER_ENV_PATH = os.environ.get(
    'REACHER_ENV_PATH',
    'C:/Users/adamb/OneDrive/Documents/Udacity Reinforcement Learning/3. Policy Base Methods/p2_continous_control/Reacher_Windows_x86_64/Reacher',
)
env = UnityEnvironment(file_name=REACHER_ENV_PATH)

In [None]:
# Use the first brain the environment exposes. Its name is the key used to
# pick this brain's entry out of the env.reset()/env.step() results below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Reset the environment with train_mode=True and keep the info entry for the
# selected brain.
env_info = env.reset(train_mode=True)[brain_name]

In [None]:
# Action-vector size reported by the brain; handed to Agent as action_size.
action_size = brain.vector_action_space_size

In [None]:
# Observations returned by the reset; the length of their second axis is used
# as the state size handed to Agent.
# NOTE(review): presumably shaped (num_agents, state_size) — confirm.
states = env_info.vector_observations
state_size = states.shape[1]

In [None]:
# Build the agent from the sizes read off the environment, with a fixed seed
# of 42.
agent = Agent(
    state_size=state_size,
    action_size=action_size,
    random_seed=42,
)

In [None]:
## DDPG training loop
def ddpg(n_episodes=2000, max_t=1000, deque_len=100, print_every=100):
    """Train the notebook-level ``agent`` against the notebook-level ``env``.

    Each episode resets the environment and the agent, then runs up to
    ``max_t`` steps: the agent picks an action, the environment is stepped,
    and the resulting transition is passed to ``agent.step``. Only index 0 of
    the environment's observations, rewards and done flags is used.

    After every episode the timestamp, the episode score and the rolling
    average are printed, and the local actor/critic weights are written to
    ``checkpoint_actor.pth`` and ``checkpoint_critic.pth`` in the working
    directory.

    Args:
        n_episodes: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        deque_len: Number of most recent episode scores in the rolling average.
        print_every: The progress line is additionally printed with a newline
            on every episode whose number is a multiple of this value.

    Returns:
        A ``(scores, scores_avg)`` tuple of lists with one entry per episode:
        the episode score, and the mean of the rolling window after that
        episode was added to it.
    """
    progress_line = '\rEpisode {}\tAverage Score: {:.2f}'

    episode_scores = []
    rolling_means = []
    recent_scores = deque(maxlen=deque_len)

    for i_episode in range(1, n_episodes + 1):
        # Start the episode from a fresh environment and a reset agent.
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0

        for _ in range(max_t):
            # Act, step the environment, and read back the outcome.
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            # Hand the transition to the agent.
            agent.step(state, action, reward, next_state, done)

            score += reward
            state = next_state

            if done:
                break

        print(datetime.datetime.now())
        print('Score: ' + str(score))

        recent_scores.append(score)
        episode_scores.append(score)

        # The window does not change again this episode, so one mean serves
        # both progress prints and the returned history.
        window_mean = np.mean(recent_scores)

        print(progress_line.format(i_episode, window_mean), end="")
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if i_episode % print_every == 0:
            print(progress_line.format(i_episode, window_mean))

        rolling_means.append(window_mean)
        print()

    return episode_scores, rolling_means

In [None]:
# Training run: 200 episodes of up to 1000 steps, with the rolling average
# taken over the last 20 episodes.
scores, scores_avg = ddpg(
    n_episodes=200,
    max_t=1000,
    deque_len=20,
    print_every=100,
)

In [None]:
# Score of each training episode, plotted against the 1-based episode number.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_xlabel('Episode')
ax.set_ylabel('Score')
ax.set_title('Score per episode')
plt.show()

In [None]:
# Rolling-window mean score recorded after each episode, plotted against the
# 1-based episode number.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores_avg) + 1), scores_avg)
ax.set_xlabel('Episode')
ax.set_ylabel('Average score')
ax.set_title('Rolling average score')
plt.show()