In [5]:
import sys

import gym
import numpy as np
import torch

from helpers import *
from ddpg import *
from noise import *

In [6]:
# Set up
# Hyperparameters first (plain literals), then the objects built from them.
buffer_size, batch_size = 100000, 128  # passed to DDPGAgent / used for buffer.sample()
learning_rate, gamma = 1e-4, 0.99      # both forwarded to DDPGAgent (gamma: presumably the discount factor)
sigma = 0.3                            # forwarded to GaussianActionNoise

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

# Pendulum environment wrapped by the project's NormalizedEnv helper.
env = NormalizedEnv(gym.make("Pendulum-v1"))

agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma)
noise = GaussianActionNoise(sigma)

# Per-episode histories filled in by the training loop.
critic_losses, actor_losses, rewards = [], [], []

In [8]:
################################ TRAINING ################################
# Run the agent in `env` for a fixed number of episodes. Every step is stored
# in the agent's buffer and, once the buffer holds more than `batch_size`
# transitions, followed by one `agent.update` call. Per-episode averages are
# appended to `rewards`, `critic_losses` and `actor_losses`.

NUM_TRAIN_EPISODES = 1000
TRAIN_LOG_EVERY = 100  # the final episode is always logged as well

for episode in range(NUM_TRAIN_EPISODES):
    state, info = env.reset()
    # np.array([...]) adds the leading batch axis and avoids PyTorch's slow
    # "tensor from a list of ndarrays" construction path.
    state = torch.as_tensor(np.array([state]), dtype=torch.float32, device=device)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # agent updates performed in this episode

    done = False

    while not done:
        action = agent.compute_action(state, noise)

        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0])
        # Stop on either signal; the original loop only looked at `trunc`.
        done = bool(term or trunc)

        episode_reward += reward
        step_r += 1

        # NOTE(review): the flag stored in the buffer is 1.0 for truncation as
        # well as termination (the original stored `trunc`). Confirm how
        # `agent.update` uses it before relying on it as a bootstrap mask.
        done_t = torch.as_tensor(np.array([done]), dtype=torch.float32, device=device)
        reward_t = torch.as_tensor(np.array([reward]), dtype=torch.float32, device=device)
        next_state = torch.as_tensor(np.array([next_state]), dtype=torch.float32, device=device)

        agent.buffer.add(state, action, reward_t, next_state, done_t)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

    # Episode finished: convert the running sums into per-step / per-update means.
    # `step_r` is at least 1 here because the loop body always runs once.
    episode_reward = episode_reward / step_r
    # No update may have happened yet (buffer still too small); report NaN
    # instead of raising ZeroDivisionError.
    episode_critic_loss = episode_critic_loss / step_l if step_l else float("nan")
    episode_actor_loss = episode_actor_loss / step_l if step_l else float("nan")

    if episode % TRAIN_LOG_EVERY == 0 or episode == NUM_TRAIN_EPISODES - 1:
        sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

# Average over the episodes actually recorded rather than a hard-coded count.
# NOTE(review): each entry of `rewards` is a per-step mean, not a cumulative
# episode return, although the message below says "cumulative".
av_cumulative_reward = sum(rewards) / max(len(rewards), 1)
print("The average cumulative reward is : ", av_cumulative_reward)

episode: 0, reward: -7.602885286090895, critic loss: 58.372495386335586, actor loss: -0.1940750570760833 
episode: 99, reward: -6.607580538353325, critic loss: 803.4222626590729, actor loss: 416.97652481079103 
episode: 100, reward: -6.5033078911750115, critic loss: 846.5114593791961, actor loss: 416.94042709350583 
episode: 200, reward: -5.980321815771978, critic loss: 665.7339129304886, actor loss: 363.2721070861816 
episode: 300, reward: -4.694779520656778, critic loss: 656.3613982892036, actor loss: 347.7255027770996 
episode: 400, reward: -4.117399739529049, critic loss: 570.5039640951156, actor loss: 315.29410598754885 


KeyboardInterrupt: 

In [1]:
import matplotlib.pyplot as plt

############################ PLOT ##########################
# Plot the per-episode mean critic and actor losses recorded by the training loop.

# One x value per recorded episode. Deriving the range from the data keeps the
# plot working when training stopped early (a fixed range(1000) would make
# matplotlib raise a shape-mismatch error).
x = list(range(len(critic_losses)))

fig, ax = plt.subplots()

# Both histories are appended to once per episode, so they share the same x axis.
ax.plot(x, critic_losses, color='red', label='critic loss')
ax.plot(x, actor_losses, color='blue', label='actor loss')

# Label everything so the figure can be read on its own.
ax.set_xlabel('episode')
ax.set_ylabel('mean loss per update')
ax.set_title('Training losses per episode')
ax.legend()

# Display the plot
plt.show()

NameError: name 'rewards' is not defined

In [None]:
################################ TESTING ###################################
# Run the agent for a fixed number of evaluation episodes and record the same
# per-episode statistics as the training loop, starting from empty histories.
#
# NOTE(review): this loop still passes `deterministic=False` together with the
# exploration noise, and still adds transitions to the buffer and calls
# `agent.update`. That is unusual for a test phase; it is kept because the
# intent is not visible here. Confirm whether evaluation should be
# noise-free and update-free.

NUM_TEST_EPISODES = 100
TEST_LOG_EVERY = 10  # the final episode is always logged as well

critic_losses = []
actor_losses = []
rewards = []

for episode in range(NUM_TEST_EPISODES):
    # reset() returns (observation, info), exactly as it is used in the training loop.
    state, info = env.reset()
    # np.array([...]) adds the leading batch axis and avoids PyTorch's slow
    # "tensor from a list of ndarrays" construction path.
    state = torch.as_tensor(np.array([state]), dtype=torch.float32, device=device)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # agent updates performed in this episode

    done = False

    while not done:
        action = agent.compute_action(state, noise, deterministic=False)

        # step() returns five values (separate terminated / truncated flags),
        # matching the unpacking used in the training loop.
        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0])
        done = bool(term or trunc)

        episode_reward += reward
        step_r += 1

        done_t = torch.as_tensor(np.array([done]), dtype=torch.float32, device=device)
        reward_t = torch.as_tensor(np.array([reward]), dtype=torch.float32, device=device)
        next_state = torch.as_tensor(np.array([next_state]), dtype=torch.float32, device=device)

        agent.buffer.add(state, action, reward_t, next_state, done_t)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

    # Episode finished: convert the running sums into per-step / per-update means.
    # `step_r` is at least 1 here because the loop body always runs once.
    episode_reward = episode_reward / step_r
    # Report NaN instead of raising ZeroDivisionError when no update ran.
    episode_critic_loss = episode_critic_loss / step_l if step_l else float("nan")
    episode_actor_loss = episode_actor_loss / step_l if step_l else float("nan")

    if episode % TEST_LOG_EVERY == 0 or episode == NUM_TEST_EPISODES - 1:
        sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

# Average over the episodes actually recorded rather than a hard-coded count.
# NOTE(review): each entry of `rewards` is a per-step mean, not a cumulative
# episode return, although the message below says "cumulative".
av_cumulative_reward = sum(rewards) / max(len(rewards), 1)
print("The average cumulative reward is : ", av_cumulative_reward)