In [1]:
import sys
import gym 
import numpy as np 
import torch 

from helpers import *
from ddpg_target import *
from noise import *

In [2]:
# Set up
# Seed the numpy and torch global RNGs so that repeated runs are comparable.
# The environment's own RNG is not seeded here.
# NOTE(review): assumes DDPGAgent and GaussianActionNoise draw from these
# global generators — confirm in ddpg_target.py and noise.py.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pendulum environment wrapped by the project-local NormalizedEnv helper.
env = NormalizedEnv(gym.make("Pendulum-v1"))


# buffer_size is handed to the agent; batch_size is the number of transitions
# sampled from agent.buffer for every update in the training loop.
buffer_size = 100000
batch_size = 128

learning_rate = 1e-4

gamma = 0.99
# Five evenly spaced candidate tau values in [0.01, 1]; this run uses the last
# one (index 4, i.e. tau = 1.0).
tau_array = np.linspace(0.01, 1, 5)
tau = tau_array[4]

agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma, tau)

# Noise object passed to agent.compute_action on every step.
sigma = 0.3
noise = GaussianActionNoise(sigma)

# Per-episode histories filled in by the training loop.
critic_losses = []
actor_losses = []
rewards = []

In [None]:
######################### TRAINING #######################
# Runs the agent in the environment for a fixed number of episodes. Every step
# is stored in the agent's replay buffer, and once the buffer holds more than
# `batch_size` transitions the agent is updated on a sampled mini-batch.
# For each episode the per-step mean reward and the per-update mean critic and
# actor losses are appended to `rewards`, `critic_losses` and `actor_losses`.

num_episodes = 1000  # number of training episodes
log_every = 100      # print a progress line every `log_every` episodes

for episode in range(num_episodes):
    state, info = env.reset()
    # Build the tensor from a single ndarray (not a list of ndarrays, which is
    # the slow path) and add the leading batch dimension explicitly.
    state = torch.tensor(np.asarray(state), dtype=torch.float32, device=device).unsqueeze(0)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # agent updates performed in this episode

    done = False

    while not done:
        action = agent.compute_action(state, noise)

        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0])
        # Stop on termination as well as truncation, so an environment that
        # can terminate is not stepped past the end of its episode.
        done = bool(term or trunc)

        episode_reward += reward
        step_r += 1

        # NOTE(review): the flag stored in the buffer is set on time-limit
        # truncation too. If agent.update uses it to mask the bootstrap target,
        # truncation is treated as a true terminal state — confirm in
        # ddpg_target.py.
        done_flag = torch.tensor([float(done)], dtype=torch.float32, device=device)
        reward = torch.tensor([float(reward)], dtype=torch.float32, device=device)
        next_state = torch.tensor(np.asarray(next_state), dtype=torch.float32, device=device).unsqueeze(0)

        agent.buffer.add(state, action, reward, next_state, done_flag)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # Transpose the list of transitions into one Transition of batches.
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

    # Convert the episode sums into per-step / per-update means.
    episode_reward = episode_reward / step_r
    if step_l > 0:
        episode_critic_loss = episode_critic_loss / step_l
        episode_actor_loss = episode_actor_loss / step_l
    else:
        # No update happened in this episode (buffer still too small): record
        # NaN instead of dividing by zero.
        episode_critic_loss = float("nan")
        episode_actor_loss = float("nan")

    if (episode % log_every == 0) or (episode == num_episodes - 1):
        sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

episode: 0, reward: -7.825293518588885, critic loss: 1672.7108584594725, actor loss: 383.2991662597656 
episode: 100, reward: -5.870577554999093, critic loss: 635.656953277588, actor loss: 371.46659484863284 


In [None]:
import matplotlib.pyplot as plt

##################### PLOTS ###########################
# Prints the tau used for the run and the mean of the recorded per-episode
# rewards, then plots the per-episode critic and actor losses.

print("for tau = ", tau)

# Average over the episodes that were actually recorded rather than a fixed
# 1000, so an interrupted or shortened run still reports a correct mean.
# Each entry of `rewards` is already a per-step mean for one episode.
num_recorded = len(rewards)
av_cumulative_reward = sum(rewards) / num_recorded if num_recorded else float("nan")

print("average cumulative reward : ", av_cumulative_reward)

# Generate x values: one point per recorded episode, so x always matches the
# length of the loss histories (plt.plot raises on a length mismatch).
x = list(range(len(critic_losses)))

y2 = critic_losses
y3 = actor_losses

# Plot the functions
plt.plot(x, y2, color='red', label='critic loss')
plt.plot(x, y3, color='blue', label='actor loss')

# Add labels and a legend
plt.xlabel('episode')
plt.ylabel('')
plt.legend()

# Display the plot
plt.show()

In [None]:
############################### TESTING ########################
# Runs the agent for a fixed number of test episodes and reports the mean of
# the per-episode rewards. The histories below are reset, so after this cell
# `rewards`, `critic_losses` and `actor_losses` hold test-phase values only.
#
# NOTE(review): this phase still passes `noise` with deterministic=False, still
# adds transitions to the replay buffer and still calls agent.update, so the
# agent keeps exploring and learning while it is being "tested". If a pure
# evaluation is intended, use deterministic=True and skip the update — confirm
# against compute_action in ddpg_target.py.

critic_losses = []
actor_losses = []
rewards = []

num_test_episodes = 100  # number of test episodes
log_every = 10           # print a progress line every `log_every` episodes

for episode in range(num_test_episodes):
    state, info = env.reset()
    # Build the tensor from a single ndarray (not a list of ndarrays, which is
    # the slow path) and add the leading batch dimension explicitly.
    state = torch.tensor(np.asarray(state), dtype=torch.float32, device=device).unsqueeze(0)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # agent updates performed in this episode

    done = False

    while not done:
        action = agent.compute_action(state, noise, deterministic=False)

        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0])
        # Stop on termination as well as truncation, so an environment that
        # can terminate is not stepped past the end of its episode.
        done = bool(term or trunc)

        episode_reward += reward
        step_r += 1

        done_flag = torch.tensor([float(done)], dtype=torch.float32, device=device)
        reward = torch.tensor([float(reward)], dtype=torch.float32, device=device)
        next_state = torch.tensor(np.asarray(next_state), dtype=torch.float32, device=device).unsqueeze(0)

        agent.buffer.add(state, action, reward, next_state, done_flag)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # Transpose the list of transitions into one Transition of batches.
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

    # Convert the episode sums into per-step / per-update means.
    episode_reward = episode_reward / step_r
    if step_l > 0:
        episode_critic_loss = episode_critic_loss / step_l
        episode_actor_loss = episode_actor_loss / step_l
    else:
        # No update happened in this episode: record NaN instead of dividing
        # by zero.
        episode_critic_loss = float("nan")
        episode_actor_loss = float("nan")

    if (episode % log_every == 0) or (episode == num_test_episodes - 1):
        sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

print("for tau = ", tau)

# Average over the episodes actually recorded; each entry of `rewards` is a
# per-step mean for one episode.
av_cumulative_reward = sum(rewards) / len(rewards) if rewards else float("nan")

print("average cumulative reward : ", av_cumulative_reward)