In [1]:
import sys
import gym 
import numpy as np 
import torch 

from helpers import *
from ddpg_target import *
from noise import *

In [2]:
# Set up
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = NormalizedEnv(gym.make("Pendulum-v1"))

state_space = env.observation_space
action_space = env.action_space

buffer_size = 100000
batch_size = 128

learning_rate = 1e-4

gamma = 0.99
tau = 0.01

sigma = 0.3
theta_array = np.linspace(0, 1, 5)
theta = theta_array[0]

agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma, tau)

# initialize the noise
noise = OUActionNoise(device, action_space, sigma, theta)

critic_losses = []
actor_losses = []
rewards = []

In [3]:
################### TRAINING ############################

for episode in range(1000):
    state, info = env.reset()
    state = torch.FloatTensor([state]).to(device)
    
    noise.reset()
    
    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0
    
    step_r = 0
    step_l = 0
    
    trunc = False
    
    while not trunc:
        action = agent.compute_action(state, noise)
        
        next_state, reward, terminated, trunc, _ = env.step(action.cpu().numpy()[0]) 

        episode_reward += reward 
        step_r += 1

        trunc = torch.FloatTensor([trunc]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, trunc)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

        if trunc:
            episode_reward = episode_reward/step_r
            episode_critic_loss = episode_critic_loss/step_l
            episode_actor_loss = episode_actor_loss/step_l
            if ((episode % 100 == 0) or (episode == 999)):
                sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

  state = torch.FloatTensor([state]).to(device)


episode: 0, reward: -7.347457658764694, critic loss: 53.75057480070326, actor loss: 0.29876787836352986 
episode: 100, reward: -6.226629789433637, critic loss: 325.40750835418703, actor loss: 277.5001722717285 
episode: 200, reward: -6.222878954560659, critic loss: 541.8415003061294, actor loss: 321.70920181274414 


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

########## PLOT LOSSES ########################

print("for tau = ", tau)
print("for theta = ", theta)

av_cumulative_reward = sum(rewards)/1000

print("average cumulative reward : ", av_cumulative_reward)

# Generate x values
x = list(range(1000))

y2 = critic_losses
y3 = actor_losses

# Plot the functions
plt.plot(x, y2, color='red', label='critic loss')
plt.plot(x, y3, color='blue', label='actor loss')

# Add labels and a legend
plt.xlabel('episode')
plt.ylabel('')
plt.legend()

# Display the plot
plt.show()

In [None]:
##################### TESTING ##########################

critic_losses = []
actor_losses = []
rewards = []

for episode in range(100):
    state, info = env.reset()
    state = torch.FloatTensor([state]).to(device)
    
    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0
    
    step_r = 0
    step_l = 0
    
    trunc = False
    
    while not trunc:
        action = agent.compute_action(state, noise, deterministic=False)
        
        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0]) 

        episode_reward += reward 
        step_r += 1

        trunc = torch.FloatTensor([trunc]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, trunc)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)
            
            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

        if trunc:
            episode_reward = episode_reward/step_r
            episode_critic_loss = episode_critic_loss/step_l
            episode_actor_loss = episode_actor_loss/step_l
            if ((episode % 10 == 0) or (episode == 99)):
                sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

print("for tau = ", tau)
print("for theta = ", theta)

av_cumulative_reward = sum(rewards)/100

print("average cumulative reward : ", av_cumulative_reward)