In [4]:
import gym 
import sys
import numpy as np
import torch.optim as optim
import torch.nn as nn

from helpers import NormalizedEnv
from heuristic import HeuristicPendulumAgent
from qnetwork import *
from buffer import *

In [5]:
# Set up: environment, heuristic agent, replay buffer, critic network and
# the containers that collect per-episode training statistics.
env = NormalizedEnv(gym.make("Pendulum-v1"))

# Fixed torque handed to the heuristic agent. It is sampled at random from the
# action space, so it differs between kernel runs unless the space is seeded.
# NOTE(review): env.action() presumably rescales the sample to the real
# actuator range -- confirm in helpers.NormalizedEnv.
torque = env.action(env.action_space.sample())
print("The torque value is : ", torque)

agent = HeuristicPendulumAgent(env, torque)

# ---- Hyper-parameters (everything a reader might want to tune) ----
buffer_size = 10000    # capacity passed to the replay buffer
batch_size = 128       # number of transitions sampled per critic update
learning_rate = 1e-4   # Adam step size for the critic
gamma = 0.99           # passed to update() in the training cell
hidden_size = 32       # passed to QNetwork as its second argument

buffer = ReplayBuffer(buffer_size)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# The critic takes the concatenated (state, action) vector as input.
# NOTE(review): argument order assumed to be (input, hidden, output) -- confirm
# against qnetwork.QNetwork.
critic = QNetwork(num_states + num_actions, hidden_size, num_actions)

# Use the named hyper-parameter instead of a second hard-coded literal so that
# editing `learning_rate` above actually changes the optimizer.
optimizer = optim.Adam(critic.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# One entry is appended per episode by the training cell.
critic_losses = []
rewards = []

The torque value is :  [1.2967964]


In [6]:
############################# TRAINING ######################

# Run length and logging cadence live in one place instead of being repeated
# as the literals 1000 / 999 / 100 throughout the cell.
num_episodes = 1000
log_every = 100

# Index of the first entry this run will append; lets the final average cover
# only this run even if the cell is executed more than once.
first_episode_idx = len(rewards)

for episode in range(num_episodes):
    state, info = env.reset()

    episode_reward = 0
    episode_critic_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # critic updates performed in this episode

    done = False

    while not done:
        action = agent.compute_action(state)

        next_state, reward, terminated, trunc, info = env.step(action)
        # An episode ends on either signal; watching only `trunc` would keep
        # stepping an environment that has already terminated.
        done = terminated or trunc

        episode_reward += reward
        step_r += 1

        # NOTE(review): the truncation flag is what gets stored as the last
        # field of the transition. If update() uses it to mask the bootstrap
        # target, `terminated` may be the intended value -- confirm in qnetwork.
        buffer.add(state, action, reward, next_state, trunc)

        # Only start learning once the buffer holds more than one batch.
        if len(buffer) > batch_size:
            transition = buffer.sample(batch_size)

            batch = Transition(*zip(*transition))

            loss = update(batch, critic, criterion, agent, optimizer, gamma)

            episode_critic_loss += loss
            step_l += 1

        state = next_state

    # Per-step means for the finished episode.
    episode_reward = episode_reward / step_r
    # No update happens while the buffer is still filling; report NaN for such
    # an episode instead of raising ZeroDivisionError.
    episode_critic_loss = episode_critic_loss / step_l if step_l else float("nan")

    if (episode % log_every == 0) or (episode == num_episodes - 1):
        sys.stdout.write("episode: {}, reward: {}, critic loss: {} \n".format(episode, episode_reward, episode_critic_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)

# NOTE(review): each entry of `rewards` is a per-step mean (divided by step_r
# above), so this is the average per-step reward across episodes, despite the
# "cumulative" wording in the label.
run_rewards = rewards[first_episode_idx:]
av_cumulative_reward = sum(run_rewards) / len(run_rewards)
print("average cumulative reward : ", av_cumulative_reward)

episode: 0, reward: -1.9700245709395077, critic loss: 15.776589340633816 
episode: 100, reward: -2.479352783917291, critic loss: 112.42576375961303 
episode: 200, reward: -2.52223779931, critic loss: 84.46848686218262 
episode: 300, reward: -1.32113439767961, critic loss: 67.96300494194031 
episode: 400, reward: -2.5671284085742507, critic loss: 65.22369734287263 
episode: 500, reward: -3.0596162437159347, critic loss: 67.96183012485504 
episode: 600, reward: -2.4442286291004796, critic loss: 58.969188842773434 
episode: 700, reward: -1.9252787507656919, critic loss: 65.9591827583313 
episode: 800, reward: -1.947855450654292, critic loss: 73.50589000701905 
episode: 900, reward: -2.4730806718767364, critic loss: 72.71744044065476 
episode: 999, reward: -2.410514898055615, critic loss: 77.28852526068687 
average cumulative reward :  -2.18315212360829


In [None]:
import matplotlib.pyplot as plt

############################ PLOT #########################

# One point per recorded episode. The x axis is derived from the data length
# rather than a hard-coded 1000, so the plot still works when the number of
# training episodes changes or the training cell has been run more than once.
y = critic_losses
x = list(range(len(y)))

fig, ax = plt.subplots()
ax.plot(x, y, color='blue', label='loss')

# Add labels, a title and a legend so the figure stands on its own.
ax.set_xlabel('episode')
ax.set_ylabel('mean critic loss per update')
ax.set_title('Critic loss during training')
ax.legend()

# Display the plot
plt.show()