In [1]:
import gym
import numpy as np
import sys

from helpers import *
from ddpg import * 
from noise import *

In [2]:
# Set up: hyperparameters, environment, agent, exploration noise and log containers.

# --- Hyperparameters (tune here) ---
gamma = 0.99            # forwarded to DDPGAgent
learning_rate = 1e-4    # forwarded to DDPGAgent
buffer_size = 100000    # forwarded to DDPGAgent
batch_size = 128        # number of transitions sampled from agent.buffer per update
sigma = 0.3             # forwarded to GaussianActionNoise

# --- Environment and compute device ---
env = NormalizedEnv(gym.make("Pendulum-v1"))
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Agent and action noise ---
agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma)
noise = GaussianActionNoise(sigma)

# --- Per-episode logs, appended to by the training loop ---
critic_losses, actor_losses, rewards = [], [], []

In [3]:
################################ TRAINING ################################

# Single source of truth for the run length; the final-episode log check
# below is derived from it instead of a separate hard-coded index.
num_train_episodes = 1000

for episode in range(num_train_episodes):
    # Go through a NumPy array before building the tensor: constructing a
    # tensor from a Python list that wraps an ndarray is the slow path that
    # PyTorch warns about. Result is float32 with a leading batch axis of 1,
    # the same as the previous torch.FloatTensor([state]).
    state = torch.tensor(
        np.asarray(env.reset()), dtype=torch.float32, device=device
    ).unsqueeze(0)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    done = False

    while not done:
        action = agent.compute_action(state, noise)

        # `done` stays a plain flag so it can drive the while-loop; a separate
        # tensor copy is what gets stored in the replay buffer.
        next_state, reward, done, info = env.step(action.cpu().numpy()[0])

        episode_reward += reward

        done_tensor = torch.FloatTensor([float(done)]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.tensor(
            np.asarray(next_state), dtype=torch.float32, device=device
        ).unsqueeze(0)

        agent.buffer.add(state, action, reward, next_state, done_tensor)

        # Only start updating once the buffer holds more than one batch.
        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # Turn a list of transitions into one Transition of per-field tuples.
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            # Losses are summed over all update steps of the episode.
            episode_critic_loss += closs
            episode_actor_loss += aloss

        state = next_state

    # Report every 100th episode and the last one of the run.
    if (episode % 100 == 0) or (episode == num_train_episodes - 1):
        sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

# Divide by the number of recorded episodes rather than a literal so the
# average stays correct if the episode count above is changed.
av_cumulative_reward = sum(rewards) / len(rewards)
print("The average cumulative reward is : ", av_cumulative_reward)

  state = torch.FloatTensor([state])


episode: 0, reward: -1061.3472003279483, critic loss: 3907.037368774414, actor loss: -7.044386601424776 
episode: 1, reward: -1595.3777769667756, critic loss: 10603.393867492676, actor loss: 75.3234257758595 
episode: 2, reward: -1573.252284298897, critic loss: 11764.268447875977, actor loss: 272.28913444280624 
episode: 3, reward: -1810.227986501498, critic loss: 12593.20022201538, actor loss: 586.0421233177185 
episode: 4, reward: -1212.480718829621, critic loss: 12446.043380737305, actor loss: 1141.007016658783 
episode: 5, reward: -1555.846245540071, critic loss: 11660.412601470947, actor loss: 2359.6718678474426 
episode: 6, reward: -1592.9987516059455, critic loss: 12469.273220062256, actor loss: 4800.265966415405 
episode: 7, reward: -1580.2687254468447, critic loss: 15141.414451599121, actor loss: 8321.895462036133 
episode: 8, reward: -1700.061520537493, critic loss: 17769.71611404419, actor loss: 11714.716857910156 
episode: 9, reward: -1765.714063903594, critic loss: 20194.3

episode: 78, reward: -1229.4809247751023, critic loss: 159675.64110565186, actor loss: 70809.0348815918 
episode: 79, reward: -1325.1372688210433, critic loss: 133189.3716430664, actor loss: 71135.3003540039 
episode: 80, reward: -1036.9311606415797, critic loss: 134606.34358596802, actor loss: 71204.9397277832 
episode: 81, reward: -1341.7076490234588, critic loss: 135867.48301315308, actor loss: 71533.26712036133 
episode: 82, reward: -910.9771878388966, critic loss: 138106.10370254517, actor loss: 71590.24151611328 
episode: 83, reward: -1383.5292236789887, critic loss: 116101.5077495575, actor loss: 71945.32028198242 
episode: 84, reward: -1343.0385778369732, critic loss: 127570.91900062561, actor loss: 72452.48861694336 
episode: 85, reward: -1499.0859024227675, critic loss: 134641.34035491943, actor loss: 72368.72259521484 
episode: 86, reward: -1445.5324274306324, critic loss: 139481.14944076538, actor loss: 72607.94079589844 
episode: 87, reward: -1333.7331195585496, critic los

episode: 156, reward: -1168.8444740189848, critic loss: 161570.79812049866, actor loss: 79798.9772644043 
episode: 157, reward: -1135.8370958222183, critic loss: 142903.48265266418, actor loss: 79708.86450195312 
episode: 158, reward: -1136.1154547058666, critic loss: 162366.09494781494, actor loss: 79991.33923339844 
episode: 159, reward: -1352.2515742196556, critic loss: 172390.49078941345, actor loss: 79333.54138183594 
episode: 160, reward: -1254.0400239828439, critic loss: 182633.08765792847, actor loss: 78786.93151855469 
episode: 161, reward: -1224.5993369426217, critic loss: 144066.1213502884, actor loss: 78855.56317138672 
episode: 162, reward: -1175.599645847853, critic loss: 170406.1681985855, actor loss: 78621.37503051758 
episode: 163, reward: -1286.4012130142532, critic loss: 154549.69977855682, actor loss: 78327.43380737305 
episode: 164, reward: -1141.9483989065673, critic loss: 140719.64797782898, actor loss: 78656.81018066406 
episode: 165, reward: -1238.4948490497266

episode: 234, reward: -1168.6778611189573, critic loss: 144694.0637664795, actor loss: 76437.90487670898 
episode: 235, reward: -867.5568483809691, critic loss: 170178.11897468567, actor loss: 75876.56414794922 
episode: 236, reward: -885.7705613236574, critic loss: 149876.78935146332, actor loss: 75508.44595336914 
episode: 237, reward: -1217.022071964436, critic loss: 133969.8404865265, actor loss: 75157.18743896484 
episode: 238, reward: -996.0424132640961, critic loss: 148803.5731163025, actor loss: 75300.38516235352 
episode: 239, reward: -891.7312581921194, critic loss: 153888.05719661713, actor loss: 74988.37521362305 
episode: 240, reward: -1063.0443761011768, critic loss: 144186.36300849915, actor loss: 74564.2409362793 
episode: 241, reward: -882.11209953435, critic loss: 110609.33958816528, actor loss: 75032.32229614258 
episode: 242, reward: -1113.5707204647133, critic loss: 143872.68176078796, actor loss: 75476.85775756836 
episode: 243, reward: -1031.7942595819661, critic

episode: 312, reward: -909.567516493601, critic loss: 105668.57715797424, actor loss: 67802.19534301758 
episode: 313, reward: -990.861262963917, critic loss: 99120.46525287628, actor loss: 67819.6019897461 
episode: 314, reward: -898.607163207253, critic loss: 109993.59652805328, actor loss: 68168.73303222656 
episode: 315, reward: -776.3171462625987, critic loss: 111402.89587783813, actor loss: 67987.9977722168 
episode: 316, reward: -828.3039436489122, critic loss: 122789.51033878326, actor loss: 67654.76940917969 
episode: 317, reward: -934.5172279873934, critic loss: 107016.53075790405, actor loss: 67782.93927001953 
episode: 318, reward: -935.9097961670436, critic loss: 109047.22857952118, actor loss: 67384.37982177734 
episode: 319, reward: -994.1985846990383, critic loss: 120587.64915561676, actor loss: 66918.89047241211 
episode: 320, reward: -1020.3999912978612, critic loss: 108218.5230588913, actor loss: 66760.14868164062 
episode: 321, reward: -1024.6078271376734, critic lo

episode: 390, reward: -903.3568928539486, critic loss: 76243.95846939087, actor loss: 57556.613037109375 
episode: 391, reward: -901.0846490091079, critic loss: 87904.43930912018, actor loss: 57752.001373291016 
episode: 392, reward: -904.0784567840226, critic loss: 89074.11282253265, actor loss: 57470.80078125 
episode: 393, reward: -893.8798657764657, critic loss: 81549.87708330154, actor loss: 57530.80630493164 
episode: 394, reward: -912.7825010709394, critic loss: 88692.18004465103, actor loss: 57510.97265625 
episode: 395, reward: -904.738767090371, critic loss: 83473.68430519104, actor loss: 57180.43441772461 
episode: 396, reward: -772.4488341547273, critic loss: 76301.47520923615, actor loss: 57260.445251464844 
episode: 397, reward: -773.9575370583939, critic loss: 88601.97541189194, actor loss: 57349.43151855469 
episode: 398, reward: -879.8588118727562, critic loss: 78381.63186073303, actor loss: 57160.019104003906 
episode: 399, reward: -791.7671479474535, critic loss: 720

episode: 469, reward: -900.6810115456834, critic loss: 69537.47943973541, actor loss: 55131.67346191406 
episode: 470, reward: -918.296335724496, critic loss: 78162.03076982498, actor loss: 54765.381103515625 
episode: 471, reward: -804.8850752770169, critic loss: 73096.02474403381, actor loss: 54783.0419921875 
episode: 472, reward: -911.7057658010085, critic loss: 80222.23131608963, actor loss: 54613.51965332031 
episode: 473, reward: -901.3803236033627, critic loss: 74958.45016384125, actor loss: 54290.88198852539 
episode: 474, reward: -791.5268539348993, critic loss: 71259.58676242828, actor loss: 54243.30798339844 
episode: 475, reward: -1104.1905680278978, critic loss: 69806.99566459656, actor loss: 54443.23580932617 
episode: 476, reward: -781.6968127032974, critic loss: 78082.55187988281, actor loss: 54333.38961791992 
episode: 477, reward: -805.6463208514662, critic loss: 65793.65140628815, actor loss: 54118.01565551758 
episode: 478, reward: -906.3300078696863, critic loss: 

episode: 547, reward: -696.1547848448903, critic loss: 62829.39659690857, actor loss: 51528.16940307617 
episode: 548, reward: -877.5759001327574, critic loss: 57304.64739561081, actor loss: 51585.0059967041 
episode: 549, reward: -760.6763400253587, critic loss: 73346.13238358498, actor loss: 51258.49475097656 
episode: 550, reward: -773.2631039769956, critic loss: 60823.641589164734, actor loss: 50746.04655456543 
episode: 551, reward: -891.7324416799216, critic loss: 64208.72486400604, actor loss: 50892.577697753906 
episode: 552, reward: -777.8862273492913, critic loss: 63253.96925354004, actor loss: 50531.63247680664 
episode: 553, reward: -853.5532733687287, critic loss: 53298.498544216156, actor loss: 50704.314849853516 
episode: 554, reward: -770.2689716326105, critic loss: 65197.92904353142, actor loss: 50666.86436462402 
episode: 555, reward: -813.147518321374, critic loss: 63732.61551785469, actor loss: 50479.514892578125 
episode: 556, reward: -862.0371380254421, critic los

episode: 625, reward: -770.1035886607508, critic loss: 51669.039559841156, actor loss: 47441.212478637695 
episode: 626, reward: -837.0782508849655, critic loss: 55766.510796785355, actor loss: 47520.07029724121 
episode: 627, reward: -791.1740888067181, critic loss: 53716.334141254425, actor loss: 47433.99932861328 
episode: 628, reward: -878.3258113496006, critic loss: 41373.611206531525, actor loss: 47784.196350097656 
episode: 629, reward: -896.3827725143876, critic loss: 48600.213686943054, actor loss: 48256.21398925781 
episode: 630, reward: -777.1997974704183, critic loss: 58366.225888967514, actor loss: 47995.815979003906 
episode: 631, reward: -793.2033782365432, critic loss: 60591.1123790741, actor loss: 47846.25686645508 
episode: 632, reward: -764.8280715132704, critic loss: 63628.1691365242, actor loss: 47282.18933105469 
episode: 633, reward: -772.7978724373344, critic loss: 44343.66569828987, actor loss: 47089.699630737305 
episode: 634, reward: -899.2682225045886, criti

episode: 703, reward: -646.000536066997, critic loss: 45137.78846979141, actor loss: 46920.856216430664 
episode: 704, reward: -826.2085896143535, critic loss: 55543.054503679276, actor loss: 46686.87748718262 
episode: 705, reward: -668.0217844072614, critic loss: 54486.48051452637, actor loss: 46053.64041137695 
episode: 706, reward: -768.172216730486, critic loss: 49153.18665146828, actor loss: 46049.80303955078 
episode: 707, reward: -724.7985556918438, critic loss: 58290.4957408905, actor loss: 45938.516860961914 
episode: 708, reward: -527.6334908040388, critic loss: 52615.02257728577, actor loss: 45617.58403015137 
episode: 709, reward: -647.4198774095688, critic loss: 46257.23447728157, actor loss: 45484.926834106445 
episode: 710, reward: -766.0727344309632, critic loss: 47348.84269952774, actor loss: 45841.03770446777 
episode: 711, reward: -623.8702317324661, critic loss: 51846.284727334976, actor loss: 45669.47666931152 
episode: 712, reward: -766.5009439728917, critic loss

episode: 781, reward: -380.5414503210938, critic loss: 48801.3394510746, actor loss: 45211.19136047363 
episode: 782, reward: -510.178209575986, critic loss: 52013.32999587059, actor loss: 45131.283782958984 
episode: 783, reward: -617.0866302373423, critic loss: 54868.91322469711, actor loss: 44780.62045288086 
episode: 784, reward: -383.113207816582, critic loss: 43300.23482275009, actor loss: 44980.52502441406 
episode: 785, reward: -265.39791230202235, critic loss: 46743.50488567352, actor loss: 45029.9875793457 
episode: 786, reward: -629.5190951962155, critic loss: 46780.62627005577, actor loss: 45204.68469238281 
episode: 787, reward: -598.0099929504877, critic loss: 43132.71239280701, actor loss: 45429.90885925293 
episode: 788, reward: -839.4205184651923, critic loss: 49573.49415445328, actor loss: 45529.529037475586 
episode: 789, reward: -387.2098715890212, critic loss: 53772.810131549835, actor loss: 45205.94515991211 
episode: 790, reward: -500.7044549608636, critic loss: 

episode: 859, reward: -123.6618546247359, critic loss: 46754.31672048569, actor loss: 42610.359375 
episode: 860, reward: -376.5316777102488, critic loss: 40732.468566417694, actor loss: 42186.05355834961 
episode: 861, reward: -1564.2051082995772, critic loss: 51801.142275333405, actor loss: 42139.963943481445 
episode: 862, reward: -5.373106767858887, critic loss: 44022.57027721405, actor loss: 42284.99676513672 
episode: 863, reward: -1601.4450042452545, critic loss: 42980.75638961792, actor loss: 41997.90492248535 
episode: 864, reward: -1532.405016289413, critic loss: 41439.47420310974, actor loss: 42225.191665649414 
episode: 865, reward: -124.24344089537239, critic loss: 46085.25793361664, actor loss: 42346.771911621094 
episode: 866, reward: -6.2752544737727645, critic loss: 45585.00903701782, actor loss: 42171.19960021973 
episode: 867, reward: -133.19669942731915, critic loss: 50871.462040901184, actor loss: 41250.294357299805 
episode: 868, reward: -1549.0041156843097, criti

episode: 937, reward: -579.310112798507, critic loss: 27865.654230117798, actor loss: 30020.01512145996 
episode: 938, reward: -122.85024584190188, critic loss: 26190.76005268097, actor loss: 29385.674850463867 
episode: 939, reward: -527.4652297613391, critic loss: 25668.582956314087, actor loss: 29431.38575744629 
episode: 940, reward: -260.8463615752246, critic loss: 27233.974325180054, actor loss: 28920.062576293945 
episode: 941, reward: -257.9966427214005, critic loss: 26127.520040512085, actor loss: 28604.054244995117 
episode: 942, reward: -134.44548382118052, critic loss: 26924.075236320496, actor loss: 28460.060104370117 
episode: 943, reward: -256.5310012870536, critic loss: 26030.34862136841, actor loss: 28027.839233398438 
episode: 944, reward: -556.9179959108005, critic loss: 22334.578568458557, actor loss: 27553.268409729004 
episode: 945, reward: -131.0437439898466, critic loss: 25724.37172317505, actor loss: 27377.579315185547 
episode: 946, reward: -651.9650615117082,

In [1]:
import matplotlib.pyplot as plt

############################ PLOT ##########################

# Per-episode loss series recorded by the training/testing loops.
y2 = critic_losses
y3 = actor_losses

# Derive the x values from the data instead of assuming 1000 episodes:
# a fixed range raises a shape-mismatch error as soon as the lists have any
# other length (e.g. after a run with a different number of episodes).
x = list(range(len(y2)))

fig, ax = plt.subplots()

# Plot the two loss curves
ax.plot(x, y2, color='red', label='critic loss')
ax.plot(list(range(len(y3))), y3, color='blue', label='actor loss')

# Add a title, labels and a legend so the figure can be read on its own
ax.set_title('Losses per episode')
ax.set_xlabel('episode')
ax.set_ylabel('loss (summed over the episode)')
ax.legend()

# Display the plot
plt.show()

NameError: name 'rewards' is not defined

In [None]:
################################ TESTING ###################################

# NOTE(review): this loop still stores transitions, calls agent.update() and
# passes deterministic=False to compute_action, so the agent keeps changing
# while it is being "tested" — confirm that this is intended.

# Single source of truth for the number of test episodes.
num_test_episodes = 100

# Start from fresh logs so the test statistics are not mixed with earlier runs.
critic_losses = []
actor_losses = []
rewards = []

for episode in range(num_test_episodes):
    # Go through a NumPy array before building the tensor: constructing a
    # tensor from a Python list that wraps an ndarray is the slow path that
    # PyTorch warns about. Result is float32 with a leading batch axis of 1,
    # the same as the previous torch.FloatTensor([state]).
    state = torch.tensor(
        np.asarray(env.reset()), dtype=torch.float32, device=device
    ).unsqueeze(0)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    done = False

    while not done:
        action = agent.compute_action(state, noise, deterministic=False)

        # `done` stays a plain flag so it can drive the while-loop; a separate
        # tensor copy is what gets stored in the replay buffer.
        next_state, reward, done, info = env.step(action.cpu().numpy()[0])

        episode_reward += reward

        done_tensor = torch.FloatTensor([float(done)]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.tensor(
            np.asarray(next_state), dtype=torch.float32, device=device
        ).unsqueeze(0)

        agent.buffer.add(state, action, reward, next_state, done_tensor)

        # Only update once the buffer holds more than one batch.
        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # Turn a list of transitions into one Transition of per-field tuples.
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            # Losses are summed over all update steps of the episode.
            episode_critic_loss += closs
            episode_actor_loss += aloss

        state = next_state

    # Report every 100th episode and the last one of the run.
    if (episode % 100 == 0) or (episode == num_test_episodes - 1):
        sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

# Divide by the number of recorded episodes rather than a literal so the
# average stays correct if the episode count above is changed.
av_cumulative_reward = sum(rewards) / len(rewards)
print("The average cumulative reward is : ", av_cumulative_reward)