In [1]:
import sys
import gym 
import numpy as np 
import torch 

from helpers import *
from ddpg_target import *
from noise import *

In [2]:
# Set up
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = NormalizedEnv(gym.make("Pendulum-v1"))

state_space = env.observation_space
action_space = env.action_space

buffer_size = 100000
batch_size = 128

learning_rate = 1e-4

gamma = 0.99
tau = 0.1

sigma = 0.3
theta = 0.5

agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma, tau)

# initialize the noise
noise = OUActionNoise(action_space, sigma, theta)

critic_losses = []
actor_losses = []
rewards = []

In [None]:
for episode in range(1000):
    state, info = env.reset()
    state = torch.FloatTensor([state]).to(device)
    
    noise.reset()
    
    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0
    
    done = False
    
    while not done:
        action = agent.compute_action(state, noise)
        
        next_state, reward, terminated, done, _ = env.step(action.cpu().numpy()[0]) 

        episode_reward += reward 

        done = torch.FloatTensor([done]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, done)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss

        state = next_state

        if done:
            sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

  state = torch.FloatTensor([state]).to(device)


episode: 0, reward: -1597.007467189463, critic loss: 4849.492317199707, actor loss: 0.0731229234370403 
episode: 1, reward: -1301.9490312711494, critic loss: 12146.942470550537, actor loss: 68.23452118039131 
episode: 2, reward: -1338.4531998613074, critic loss: 11387.446269989014, actor loss: 300.5756939649582 
episode: 3, reward: -1564.700801984861, critic loss: 11640.370620727539, actor loss: 1104.6571688652039 
episode: 4, reward: -1233.6433818226842, critic loss: 11590.934520721436, actor loss: 2809.76610660553 
episode: 5, reward: -1719.761489577584, critic loss: 10682.186931610107, actor loss: 4615.098382949829 
episode: 6, reward: -1778.7551381066405, critic loss: 10726.963932037354, actor loss: 6001.420997619629 
episode: 7, reward: -1808.9362928760277, critic loss: 11683.775325775146, actor loss: 7411.070482254028 
episode: 8, reward: -1723.822388220187, critic loss: 12860.779125213623, actor loss: 8921.093269348145 
episode: 9, reward: -1612.4288351960226, critic loss: 14671

episode: 78, reward: -1870.4071551419768, critic loss: 151009.07704162598, actor loss: 71481.96212768555 
episode: 79, reward: -1055.5481449659262, critic loss: 139521.9474220276, actor loss: 72470.26681518555 
episode: 80, reward: -1204.6437184656834, critic loss: 142040.2871284485, actor loss: 73599.21731567383 
episode: 81, reward: -1319.644467984002, critic loss: 141543.02395629883, actor loss: 74386.09396362305 
episode: 82, reward: -1150.1228366092548, critic loss: 152477.9063949585, actor loss: 75018.6782836914 
episode: 83, reward: -1383.9854040051227, critic loss: 166850.02004623413, actor loss: 74919.8508605957 
episode: 84, reward: -1414.802194226423, critic loss: 162985.1018295288, actor loss: 74476.01336669922 
episode: 85, reward: -1056.8582148274843, critic loss: 152308.52354812622, actor loss: 74024.3678894043 
episode: 86, reward: -1391.5388162360805, critic loss: 161746.21997451782, actor loss: 73826.60974121094 
episode: 87, reward: -1258.5003067592447, critic loss: 

episode: 156, reward: -1254.4673687574673, critic loss: 121266.45734786987, actor loss: 74797.42123413086 
episode: 157, reward: -906.0538155395382, critic loss: 158194.883310318, actor loss: 74480.1474609375 
episode: 158, reward: -1245.9559467068173, critic loss: 163482.02451705933, actor loss: 73619.69290161133 
episode: 159, reward: -1215.4360535877422, critic loss: 129361.47767734528, actor loss: 72977.68643188477 
episode: 160, reward: -1148.413517062566, critic loss: 148207.55953121185, actor loss: 72389.74542236328 
episode: 161, reward: -1192.895281998963, critic loss: 128317.50455093384, actor loss: 72198.78134155273 
episode: 162, reward: -1310.5580170106768, critic loss: 126895.69032382965, actor loss: 72540.59957885742 
episode: 163, reward: -1064.7131856578285, critic loss: 147439.08439731598, actor loss: 72566.51440429688 
episode: 164, reward: -1068.0719381925874, critic loss: 125034.62940979004, actor loss: 72456.47171020508 
episode: 165, reward: -1137.7938570161973, 

episode: 234, reward: -1200.9799043351538, critic loss: 116636.85454273224, actor loss: 68483.76086425781 
episode: 235, reward: -423.6479486006967, critic loss: 102178.18971633911, actor loss: 68669.82504272461 
episode: 236, reward: -998.6892608372518, critic loss: 111930.58800315857, actor loss: 68811.22802734375 
episode: 237, reward: -742.7719791193293, critic loss: 98921.67066287994, actor loss: 68974.31945800781 
episode: 238, reward: -1158.8118424606162, critic loss: 108727.69189357758, actor loss: 69068.81719970703 
episode: 239, reward: -1187.4137573554433, critic loss: 120585.03447723389, actor loss: 69245.82205200195 
episode: 240, reward: -1259.3939135453502, critic loss: 125847.6463060379, actor loss: 68805.20974731445 
episode: 241, reward: -1279.7564804490887, critic loss: 123577.36445140839, actor loss: 68868.86016845703 
episode: 242, reward: -863.0725989479596, critic loss: 122296.28190517426, actor loss: 68690.02798461914 
episode: 243, reward: -901.9949826418888, c

episode: 312, reward: -1088.3962221406598, critic loss: 116787.6192111969, actor loss: 65926.45962524414 
episode: 313, reward: -909.3982802083156, critic loss: 100418.6214056015, actor loss: 65767.28665161133 
episode: 314, reward: -1184.865912352425, critic loss: 118432.42221164703, actor loss: 66101.40283203125 
episode: 315, reward: -987.590566279865, critic loss: 109283.49349594116, actor loss: 65719.08303833008 
episode: 316, reward: -1074.4047592096645, critic loss: 119595.43792057037, actor loss: 65431.502349853516 
episode: 317, reward: -1001.2979337723365, critic loss: 129555.37582397461, actor loss: 65024.052825927734 
episode: 318, reward: -901.4635132280039, critic loss: 110198.49335289001, actor loss: 64440.93927001953 
episode: 319, reward: -936.8028750278269, critic loss: 95968.0925693512, actor loss: 64535.117248535156 
episode: 320, reward: -982.7067350679869, critic loss: 101956.09756851196, actor loss: 64824.44299316406 
episode: 321, reward: -1081.4677940641538, cr

episode: 390, reward: -881.8011756806452, critic loss: 92402.11256885529, actor loss: 62856.485931396484 
episode: 391, reward: -1078.9427510512453, critic loss: 91362.23944282532, actor loss: 62866.36196899414 
episode: 392, reward: -1007.5289145687266, critic loss: 110732.21330165863, actor loss: 62857.984466552734 
episode: 393, reward: -895.1110526864161, critic loss: 107869.27603149414, actor loss: 61869.47521972656 
episode: 394, reward: -879.7205007891653, critic loss: 96164.58010292053, actor loss: 61433.80157470703 
episode: 395, reward: -821.5822604577812, critic loss: 94033.15002059937, actor loss: 61515.374450683594 
episode: 396, reward: -905.9611193949518, critic loss: 88292.11199188232, actor loss: 61568.04379272461 
episode: 397, reward: -1016.2547522239761, critic loss: 99842.09003448486, actor loss: 61662.66177368164 
episode: 398, reward: -914.3266143827558, critic loss: 98942.19974327087, actor loss: 61213.09017944336 
episode: 399, reward: -990.8587129604141, criti

episode: 468, reward: -782.3466895390148, critic loss: 79220.82368469238, actor loss: 56880.66082763672 
episode: 469, reward: -890.4340845154281, critic loss: 79227.84017372131, actor loss: 57164.20999145508 
episode: 470, reward: -882.3457376074293, critic loss: 71059.77038955688, actor loss: 57061.095764160156 
episode: 471, reward: -928.1061151436332, critic loss: 95343.98650550842, actor loss: 56626.55569458008 
episode: 472, reward: -971.9836983297348, critic loss: 81709.97657871246, actor loss: 56417.527893066406 
episode: 473, reward: -782.4443234760734, critic loss: 82938.9745016098, actor loss: 56575.655700683594 
episode: 474, reward: -1022.1676440796444, critic loss: 93445.62899494171, actor loss: 56232.14682006836 
episode: 475, reward: -879.7744903826095, critic loss: 72325.60901260376, actor loss: 55869.988189697266 
episode: 476, reward: -861.375301273293, critic loss: 83626.42708396912, actor loss: 56372.644775390625 
episode: 477, reward: -902.1267533808897, critic lo

episode: 546, reward: -767.3944198872248, critic loss: 81085.62589359283, actor loss: 53738.41514587402 
episode: 547, reward: -845.7697168967152, critic loss: 62449.46388530731, actor loss: 53463.69688415527 
episode: 548, reward: -659.3111190161121, critic loss: 76421.52981090546, actor loss: 53258.90090942383 
episode: 549, reward: -763.2442541362863, critic loss: 67171.06031703949, actor loss: 53651.25750732422 
episode: 550, reward: -751.6459106390722, critic loss: 67662.00280952454, actor loss: 53645.96365356445 
episode: 551, reward: -726.2671585108212, critic loss: 68975.83964252472, actor loss: 53556.17077636719 
episode: 552, reward: -842.5677030220834, critic loss: 71209.31632709503, actor loss: 53182.753677368164 
episode: 553, reward: -681.8343284625055, critic loss: 74307.53250408173, actor loss: 52828.85917663574 
episode: 554, reward: -769.972700732134, critic loss: 64981.27066421509, actor loss: 53043.483306884766 
episode: 555, reward: -635.231833234551, critic loss: 

episode: 624, reward: -763.0090933588745, critic loss: 60547.50100040436, actor loss: 48175.53759765625 
episode: 625, reward: -627.2362374755858, critic loss: 59942.78617477417, actor loss: 47730.18817138672 
episode: 626, reward: -1052.5263175147315, critic loss: 60034.70877456665, actor loss: 47793.731994628906 
episode: 627, reward: -635.563549982017, critic loss: 57559.123403549194, actor loss: 47557.192443847656 
episode: 628, reward: -527.4893212344765, critic loss: 52615.0663061142, actor loss: 47779.29545593262 
episode: 629, reward: -635.3119120710296, critic loss: 58270.7058134079, actor loss: 47657.43280029297 
episode: 630, reward: -643.3692281506849, critic loss: 61273.850474357605, actor loss: 47811.724060058594 
episode: 631, reward: -617.40247303337, critic loss: 45425.64869880676, actor loss: 48069.452560424805 
episode: 632, reward: -969.3826597976672, critic loss: 58729.57194852829, actor loss: 47807.74011230469 
episode: 633, reward: -510.5911951666313, critic loss

episode: 702, reward: -261.2979383618762, critic loss: 45673.64775753021, actor loss: 42181.915618896484 
episode: 703, reward: -380.1069629591865, critic loss: 44336.26300239563, actor loss: 42188.5354309082 
episode: 704, reward: -1568.1905769424661, critic loss: 46619.65260219574, actor loss: 42309.763763427734 
episode: 705, reward: -293.9413417509846, critic loss: 43962.12819671631, actor loss: 41780.66441345215 
episode: 706, reward: -497.2784793925966, critic loss: 37225.94400310516, actor loss: 42027.03356933594 
episode: 707, reward: -872.802199114794, critic loss: 44633.36971950531, actor loss: 42514.462478637695 
episode: 708, reward: -252.83055858306886, critic loss: 41546.79971790314, actor loss: 42375.292892456055 
episode: 709, reward: -623.8311525613651, critic loss: 45772.3683385849, actor loss: 42179.877365112305 
episode: 710, reward: -261.8257692746862, critic loss: 46666.8929977417, actor loss: 41258.59390258789 
episode: 711, reward: -263.36731193876847, critic lo

episode: 780, reward: -131.5975705451919, critic loss: 25040.75929069519, actor loss: 31531.29151916504 
episode: 781, reward: -368.86571113155225, critic loss: 30140.925499916077, actor loss: 31122.133102416992 
episode: 782, reward: -255.7821153360372, critic loss: 25989.932251930237, actor loss: 30917.051330566406 
episode: 783, reward: -1425.433022065157, critic loss: 30675.61867904663, actor loss: 30754.28123474121 
episode: 784, reward: -254.85595767705448, critic loss: 23977.46086025238, actor loss: 30229.14697265625 
episode: 785, reward: -1339.5321694181441, critic loss: 23494.719285964966, actor loss: 30238.191955566406 
episode: 786, reward: -387.44290940817535, critic loss: 29009.83162879944, actor loss: 30190.169464111328 
episode: 787, reward: -134.49523890429697, critic loss: 33448.65048980713, actor loss: 30043.19416809082 
episode: 788, reward: -1568.2188725453018, critic loss: 24613.45469379425, actor loss: 29646.449768066406 
episode: 789, reward: -1359.6778533224747

episode: 857, reward: -305.0373809196882, critic loss: 22306.69104576111, actor loss: 24697.814476013184 
episode: 858, reward: -13.233942117212322, critic loss: 25200.291793823242, actor loss: 24863.302207946777 
episode: 859, reward: -375.4227794442015, critic loss: 28105.386310577393, actor loss: 24363.025703430176 
episode: 860, reward: -487.948660061021, critic loss: 25456.003732681274, actor loss: 24150.834503173828 
episode: 861, reward: -505.18568249996684, critic loss: 21886.32899093628, actor loss: 23649.465965270996 
episode: 862, reward: -260.7998658093988, critic loss: 20637.424249649048, actor loss: 23701.998817443848 
episode: 863, reward: -374.82847864706514, critic loss: 24244.7835521698, actor loss: 23256.29850769043 
episode: 864, reward: -379.2831772692574, critic loss: 20788.759214401245, actor loss: 22318.52114868164 
episode: 865, reward: -254.6621482453192, critic loss: 24395.388370513916, actor loss: 22711.59465789795 
episode: 866, reward: -261.20250451997134,

episode: 934, reward: -370.2682510973742, critic loss: 17090.57578277588, actor loss: 11943.967685699463 
episode: 935, reward: -608.5442478179465, critic loss: 15440.340675354004, actor loss: 11824.66767501831 
episode: 936, reward: -380.49574373100705, critic loss: 15232.700902938843, actor loss: 11872.315814971924 
episode: 937, reward: -620.5702252185187, critic loss: 14870.68675994873, actor loss: 11860.855052947998 
episode: 938, reward: -127.30413051844039, critic loss: 15650.14454460144, actor loss: 11566.146469116211 
episode: 939, reward: -370.36287873687013, critic loss: 11266.979331970215, actor loss: 11450.132503509521 
episode: 940, reward: -546.534316421039, critic loss: 12637.283935546875, actor loss: 11611.693367004395 
episode: 941, reward: -230.5180085371335, critic loss: 14737.828275680542, actor loss: 11904.672866821289 
episode: 942, reward: -123.05991712974462, critic loss: 14414.64859199524, actor loss: 11521.107006072998 
episode: 943, reward: -130.155844399533