In [1]:
import sys

# Add the project root (one directory up) to the Python path so the `src.*`
# imports below resolve. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import gym
import torch

from src.part3.MLP import MultiLayerPerceptron as MLP
from src.part4.ActorCritic import TDActorCritic
from src.common.train_utils import EMAMeter, to_tensor

In [3]:
# Create the CartPole-v1 environment and read the network input/output sizes
# from its spaces: first entry of the observation shape, and the number of
# discrete actions.
env = gym.make('CartPole-v1')
s_dim, a_dim = env.observation_space.shape[0], env.action_space.n



In [4]:
# Seed torch's global RNG before building the networks so that weight
# initialisation is repeatable under Restart & Run All.
# NOTE(review): this does not seed the gym environment; episode start states
# still vary between runs unless the env is seeded as well.
SEED = 0
torch.manual_seed(SEED)

# Third MLP argument, shared by both networks (presumably the hidden layer
# widths - confirm against src.part3.MLP).
HIDDEN_DIMS = [128]

# Policy network: s_dim inputs -> a_dim outputs (one per discrete action).
# Value network:  s_dim inputs -> 1 output.
# Each network gets its own copy of the list, as in the original code.
policy_net = MLP(s_dim, a_dim, list(HIDDEN_DIMS))
value_net = MLP(s_dim, 1, list(HIDDEN_DIMS))

agent = TDActorCritic(policy_net, value_net)
ema = EMAMeter()  # running meter fed with each episode's cumulative reward

In [6]:
n_eps = 10000
print_every = 500

for ep in range(n_eps):
    # Convert the initial observation once; from here on the state is kept as
    # a (1, s_dim) tensor instead of round-tripping through numpy every step.
    # Using s_dim (rather than a literal 4) keeps the loop tied to the env.
    s = to_tensor(env.reset(), size=(1, s_dim))
    cum_r = 0

    while True:
        a = agent.get_action(s)
        ns, r, done, info = env.step(a.item())
        ns = to_tensor(ns, size=(1, s_dim))

        # CartPole-v1 is wrapped in a time limit: `done` is also raised when
        # the step cap is hit, which is not a real terminal state. Gym's
        # TimeLimit wrapper flags that case in `info`; older versions without
        # the key fall back to the previous behaviour.
        # NOTE(review): assumes agent.update uses its last argument to mask
        # bootstrapping from the next state - confirm in TDActorCritic.
        truncated = info.get('TimeLimit.truncated', False)
        terminal = bool(done) and not truncated
        agent.update(s, a.view(-1, 1), r, ns, terminal)

        s = ns
        cum_r += r
        if done:  # the episode ends on termination or truncation alike
            break

    ema.update(cum_r)
    if ep % print_every == 0:
        print("Episode {} || EMA: {} ".format(ep, ema.s))

Episode 0 || EMA: 15.0 
Episode 500 || EMA: 14.740729513782314 
Episode 1000 || EMA: 71.29630030249417 
Episode 1500 || EMA: 94.97525109548101 
Episode 2000 || EMA: 39.13963542750589 
Episode 2500 || EMA: 53.68169916203771 
Episode 3000 || EMA: 155.9845074993321 
Episode 3500 || EMA: 26.210668730810067 
Episode 4000 || EMA: 17.22734178837546 
Episode 4500 || EMA: 86.7583548005704 
Episode 5000 || EMA: 9.71585051897232 
Episode 5500 || EMA: 104.7753527800968 
Episode 6000 || EMA: 9.521699917472723 
Episode 6500 || EMA: 9.04936570210145 
Episode 7000 || EMA: 10.231812839064318 
Episode 7500 || EMA: 52.183884014729365 
Episode 8000 || EMA: 9.057574187224324 
Episode 8500 || EMA: 65.04465573021452 
Episode 9000 || EMA: 68.72342462901513 
Episode 9500 || EMA: 106.56567032821476 
