In [6]:
# Implementation of REINFORCE (Monte Carlo policy gradient) in PyTorch.
# See p. 326 of "Reinforcement Learning: An Introduction", 2nd ed., Sutton & Barto.
# By Andriy Drozdyuk.
import torch
import gym

# Hyperparameters (everything a reader might want to tune lives here).
steps = 10_000  # total number of environment steps to train for
lr = 0.005      # Adam learning rate
γ = 0.9999      # discount factor
seed = 0        # RNG seed so that a fresh "Restart & Run All" is repeatable

# %%
# Seed PyTorch (weight init and action sampling) and the environment
# (initial-state sampling) before anything random happens.
torch.manual_seed(seed)

env = gym.make('CartPole-v0')
env.seed(seed)

# Read the layer sizes from the environment instead of hard-coding them,
# so the policy network always matches the observation/action spaces.
obs_dim = env.observation_space.shape[0]

# Policy network: observation -> probability of each discrete action.
# The name `nn` is kept because the later cells refer to it.
nn = torch.nn.Sequential(
    torch.nn.Linear(obs_dim, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, env.action_space.n),
    torch.nn.Softmax(dim=-1)
)
optim = torch.optim.Adam(nn.parameters(), lr=lr)

# Smoke test: one forward pass on a real observation must yield one
# probability per action. No gradient is needed for this check.
with torch.no_grad():
    _probs = nn(torch.tensor(env.reset(), dtype=torch.float))
assert isinstance(_probs, torch.Tensor)
assert _probs.shape == (env.action_space.n,)

In [7]:
def _discounted_returns(rewards, gamma):
    """Return the list ``[gamma**t * G_t for t in range(len(rewards))]``.

    ``G_t = sum_k gamma**k * rewards[t + k]`` is the discounted return from
    step ``t``. It is computed with the backward recursion
    ``G_t = rewards[t] + gamma * G_{t+1}``, which is linear in the episode
    length (the naive double loop is quadratic).

    Args:
        rewards: per-step rewards of one episode, in time order.
        gamma: discount factor.

    Returns:
        A list of floats with the same length as ``rewards``; entry ``t``
        already includes the extra ``gamma**t`` weighting used in the loss.
    """
    weighted = [0.0] * len(rewards)
    G = 0.0
    for t in reversed(range(len(rewards))):
        G = rewards[t] + gamma * G
        weighted[t] = (gamma ** t) * G
    return weighted


# Count down a local copy of the budget so the configuration value `steps`
# is not consumed: the cell can be re-run to continue training.
steps_left = steps

while steps_left > 0:
    # ---- Roll out one full episode with the current policy ----
    obs = torch.tensor(env.reset(), dtype=torch.float)
    done = False
    # EligibilityVector collects log π(a_t | s_t) for each step of the episode.
    Actions, States, Rewards, EligibilityVector = [], [], [], []

    while not done:
        probs = nn(obs)
        c = torch.distributions.Categorical(probs=probs)
        action = c.sample()
        log_prob = c.log_prob(action)  # keeps the graph for backprop
        action = action.item()

        assert isinstance(action, int)
        obs_, rew, done, _info = env.step(action)
        # The budget may go slightly negative: an episode is always finished.
        steps_left -= 1

        Actions.append(action)
        States.append(obs)
        Rewards.append(rew)
        EligibilityVector.append(log_prob)

        obs = torch.tensor(obs_, dtype=torch.float)

    # ---- Policy-gradient update for the finished episode ----
    EligibilityVector = torch.stack(EligibilityVector)
    DiscountedReturns = torch.tensor(_discounted_returns(Rewards, γ), dtype=torch.float)

    assert EligibilityVector.shape == (len(Actions),)
    assert DiscountedReturns.shape == (len(Actions),)

    # loss = -Σ_t γ^t · G_t · log π(a_t | s_t); minimising it performs gradient
    # ascent on the return. One optimizer step is taken per episode.
    loss = - torch.dot(EligibilityVector, DiscountedReturns)

    assert loss.dim() == 0

    optim.zero_grad()
    loss.backward()
    optim.step()

    print(f'Step: {steps_left}: Reward={sum(Rewards)}')



Step: 9980: Reward=20.0
Step: 9966: Reward=14.0
Step: 9953: Reward=13.0
Step: 9931: Reward=22.0
Step: 9866: Reward=65.0
Step: 9849: Reward=17.0
Step: 9830: Reward=19.0
Step: 9818: Reward=12.0
Step: 9798: Reward=20.0
Step: 9767: Reward=31.0
Step: 9736: Reward=31.0
Step: 9714: Reward=22.0
Step: 9702: Reward=12.0
Step: 9648: Reward=54.0
Step: 9632: Reward=16.0
Step: 9577: Reward=55.0
Step: 9549: Reward=28.0
Step: 9472: Reward=77.0
Step: 9407: Reward=65.0
Step: 9392: Reward=15.0
Step: 9357: Reward=35.0
Step: 9328: Reward=29.0
Step: 9298: Reward=30.0
Step: 9244: Reward=54.0
Step: 9221: Reward=23.0
Step: 9151: Reward=70.0
Step: 9107: Reward=44.0
Step: 9033: Reward=74.0
Step: 8919: Reward=114.0
Step: 8852: Reward=67.0
Step: 8821: Reward=31.0
Step: 8717: Reward=104.0
Step: 8665: Reward=52.0
Step: 8594: Reward=71.0
Step: 8538: Reward=56.0
Step: 8495: Reward=43.0
Step: 8415: Reward=80.0
Step: 8374: Reward=41.0
Step: 8334: Reward=40.0
Step: 8245: Reward=89.0
Step: 8182: Reward=63.0
Step: 8131: Re

In [8]:
# Watch the trained policy for five rendered episodes and print each
# episode's total reward.
try:
    # Pure inference: no autograd graph is needed while sampling actions.
    with torch.no_grad():
        for _ in range(5):
            obs = torch.tensor(env.reset(), dtype=torch.float)
            done = False
            env.render()
            Rewards = []
            while not done:
                probs = nn(obs)
                c = torch.distributions.Categorical(probs=probs)
                action = c.sample()
                action = action.item()

                obs_, rew, done, _info = env.step(action)
                Rewards.append(rew)
                env.render()

                obs = torch.tensor(obs_, dtype=torch.float)

            print(f'Reward: {sum(Rewards)}')
finally:
    # Always release the render window, even if a rollout is interrupted
    # or raises.
    env.close()

Reward: 200.0
Reward: 200.0
Reward: 200.0
Reward: 200.0
Reward: 200.0
