In [8]:
# Implementation of the REINFORCE algorithm in PyTorch.
# See p. 326 Reinforcement Learning 2nd ed. Sutton & Barto 
# By Andriy Drozdyuk.
import torch
import gym

# --- Hyperparameters and run state -------------------------------------------
step = 0            # environment steps taken so far (advanced by the training loop)
max_steps = 10_000  # training budget, measured in environment steps
lr = 0.005          # learning rate handed to Adam
γ = 0.9999          # discount factor used when computing returns

# %%
env = gym.make('CartPole-v0')

# Policy network: maps an observation to a probability for each discrete action.
# Both layer widths that touch the environment are read from its spaces, so the
# model keeps matching the env if the environment id above is changed.
# NOTE: `nn` is the model *instance*, not the `torch.nn` module.
obs_dim = env.observation_space.shape[0]
nn = torch.nn.Sequential(
    torch.nn.Linear(obs_dim, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, env.action_space.n),
    torch.nn.Softmax(dim=-1)  # normalise over the action dimension
)
optim = torch.optim.Adam(nn.parameters(), lr=lr)

# Smoke test: a freshly reset observation must pass through the network.
assert isinstance(nn(torch.tensor(env.reset(), dtype=torch.float)), torch.Tensor)

In [9]:
# REINFORCE training loop: roll out one complete episode with the current
# policy, then take a single gradient step computed from that episode.
# The budget check happens only between episodes, so the episode in progress
# is always finished and `step` may end up slightly above `max_steps`.
while step < max_steps:
    obs = torch.tensor(env.reset(), dtype=torch.float)
    done = False
    Actions, States, Rewards, EligibilityVector = [], [], [], []

    # --- 1. Generate an episode by sampling actions from the policy ----------
    while not done:
        probs = nn(obs)
        c = torch.distributions.Categorical(probs=probs)
        action = c.sample()
        # log-probability of the sampled action; stays attached to the autograd
        # graph so the loss below can be differentiated w.r.t. the policy.
        log_prob = c.log_prob(action)
        action = action.item()

        assert isinstance(action, int)
        obs_, rew, done, _info = env.step(action)
        step += 1

        Actions.append(action)
        States.append(obs)
        Rewards.append(rew)
        EligibilityVector.append(log_prob)

        obs = torch.tensor(obs_, dtype=torch.float)

    # --- 2. Discounted returns ------------------------------------------------
    # Return from step t:  G_t = r_t + γ * G_{t+1}, accumulated in one backward
    # pass (O(T)) instead of re-summing the reward tail for every t (O(T^2)).
    # Each stored entry is additionally weighted by γ**t.
    DiscountedReturns = [0.0] * len(Rewards)
    G = 0.0
    for t in reversed(range(len(Rewards))):
        G = Rewards[t] + γ * G
        DiscountedReturns[t] = (γ**t) * G

    EligibilityVector = torch.stack(EligibilityVector)
    DiscountedReturns = torch.tensor(DiscountedReturns, dtype=torch.float)

    assert EligibilityVector.shape == (len(Actions),)
    assert DiscountedReturns.shape == (len(Actions),)

    # --- 3. Policy-gradient step ----------------------------------------------
    # Negated because the optimiser minimises, while the objective
    # sum_t (γ**t * G_t) * log π(a_t | s_t) is to be maximised.
    loss = - torch.dot(EligibilityVector, DiscountedReturns)

    assert loss.dim() == 0

    optim.zero_grad()
    loss.backward()
    optim.step()

    print(f'Step: {step}: Reward={sum(Rewards)}')



Step: 14: Reward=14.0
Step: 42: Reward=28.0
Step: 56: Reward=14.0
Step: 67: Reward=11.0
Step: 84: Reward=17.0
Step: 112: Reward=28.0
Step: 153: Reward=41.0
Step: 175: Reward=22.0
Step: 192: Reward=17.0
Step: 210: Reward=18.0
Step: 225: Reward=15.0
Step: 240: Reward=15.0
Step: 259: Reward=19.0
Step: 277: Reward=18.0
Step: 304: Reward=27.0
Step: 321: Reward=17.0
Step: 355: Reward=34.0
Step: 394: Reward=39.0
Step: 416: Reward=22.0
Step: 438: Reward=22.0
Step: 460: Reward=22.0
Step: 500: Reward=40.0
Step: 523: Reward=23.0
Step: 565: Reward=42.0
Step: 588: Reward=23.0
Step: 608: Reward=20.0
Step: 629: Reward=21.0
Step: 667: Reward=38.0
Step: 684: Reward=17.0
Step: 743: Reward=59.0
Step: 798: Reward=55.0
Step: 822: Reward=24.0
Step: 861: Reward=39.0
Step: 909: Reward=48.0
Step: 984: Reward=75.0
Step: 1018: Reward=34.0
Step: 1044: Reward=26.0
Step: 1070: Reward=26.0
Step: 1088: Reward=18.0
Step: 1139: Reward=51.0
Step: 1167: Reward=28.0
Step: 1191: Reward=24.0
Step: 1239: Reward=48.0
Step: 12

In [10]:
# Watch the trained policy for five rendered episodes and print each episode's
# total reward. Actions are still sampled from the policy distribution.
# The try/finally guarantees the environment is closed even if rendering or
# stepping raises (including a kernel interrupt).
try:
    # Named loop variable: `_` would clobber IPython's "last output" variable.
    for episode in range(5):
        obs = torch.tensor(env.reset(), dtype=torch.float)
        done = False
        env.render()
        Rewards = []
        while not done:
            # No parameter update happens here, so skip autograd bookkeeping.
            with torch.no_grad():
                probs = nn(obs)
            c = torch.distributions.Categorical(probs=probs)
            action = c.sample().item()

            obs_, rew, done, _info = env.step(action)
            Rewards.append(rew)
            env.render()

            obs = torch.tensor(obs_, dtype=torch.float)

        print(f'Reward: {sum(Rewards)}')
finally:
    env.close()

Reward: 168.0
Reward: 123.0
Reward: 182.0
Reward: 200.0
Reward: 119.0
