In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import numpy as np
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

class Network(nn.Module):
    def __init__(self, state_space, action_space, seed):
        super(Network, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(state_space, 16),
            nn.ReLU(),
#             nn.Linear(64, 64),
#             nn.ReLU(),
            nn.Linear(16, action_space)
        )

    def forward(self, x):
        return F.softmax(self.model(torch.tensor(x).float()))
    
    
    def act(self, x):
        # https://pytorch.org/docs/stable/distributions.html#score-function
        probabilities = self.forward(x)
        m = Categorical(probabilities)
        action = m.sample()
        log = m.log_prob(action)
        
        return action, log
        

In [20]:
import gym
env = gym.make("CartPole-v1")

state_space = env.observation_space.shape[0]
action_space =env.action_space.n

In [21]:
max_episodes = 2000
max_t = 200
update_every = 4
gamma = 0.99

In [22]:
model = Network(state_space, action_space, 1234)
optimiser = optim.Adam(model.parameters(),lr=1e-2)

scores_deque = deque(maxlen=100)

for episode in range(max_episodes):
    state = env.reset()
    
    policy_log = []
    reward_log = []
    
    for t in range(max_t):
        action, log = model.act(state)
        state, reward, done, info = env.step(action.item())
        
        reward_log.append(reward)
        policy_log.append(log)
        
        if done:
            break
        
    discounts = [gamma ** i for i in range(len(reward_log)+1)]    
    rewards = [discount * reward for (discount, reward) in zip(discounts, reward_log)]
    G = np.array(rewards).sum()
    
    policy_loss = np.array([ -log * G for log in policy_log]).sum()
    
    optimiser.zero_grad()
    policy_loss.backward()
    optimiser.step()
    
    scores_deque.append(np.array(reward_log).sum())
    
    
    if episode % 100 == 0:
        print("Episode: {}\tmean reward: {}".format(episode, np.array(scores_deque).mean()))
        



Episode: 0	mean reward: 47.0
Episode: 100	mean reward: 9.55
Episode: 200	mean reward: 9.33
Episode: 300	mean reward: 9.31
Episode: 400	mean reward: 9.39
Episode: 500	mean reward: 9.32
Episode: 600	mean reward: 9.3
Episode: 700	mean reward: 9.31
Episode: 800	mean reward: 9.25
Episode: 900	mean reward: 9.47
Episode: 1000	mean reward: 9.35
Episode: 1100	mean reward: 9.33
Episode: 1200	mean reward: 9.53
Episode: 1300	mean reward: 9.42
Episode: 1400	mean reward: 9.34
Episode: 1500	mean reward: 9.15
Episode: 1600	mean reward: 9.3
Episode: 1700	mean reward: 9.42
Episode: 1800	mean reward: 9.27
Episode: 1900	mean reward: 9.37
