In [1]:
import torch
import torchvision
import numpy as np
import gym

# Create the CartPole environment and show its observation and action spaces.
env = gym.make('CartPole-v0')
print(env.observation_space, env.action_space, sep='\n')

# Layer sizes for the policy network: the input and output widths are read
# from the environment, the hidden width is a hand-picked constant.
n_hidden = 10
n_actions = env.action_space.n
n_features = env.observation_space.shape[0]
print('obs:%d, hidden:%d, act:%d' % (n_features, n_hidden, n_actions))

Box(4,)
Discrete(2)


In [3]:
class PGNetwork:
    """Monte-Carlo policy-gradient agent backed by a two-layer softmax MLP.

    Usage: call ``choose_action`` for every step of an episode, record the
    step with ``add_sample``, then call ``learn`` once when the episode ends.
    ``learn`` performs a single Adam step and clears the episode buffers.

    Args:
        n_features: size of an observation vector (network input width).
        n_hidden: width of the single hidden layer.
        n_actions: number of discrete actions (network output width).
        gamma: discount factor used when turning rewards into returns.
        learning_rate: learning rate handed to the Adam optimizer.
    """

    # Lower bound used to keep log() and the baseline division well defined.
    _EPS = 1e-8

    def __init__(self, n_features, n_hidden, n_actions, gamma=0.95, learning_rate=0.05):
        self.n_features = n_features
        self.n_hidden = n_hidden
        self.n_actions = n_actions
        self.gamma = gamma
        # Remembered so load() can rebuild the optimizer for the loaded network.
        self.learning_rate = learning_rate

        # Per-episode buffers; learn() empties them again.
        self.obs = []
        self.act_probs = []
        self.actions = []
        self.rs = []
        self.net = self._build_net(self.n_features, self.n_hidden, self.n_actions)

        self.opt = torch.optim.Adam(self.net.parameters(), lr=learning_rate)

        # Baseline statistics: mean and std of the returns of the episode with
        # the highest mean return seen so far (both start at 0).
        self.max_mean = 0
        self.max_std = 0

    def _build_net(self, n_features, n_hidden, n_actions):
        """Return the policy network: Linear -> Tanh -> Linear -> Softmax."""
        net = torch.nn.Sequential(
            torch.nn.Linear(in_features=n_features, out_features=n_hidden),
            torch.nn.Tanh(),
            torch.nn.Linear(in_features=n_hidden, out_features=n_actions),
            # Explicit dim: normalise over the action axis for both a single
            # observation (1-D) and a batch (2-D) instead of relying on the
            # deprecated implicit-dim behaviour.
            torch.nn.Softmax(dim=-1)
        )
        return net

    def choose_action(self, obs):
        """Sample an action for ``obs`` from the current policy.

        Returns:
            (act_prob, action): the action-probability tensor produced by the
            network (still attached to the autograd graph so learn() can
            back-propagate through it) and the sampled action as a 0-d tensor.
        """
        act_prob = self.net(torch.Tensor(obs))
        action = torch.multinomial(act_prob, 1)[0]
        return act_prob, action

    def add_sample(self, o, ap, a, r):
        """Record one step: observation, action probabilities, action, reward."""
        self.obs.append(o)
        self.act_probs.append(ap.view(1, self.n_actions))
        self.actions.append(a)
        self.rs.append(r)

    def _discounted_returns(self):
        """Return a 1-D tensor with the discounted return of every buffered step."""
        returns = torch.zeros(len(self.rs))
        running = 0.0
        # Walk backwards so each return reuses the one that follows it.
        for i in reversed(range(len(self.rs))):
            running = running * self.gamma + float(self.rs[i])
            returns[i] = running
        return returns

    def learn(self):
        """Run one policy-gradient update from the buffered episode.

        Does nothing when no samples have been recorded. The buffers are
        cleared afterwards.
        """
        if not self.rs:
            return

        actions_prob = torch.cat(self.act_probs, dim=0)
        # Accept both plain ints and 0-d tensors as stored actions.
        actions = torch.as_tensor([int(a) for a in self.actions], dtype=torch.long)

        value = self._discounted_returns()
        if torch.mean(value) > self.max_mean:
            self.max_mean = torch.mean(value)
            # std of a single element is undefined; treat it as "no scale".
            self.max_std = torch.std(value) if value.numel() > 1 else torch.zeros(())
        value = value - self.max_mean
        # Only rescale when the stored std is usable; a zero (initial value or
        # constant returns) or NaN std would otherwise turn the loss into inf/NaN.
        if float(self.max_std) > self._EPS:
            value = value / self.max_std

        # Select the probability of the action actually taken. Multiplying the
        # full log-probability matrix by a one-hot mask produces inf * 0 = NaN
        # as soon as the softmax saturates to an exact 0 for another action.
        chosen_prob = actions_prob.gather(1, actions.view(-1, 1)).squeeze(1)
        neg_log_prob = -torch.log(torch.clamp(chosen_prob, min=self._EPS))
        loss = torch.mean(neg_log_prob * value)

        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        self.obs = []
        self.act_probs = []
        self.actions = []
        self.rs = []

    def save(self, save_name):
        """Serialise the whole network module to ``save_name`` via torch.save."""
        torch.save(self.net, save_name)

    def load(self, load_name):
        """Replace the network with the module stored at ``load_name``.

        SECURITY: torch.load unpickles the file, which can execute arbitrary
        code; only load files from a trusted source.
        """
        self.net = torch.load(load_name)
        # The old optimizer still points at the previous network's parameters;
        # rebuild it so later learn() calls update the loaded network. This
        # resets Adam's running moment estimates.
        self.opt = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)

In [6]:
# Train a fresh agent for `eps` episodes, performing one update per episode.
# NOTE(review): the evaluation cell loads './model/PolicyGradient.pkl', but no
# visible cell calls pgn.save(...) - confirm where the model file is written.
pgn = PGNetwork(n_features, n_hidden, n_actions)
eps = 300
try:
    for ep in range(eps):
        obs = env.reset()
        done = False
        while not done:
            env.render()
            act_prob, action = pgn.choose_action(obs)
            # Hand the environment a plain Python int rather than a 0-d array.
            obs_, _, done, _ = env.step(action.item())

            # Shaped reward replacing the environment's own reward (discarded
            # above): each term is 1 at the centre / upright position and
            # falls to 0 at the corresponding threshold.
            x, x_dot, theta, theta_dot = obs_
            r1 = (env.x_threshold - abs(x))/env.x_threshold
            r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians
            r = r1 + r2

            # Store the observation the action was chosen from, then advance.
            pgn.add_sample(obs, act_prob, action, r)
            obs = obs_
        pgn.learn()

        # Single-line text progress bar, redrawn in place after every episode.
        percent = (ep + 1) / eps
        print('\r%d / %d:' % (ep + 1, eps) + '■' * int(20 * percent) + '□' * (20 - int(20 * percent)), end='', flush = True)
finally:
    # Always release the environment, even if training fails or is interrupted.
    env.close()

300 / 300:■■■■■■■■■■■■■■■■■■■■

In [5]:
# Close the environment by hand; this is the same call the training cell makes
# on completion (presumably kept for when training is stopped early - confirm).
env.close()

In [1]:
import numpy as np
import gym
import torch

# Play 10 rendered episodes with a stored policy, always taking the most
# probable action, and report how many steps each episode lasted.
env = gym.make('CartPole-v0')
try:
    # SECURITY: torch.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    model = torch.load('./model/PolicyGradient.pkl')

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i in range(10):
            step = 0
            obs = env.reset()
            done = False
            while not done:
                env.render()
                act = torch.argmax(model(torch.Tensor(obs)))
                # Hand the environment a plain Python int rather than a 0-d array.
                obs, _, done, _ = env.step(act.item())
                step += 1
            print('Total steps:', step)
finally:
    # Release the environment even if loading or a rollout fails.
    env.close()