In [1]:
import torch
import torchvision
import numpy as np
import gym

env = gym.make('CartPole-v0')
print(env.observation_space)
print(env.action_space)

n_features = env.observation_space.shape[0]
n_actions = env.action_space.n
n_hidden = 10
print('obs:%d, hidden:%d, act:%d' % (n_features, n_hidden, n_actions))

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
Discrete(2)
obs:4, hidden:10, act:2


In [2]:
class ActorNet:
    def __init__(self, n_features, n_hidden, n_actions, learning_rate=0.001):
        self.n_features = n_features
        self.n_hidden = n_hidden
        self.n_actions = n_actions
        self.net = self._build_net(n_features, n_hidden, n_actions)
        self.opt = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
        
    def _build_net(self, n_features, n_hidden, n_actions):
        net = torch.nn.Sequential(
            torch.nn.Linear(n_features, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_actions),
            torch.nn.Softmax()
        )
        return net
        
    def choose_action(self, obs):
        act_prob = self.net(torch.Tensor(obs))
        action = torch.multinomial(act_prob, 1)[0]
        return act_prob, action
    
    def learn(self, obs, a, td_error):
        act_prob = self.net(torch.Tensor(obs))
        log_prob = torch.log(act_prob[a])
        loss = -log_prob * td_error
        
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
    
    def save(self, save_name):
        torch.save(self.net, save_name)
        
    def load(self, load_name):
        self.net = torch.load(load_name)
        
class CriticNet:
    def __init__(self, n_features, n_hidden, learning_rate=0.01, gamma=0.95):
        self.n_features = n_features
        self.n_hidden = n_hidden
        self.gamma = gamma
        self.net = self._build_net(n_features, n_hidden)
        self.opt = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
        
    def _build_net(self, n_features, n_hidden):
        net = torch.nn.Sequential(
            torch.nn.Linear(n_features, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, 1)
        )
        return net
    
    def learn(self, obs, reward, obs_):
        val = self.net(torch.Tensor(obs))
        val_ = self.net(torch.Tensor(obs_)).detach()
        td_error = reward + self.gamma * val_ - val
        loss = torch.square(td_error)
        
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        
        return td_error.detach()
    
    def save(self, save_name):
        torch.save(self.net, save_name)
        
    def load(self, load_name):
        self.net = torch.load(load_name)

In [6]:
actor = ActorNet(n_features, n_hidden, n_actions)
critic = CriticNet(n_features, n_hidden)

stop = False
eps = 3000
for ep in range(eps):
    obs = env.reset()
    i = 0
    track_r = []
    while True:
        i += 1
        env.render()
        act_prob, action = actor.choose_action(obs)
        obs_, reward, done, info = env.step(action.numpy())
        if done and i < 200:
            reward = -20
        track_r.append(reward)
        td_error = critic.learn(obs, reward, obs_)
        actor.learn(obs, action, td_error)
        
        obs = obs_
        
        if done:
            ep_rs_sum = np.sum(track_r)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
            print('episode:', ep, ' reward:', int(running_reward), ' round:', i)
            if int(running_reward) > 170:
                stop = True
            break
    if stop:
        break
env.close()

episode: 0  reward: 74  round: 16
episode: 1  reward: 65  round: 11
episode: 2  reward: 60  round: 34
episode: 3  reward: 54  round: 22
episode: 4  reward: 48  round: 13
episode: 5  reward: 43  round: 18
episode: 6  reward: 38  round: 21
episode: 7  reward: 34  round: 12
episode: 8  reward: 29  round: 12
episode: 9  reward: 26  round: 14
episode: 10  reward: 23  round: 21
episode: 11  reward: 22  round: 34
episode: 12  reward: 19  round: 17
episode: 13  reward: 17  round: 21
episode: 14  reward: 16  round: 24
episode: 15  reward: 13  round: 13
episode: 16  reward: 13  round: 34
episode: 17  reward: 11  round: 12
episode: 18  reward: 12  round: 44
episode: 19  reward: 12  round: 34
episode: 20  reward: 12  round: 30
episode: 21  reward: 12  round: 30
episode: 22  reward: 12  round: 42
episode: 23  reward: 11  round: 20
episode: 24  reward: 9  round: 11
episode: 25  reward: 8  round: 17
episode: 26  reward: 7  round: 22
episode: 27  reward: 5  round: 13
episode: 28  reward: 6  round: 30


In [3]:
import numpy as np
import gym
import torch

env = gym.make('CartPole-v0')
model = torch.load('./model/AC_Actor.pkl')

for i in range(10):
    step = 0
    obs = env.reset()
    while(True):
        env.render()
        act = torch.argmax(model(torch.Tensor(obs)))
        obs, _, done, _ = env.step(act.numpy())
        step += 1
        if done:
            print('Total steps:', step)
            break
env.close()