In [10]:
import os

import gym
import numpy as np
import torch
import torchvision

# Create the CartPole environment and show its observation/action spaces.
env = gym.make('CartPole-v0')
print(env.observation_space, env.action_space, sep='\n')

# Network dimensions: the input width comes from the observation vector,
# the output width from the number of discrete actions; the hidden width
# is chosen by hand.
n_hidden = 10
n_actions = env.action_space.n
n_features = env.observation_space.shape[0]
print('obs:%d, hidden:%d, act:%d' % (n_features, n_hidden, n_actions))

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
Discrete(2)
obs:4, hidden:10, act:2


In [11]:
class DQNetwork:
    """Deep Q-Network agent with experience replay and a frozen target network.

    Two identical MLPs are kept: ``net_eval`` is trained by the optimizer,
    while ``net_target`` supplies the bootstrap values and is only refreshed
    from ``net_eval`` every ``target_update_interval`` learning steps.

    Note on ``epsilon``: in this class it is the probability of acting
    *greedily* (not of exploring). With ``epsilon_increase == 0`` it is fixed
    at ``epsilon_max``; otherwise it starts at 0 and grows by
    ``epsilon_increase`` per learning step until it reaches ``epsilon_max``.

    Each replay-memory row is laid out as
    ``[obs (n_features), action, reward, next_obs (n_features)]``.
    Terminal transitions are not flagged, so targets always bootstrap from
    the next state.
    """

    def __init__(self, n_features, n_hidden, n_actions, gamma=0.9, epsilon_max=0.9, epsilon_increase=0, memory_size=500, batch_size=32, learning_rate=0.05, target_update_interval=300):
        """Create the networks, replay memory and optimizer.

        Args:
            n_features: Length of one observation vector.
            n_hidden: Width of the single hidden layer.
            n_actions: Number of discrete actions (network output width).
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon_max: Upper bound on the greedy-action probability.
            epsilon_increase: Amount added to ``epsilon`` per learning step;
                0 pins ``epsilon`` at ``epsilon_max`` from the start.
            memory_size: Number of transitions the ring buffer holds.
            batch_size: Number of transitions sampled per learning step.
            learning_rate: Adam learning rate for ``net_eval``.
            target_update_interval: Number of learning steps between copies
                of ``net_eval`` weights into ``net_target`` (must be >= 1).
        """
        self.n_features = n_features
        self.n_hidden = n_hidden
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon_max = epsilon_max
        self.epsilon_increase = epsilon_increase
        if self.epsilon_increase == 0:
            self.epsilon = self.epsilon_max
        else:
            self.epsilon = 0
        self.memory_size = memory_size
        self.memory = torch.zeros([memory_size, n_features * 2 + 2])
        self.batch_size = batch_size
        self.target_update_interval = target_update_interval

        self.memory_count = 0
        self.learn_count = 0

        self.net_eval = self._build_net(self.n_features, self.n_hidden, self.n_actions)
        self.net_target = self._build_net(self.n_features, self.n_hidden, self.n_actions)
        # The target network is never trained directly; it only receives copies.
        self.net_target.requires_grad_(False)

        self.opt = torch.optim.Adam(self.net_eval.parameters(), lr=learning_rate)

    def _build_net(self, n_features, n_hidden, n_actions):
        """Return a Linear -> ReLU -> Linear network mapping observations to Q-values."""
        net = torch.nn.Sequential(
            torch.nn.Linear(n_features, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_actions)
        )
        return net

    def choose_action(self, obs):
        """Pick an action for ``obs``.

        With probability ``epsilon`` the greedy action of ``net_eval`` is
        returned, otherwise a uniformly random one.

        Returns:
            A 0-dimensional int64 tensor holding the action index.
        """
        if torch.rand(1).item() < self.epsilon:
            # Acting does not need gradients; skip building the autograd graph.
            with torch.no_grad():
                q_values = self.net_eval(torch.as_tensor(obs, dtype=torch.float32))
            action = torch.argmax(q_values)
        else:
            action = torch.randint(self.n_actions, (1,))[0]
        return action

    def save_memory(self, o, a, r, o_):
        """Store one transition ``(o, a, r, o_)``, overwriting the oldest when full."""
        sample = torch.cat((
            torch.as_tensor(o, dtype=torch.float32),
            # float() accepts Python numbers, numpy scalars and 1-element tensors.
            torch.tensor([float(a), float(r)], dtype=torch.float32),
            torch.as_tensor(o_, dtype=torch.float32),
        ), dim=0)
        self.memory[self.memory_count % self.memory_size, :] = sample
        self.memory_count += 1

    def learn(self):
        """Run one gradient step on a random batch drawn from the replay memory.

        Does nothing if no transition has been stored yet.
        """
        if self.memory_count == 0:
            # Nothing to sample from; torch.randint(0, ...) would raise.
            return

        # Refresh the frozen target network only once per interval
        # (including the very first learning step).
        if self.learn_count % self.target_update_interval == 0:
            self.net_target.load_state_dict(self.net_eval.state_dict())

        # Sample (with replacement) only from rows that have been written.
        n_stored = min(self.memory_count, self.memory_size)
        batch = self.memory[torch.randint(n_stored, (self.batch_size, ))]

        states = batch[:, :self.n_features]
        action = batch[:, self.n_features].type(torch.int64)
        reward = batch[:, self.n_features + 1]
        next_states = batch[:, -self.n_features:]

        q_eval = self.net_eval(states)
        with torch.no_grad():
            q_next = self.net_target(next_states)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes a non-zero error; detach so the
        # target is treated as a constant.
        q_target = q_eval.detach().clone()
        q_target[torch.arange(self.batch_size), action] = reward + q_next.max(dim=1)[0] * self.gamma

        self.opt.zero_grad()
        loss = torch.nn.MSELoss()
        output = loss(q_eval, q_target)
        output.backward()
        self.opt.step()

        # Anneal towards greedy behaviour without overshooting the cap.
        if self.epsilon < self.epsilon_max:
            self.epsilon = min(self.epsilon + self.epsilon_increase, self.epsilon_max)
        self.learn_count += 1

    def save(self, save_name):
        """Serialize the whole ``net_eval`` module to ``save_name``."""
        torch.save(self.net_eval, save_name)

    def load(self, load_name):
        """Load weights saved by :meth:`save` into both networks.

        The weights are copied into the existing modules so that the
        optimizer keeps pointing at the parameters being trained and the
        target network stays frozen. A plain state dict is accepted as well.

        Raises:
            RuntimeError: If the stored weights do not match this
                network's architecture.
        """
        # NOTE: torch.load unpickles the file; only load files you trust.
        loaded = torch.load(load_name)
        state = loaded.state_dict() if isinstance(loaded, torch.nn.Module) else loaded
        self.net_eval.load_state_dict(state)
        self.net_target.load_state_dict(state)

In [None]:
# Where the trained network is written; the evaluation cell reads this file.
MODEL_PATH = './model/DeepQNetwork.pkl'

dqn = DQNetwork(n_features, n_hidden, n_actions, batch_size=100, epsilon_increase=0.005)
eps = 300   # number of training episodes
step = 0    # environment steps taken across all episodes
try:
    for ep in range(eps):
        obs = env.reset()
        while True:
            step += 1
            env.render()
            act = dqn.choose_action(obs)
            # Pass a plain int: the action is a 0-d tensor.
            obs_, _, done, _ = env.step(int(act))

            # Compute a shaped reward from the next observation: each term is
            # largest when the cart is centred / the pole is upright and drops
            # to -0.8 at the corresponding failure threshold.
            x, theta = obs_[0], obs_[2]
            r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.8
            r = r1 + r2

            dqn.save_memory(obs, act, r, obs_)
            # Let the replay memory warm up first, then learn every 5th step.
            if step > 200 and step % 5 == 0:
                dqn.learn()
            if done:
                break
            obs = obs_

        percent = (ep + 1) / eps
        print('\r%d / %d:' % (ep + 1, eps) + '■' * int(20 * percent) + '□' * (20 - int(20 * percent)), end='', flush = True)
finally:
    # Always release the render window, even if training is interrupted.
    env.close()

# Persist the trained network so the evaluation cell can load it.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
dqn.save(MODEL_PATH)

120 / 300:■■■■■■■■□□□□□□□□□□□□

In [1]:
import numpy as np
import gym
import torch

env = gym.make('CartPole-v0')
# NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load model files you produced yourself or otherwise trust.
model = torch.load('./model/DeepQNetwork.pkl')

try:
    # Play 10 episodes, always taking the greedy action, and report how many
    # steps each episode lasted.
    for i in range(10):
        step = 0
        obs = env.reset()
        while True:
            env.render()
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                act = torch.argmax(model(torch.Tensor(obs)))
            obs, _, done, _ = env.step(int(act))
            step += 1
            if done:
                print('Total steps:', step)
                break
finally:
    # Close the render window even if an episode raises or is interrupted.
    env.close()