In [1]:
import torch
import torch.nn as nn
import torchvision
import numpy as np
import gym

# Reproducibility: seed every source of randomness the notebook draws from
# (NumPy exploration noise / replay sampling, PyTorch weight init, Gym resets).
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make('Pendulum-v0')
env.seed(SEED)
env.action_space.seed(SEED)
print(env.observation_space)
print(env.action_space)

# Problem dimensions are read from the environment's spaces.
n_features = env.observation_space.shape[0]   # length of one observation vector
n_actions = env.action_space.shape[0]         # length of one action vector
b_actions = env.action_space.high[0]          # upper bound of the first action component
n_hiddens = 30                                # hidden-layer width shared by actor and critic

print("n_features:", n_features, "n_actions:", n_actions, "b_actions:", b_actions)
print(torch.cuda.is_available())

Box(-8.0, 8.0, (3,), float32)
Box(-2.0, 2.0, (1,), float32)
n_features: 3 n_actions: 1 b_actions: 2.0
True


In [2]:
class Actor(nn.Module):
    """Policy network: maps a state vector to an action scaled by ``b_actions``.

    Architecture: Linear(n_features -> n_hiddens) -> ReLU ->
    Linear(n_hiddens -> n_actions) -> Tanh, then multiplied by ``b_actions``.
    """

    def __init__(self, n_features, n_hiddens, n_actions, b_actions):
        super().__init__()
        # Keep the construction arguments around; forward() reads two of them.
        self.n_features, self.n_hiddens = n_features, n_hiddens
        self.n_actions, self.b_actions = n_actions, b_actions

        # Layers are created in the same order as they are applied.
        self.fc1 = nn.Linear(n_features, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, n_actions)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return the action(s) for state(s) ``x``; the last dim must equal ``n_features``."""
        assert x.shape[-1] == self.n_features

        hidden = self.relu(self.fc1(x))
        squashed = self.tanh(self.fc2(hidden))
        # tanh output lies in [-1, 1]; scale it by the action bound.
        return squashed * self.b_actions

class Critic(nn.Module):
    """Q-network: scores a (state, action) pair with a single scalar.

    State and action are each projected to ``n_hiddens`` units by their own
    linear layer, the projections are summed, passed through ReLU, and
    reduced to one output by a final linear layer.
    """

    def __init__(self, n_features, n_actions, n_hiddens):
        super().__init__()
        self.n_features, self.n_actions, self.n_hiddens = n_features, n_actions, n_hiddens

        self.fc_s = nn.Linear(n_features, n_hiddens)   # state branch
        self.fc_a = nn.Linear(n_actions, n_hiddens)    # action branch
        self.fc = nn.Linear(n_hiddens, 1)              # output head
        self.relu = nn.ReLU()

    def forward(self, s, a):
        """Return the value for state(s) ``s`` and action(s) ``a``.

        The last dim of ``s`` must equal ``n_features`` and the last dim of
        ``a`` must equal ``n_actions``.
        """
        assert s.shape[-1] == self.n_features
        assert a.shape[-1] == self.n_actions

        merged = self.fc_s(s) + self.fc_a(a)
        return self.fc(self.relu(merged))
        
class DDPGNetwork:
    """DDPG agent: actor/critic pair, their target copies, and a replay buffer.

    The replay buffer is a ``(capacity, 2 * n_features + n_actions + 1)``
    float32 array whose rows are laid out as ``[s | a | r | s_]``. ``pointer``
    counts every transition ever stored; rows are overwritten cyclically once
    ``pointer`` exceeds ``capacity``.

    Args:
        n_features: length of a state vector.
        n_actions: length of an action vector.
        b_actions: scale applied to the actor's tanh output.
        n_hiddens: hidden width of both actor and critic.
        capacity: number of transitions the replay buffer holds.
        tau: soft-update rate used by ``learn`` for the target networks.
        gamma: discount factor used in the TD target.
        lr_actor: Adam learning rate for the actor.
        lr_critic: Adam learning rate for the critic.
    """

    def __init__(self, n_features, n_actions, b_actions, n_hiddens,
                 capacity=10000, tau=0.02, gamma=0.9, lr_actor=0.001, lr_critic=0.001):
        self.n_features = n_features
        self.n_actions = n_actions
        self.b_actions = b_actions
        self.n_hiddens = n_hiddens
        self.pointer = 0
        self.capacity = capacity
        self.memory = np.zeros((self.capacity, n_features * 2 + n_actions + 1), dtype=np.float32)
        self.tau = tau
        self.gamma = gamma
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        self.actor = Actor(n_features, n_hiddens, n_actions, b_actions)
        self.actor_target = Actor(n_features, n_hiddens, n_actions, b_actions)
        self.critic = Critic(n_features, n_actions, n_hiddens)
        self.critic_target = Critic(n_features, n_actions, n_hiddens)
        # Freshly built modules get independent random weights; the targets
        # must start as exact copies of the online networks.
        self._sync_targets()

        self.actor_op = torch.optim.Adam(self.actor.parameters(), lr=self.lr_actor)
        self.critic_op = torch.optim.Adam(self.critic.parameters(), lr=self.lr_critic)

        self.mse = nn.MSELoss()

    def _sync_targets(self):
        """Hard-copy the online networks' weights into the target networks."""
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

    def choose_action(self, s):
        """Return the actor's action for state ``s`` as a NumPy array (no exploration noise)."""
        state = torch.as_tensor(s, dtype=torch.float32)
        # Inference only: skip building an autograd graph.
        with torch.no_grad():
            action = self.actor(state)
        return action.numpy()

    def save_memory(self, s, a, r, s_):
        """Store one transition ``(s, a, r, s_)``, overwriting the oldest row when full."""
        # ravel() lets scalars and arrays of any shape be packed into one flat row.
        row = np.concatenate([np.ravel(s), np.ravel(a), np.ravel(r), np.ravel(s_)])
        self.memory[self.pointer % self.capacity] = row
        self.pointer += 1

    def learn(self, batch_size):
        """Run one critic update, one actor update and one soft target update.

        ``batch_size`` rows are sampled with replacement from the rows filled
        so far. Does nothing when the buffer is still empty.
        """
        n_stored = min(self.pointer, self.capacity)
        if n_stored == 0:
            return  # nothing to sample from yet

        batch_index = np.random.choice(n_stored, size=batch_size)
        batch = torch.from_numpy(self.memory[batch_index])

        # Column offsets of the [s | a | r | s_] row layout.
        a_start = self.n_features
        r_start = a_start + self.n_actions
        s_start = r_start + 1
        s = batch[:, :a_start]
        a = batch[:, a_start:r_start]
        r = batch[:, r_start:s_start]
        s_ = batch[:, s_start:]

        # The TD target is a constant for the critic update, so no gradient
        # is recorded through the target networks.
        with torch.no_grad():
            a_ = self.actor_target(s_)
            q_target = r + self.gamma * self.critic_target(s_, a_)

        q = self.critic(s, a)
        critic_loss = self.mse(q, q_target)
        self.critic_op.zero_grad()
        critic_loss.backward()
        self.critic_op.step()

        # Actor ascends the critic's value of its own actions. The gradients
        # this leaves on the critic are cleared by the next critic zero_grad().
        actor_loss = -torch.mean(self.critic(s, self.actor(s)))
        self.actor_op.zero_grad()
        actor_loss.backward()
        self.actor_op.step()

        self.update_net(self.tau)

    def update_net(self, tau):
        """Soft-update both targets: ``target <- tau * online + (1 - tau) * target``."""
        pairs = ((self.actor, self.actor_target), (self.critic, self.critic_target))
        with torch.no_grad():
            for online, target in pairs:
                for param, param_target in zip(online.parameters(), target.parameters()):
                    param_target.copy_(tau * param + (1 - tau) * param_target)

    def save(self, actor_path, critic_path):
        """Pickle the actor and critic modules to the given paths.

        Missing parent directories are created for string / path-like targets.
        """
        import os

        for path in (actor_path, critic_path):
            if isinstance(path, (str, os.PathLike)):
                parent = os.path.dirname(os.fspath(path))
                if parent:
                    os.makedirs(parent, exist_ok=True)
        torch.save(self.actor, actor_path)
        torch.save(self.critic, critic_path)

    def load(self, actor_path, critic_path):
        """Replace the actor and critic with modules loaded from disk.

        Target networks are rebuilt as copies of the loaded networks, and the
        optimisers are recreated so they track the loaded parameters rather
        than the discarded ones (optimiser state starts fresh).
        """
        import copy

        # NOTE(security): torch.load unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        # NOTE(review): newer PyTorch releases may reject whole-module pickles
        # unless weights_only=False is passed; confirm against the installed version.
        self.actor = torch.load(actor_path)
        self.critic = torch.load(critic_path)

        self.actor_target = copy.deepcopy(self.actor)
        self.critic_target = copy.deepcopy(self.critic)

        self.actor_op = torch.optim.Adam(self.actor.parameters(), lr=self.lr_actor)
        self.critic_op = torch.optim.Adam(self.critic.parameters(), lr=self.lr_critic)

In [5]:
# Training: collect transitions with Gaussian exploration noise and start
# learning once the replay buffer has been filled once.
BATCH_SIZE = 100        # transitions sampled per learning step
NOISE_DECAY = 0.9995    # multiplicative decay of the exploration std-dev per learning step
REWARD_SCALE = 10       # rewards are divided by this before being stored

ddpg = DDPGNetwork(n_features, n_actions, b_actions, n_hiddens)
eps = 200
var = 3                 # std-dev of the exploration noise added to the actor's action
for ep in range(eps):
    s = env.reset()
    ep_r = 0
    done = False
    while not done:
        env.render()
        a = ddpg.choose_action(s)
        # Perturb the action, then clip to the environment's own bounds
        # rather than hard-coded limits.
        a = np.clip(np.random.normal(a, var), env.action_space.low, env.action_space.high)
        s_, r, done, _ = env.step(a)
        ep_r += r

        ddpg.save_memory(s, a, r / REWARD_SCALE, s_)
        if ddpg.pointer >= ddpg.capacity:
            var *= NOISE_DECAY
            ddpg.learn(BATCH_SIZE)

        s = s_

    print('epoch %d: total reward %d.' % (ep, ep_r))

env.close()

epoch 0: total reward -1033.
epoch 1: total reward -1448.
epoch 2: total reward -1285.
epoch 3: total reward -1272.
epoch 4: total reward -1029.
epoch 5: total reward -1014.
epoch 6: total reward -1293.
epoch 7: total reward -1185.
epoch 8: total reward -1372.
epoch 9: total reward -1537.
epoch 10: total reward -1625.
epoch 11: total reward -1502.
epoch 12: total reward -1012.
epoch 13: total reward -1707.
epoch 14: total reward -1648.
epoch 15: total reward -1751.
epoch 16: total reward -1553.
epoch 17: total reward -1494.
epoch 18: total reward -1288.
epoch 19: total reward -1647.
epoch 20: total reward -1150.
epoch 21: total reward -1650.
epoch 22: total reward -1309.
epoch 23: total reward -1441.
epoch 24: total reward -1478.
epoch 25: total reward -1089.
epoch 26: total reward -1447.
epoch 27: total reward -993.
epoch 28: total reward -1697.
epoch 29: total reward -1671.
epoch 30: total reward -1577.
epoch 31: total reward -1138.
epoch 32: total reward -1440.
epoch 33: total rewar

In [6]:
# Persist the trained actor and critic under model/.
ddpg.save(
    actor_path='model/DDPG_Actor.pkl',
    critic_path='model/DDPG_Critic.pkl',
)

In [7]:
# Evaluation: build a fresh agent, restore the saved actor/critic, and run
# the actor without exploration noise for a few rendered episodes.
ddpg = DDPGNetwork(n_features, n_actions, b_actions, n_hiddens)
ddpg.load('model/DDPG_Actor.pkl', 'model/DDPG_Critic.pkl')

eps = 10
for ep in range(eps):
    s, ep_r, done = env.reset(), 0, False
    while not done:
        env.render()
        a = ddpg.choose_action(s)
        s, r, done, _ = env.step(a)
        ep_r += r
    # Episode finished: report the undiscounted return.
    print('epoch %d: total reward %d.' % (ep, ep_r))
env.close()

epoch 0: total reward -263.
epoch 1: total reward -129.
epoch 2: total reward -131.
epoch 3: total reward -125.
epoch 4: total reward -129.
epoch 5: total reward -390.
epoch 6: total reward -128.
epoch 7: total reward -129.
epoch 8: total reward -5.
epoch 9: total reward -132.
