初回インストール物

In [1]:
!pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Note: you may need to restart the kernel to use updated packages.


コード

In [1]:
%matplotlib inline
from IPython import display
import matplotlib.pyplot as plt

import gym
import numpy as np
import time
import pprint

display_notebook = False # サーバ上でやる場合はTrueにすること（フレームレート下がるので不要ならFalse）

強化学習

ネットワーク

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam

In [3]:
from SAC.model import SACNet

学習

In [4]:
from SAC.learn import calc_critic_loss, calc_policy_loss, update_params, calc_entropy_loss, update_params, soft_update

In [5]:
def update(learner_model, optim,
           observation_buffer,
           action_buffer,
           reward_buffer,
           done_buffer,
           gamma=0.99,
           weights=1,
           tau=0.005):
    soft_update(learner_model.critic_target, learner_model.critic, tau)
    observations = torch.Tensor(observation_buffer)
    states = observations[1:]
    next_states = observations[:-1]
    actions = torch.Tensor(action_buffer)
    rewards = torch.Tensor(reward_buffer)
    dones = torch.Tensor(done_buffer)
    
    q1_loss, q2_loss, errors, mean_q1, mean_q2 =\
        calc_critic_loss(learner_model, states, actions, rewards, dones, next_states, weights, optim.alpha, gamma)
    policy_loss, entropies = calc_policy_loss(learner_model, states, weights, optim.alpha)

    update_params(
        optim.q1_optim, learner_model.critic.Q1, q1_loss, 40)
    update_params(
        optim.q2_optim, learner_model.critic.Q2, q2_loss, 40)
    update_params(
        optim.policy_optim, learner_model.policy, policy_loss, 40)

    if optim.entropy_tuning:
        entropy_loss = calc_entropy_loss(optim.log_alpha, optim.target_entropy, entropies, weights)
        update_params(optim.alpha_optim, None, entropy_loss)
        optim.alpha = optim.log_alpha.exp()

    #actor_model.policy.load_state_dict(learner_model.policy.state_dict())
    #episode_returns = batch["episode_return"][batch["done"]]
    stats = {
    #    "0_episode_returns": tuple(episode_returns.cpu().numpy()),
    #    "1_mean_episode_return": torch.mean(episode_returns).item(),
        "2_q1_loss": q1_loss.item(),
        "3_q2_loss": q2_loss.item(),
        "4_policy_loss": policy_loss.item(),
        "5_entropy_loss": entropy_loss.item(),
        "6_alpha": optim.alpha.item(),
        "7_entoropy": entropies.mean().item()
    }
    return stats
    

In [6]:
class optimizer:
    def __init__(self, model, entropy_tuning=True, learning_rate=0.01, device="cpu"):
        self.policy_optim = Adam(model.policy.parameters(), lr=learning_rate)
        self.q1_optim = Adam(model.critic.Q1.parameters(), lr=learning_rate)
        self.q2_optim = Adam(model.critic.Q2.parameters(), lr=learning_rate)

        self.entropy_tuning = entropy_tuning
        self.device = device

        if self.entropy_tuning:
            # Target entropy is -|A|.
            self.target_entropy = -torch.prod(torch.Tensor(
                model.num_actions).to(device)).item()
            # We optimize log(alpha), instead of alpha.
            self.log_alpha = torch.zeros(
                1, requires_grad=True, device=device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=learning_rate)
        else:
            # fixed alpha
            self.alpha = torch.tensor(ent_coef).to(self.device)

訓練

In [7]:

def train(model, optim, n_train_steps, env_name, tau=0.005, seed=None):
    start_time = time.time()
    env = gym.make(env_name)
    if seed != None:
        env.seed(seed)
    
    t = 0
    while(True):
        observation_buffer = []
        action_buffer = []
        reward_buffer = []
        done_buffer = []
        
        observation = env.reset()
        observation_buffer.append(observation)
        
        total_reward = 0
        while(True):
            agent_output, _ = model.act(observation)
            action = agent_output["action"].item()
            
            observation, reward, done, info = env.step(action) # 行動を環境に反映させる
            total_reward += reward
            
            observation_buffer.append(observation)
            reward_buffer.append(reward)
            action_buffer.append(action)

            if done:
                done_buffer.append(1)
            else:
                done_buffer.append(0)
            t += 1
            if done:
                display.clear_output(wait=True)
                stats = update(model, optim, observation_buffer, action_buffer, reward_buffer, done_buffer, gamma=0.99, tau=tau)
                
                print("step : "+str(t)+" / "+str(n_train_steps))
                print("sps : "+str(t/(time.time()-start_time)))
                print("total reward : "+str(total_reward))
                print("Stats:\n%s"%( pprint.pformat(stats) ) )
                break
        if t >= n_train_steps:
            break
    

In [8]:
def test(model, env_name, n_episode=1, seed=None):
    env = gym.make(env_name)
    if seed != None:
        env.seed(seed)
    
    sum_total_reward = 0
    for ep in range(n_episode):
        observation = env.reset()
        total_reward = 0
        
        while(True):
            img = env.render("rgb_array") # 画面の表示
            if display_notebook:
                plt.imshow()
                display.clear_output(wait=True)
                display.display(plt.gcf())

            action = model.act_greedy(observation)
            observation, reward, done, info = env.step(action.item()) # 行動を環境に反映させる
            total_reward += reward
            if done:
                print("total reward : "+str(total_reward))
                sum_total_reward += total_reward
                break
    print("average reward : "+str(sum_total_reward/n_episode))
    env.close()

In [9]:

env_name = "CartPole-v0" # 扱う環境の指定
#env_name = 'MountainCar-v0'
env_name = "Acrobot-v1"

if env_name == 'MountainCar-v0':
    n_train_steps = 50_000
    lr=0.001
    hidden_units=[256, 256]
    tau=0.02

if env_name == "Acrobot-v1":
    n_train_steps = 20_000
    lr=0.003
    hidden_units=[128, 128]
    tau=0.05
if env_name == "CartPole-v0":
    n_train_steps = 8_000
    lr=0.001
    hidden_units=[128, 128]
    tau=0.02

# parameter setting
env = gym.make(env_name)
observation_shape = env.observation_space.shape
num_actions = env.action_space.n
env.close()

# seed setting
seed = 0
torch.manual_seed(seed) 
#random.seed(seed)  
np.random.seed(seed)  


learner_model = SACNet(observation_shape=observation_shape, num_actions=num_actions)
optim = optimizer(learner_model, learning_rate=lr)

print("train")
train(learner_model, optim, n_train_steps, env_name, tau=tau, seed=seed)

print("\ntest")
# Greedyな行動のみでテスト
test(learner_model, env_name, n_episode=3, seed=seed)

print("\nhyper parameters")
print("lr : "+str(lr))
print("hidden units : "+str(hidden_units))
print("tau : "+str(tau))

step : 20379 / 20000
sps : 2029.0943494442502
total reward : -500.0
Stats:
{'2_q1_loss': 0.03711812570691109,
 '3_q2_loss': 0.027317799627780914,
 '4_policy_loss': -0.11278628557920456,
 '5_entropy_loss': -0.13494110107421875,
 '6_alpha': 0.8813081383705139,
 '7_entoropy': 1.0941256284713745}

test
total reward : -88.0
total reward : -87.0
total reward : -112.0
average reward : -95.66666666666667

hyper parameters
lr : 0.003
hidden units : [128, 128]
tau : 0.05


学習済みモデルの保存

In [13]:
import os
folder = "./save_data/" + env_name
os.makedirs(folder, exist_ok=True)
checkpointpath = os.path.expandvars(
    os.path.expanduser(folder+"/model.tar")
)


In [14]:
torch.save(
    {
        #"model_state_dict": model.state_dict(),
        "learner_model_state_dict": learner_model.state_dict(),
        "optimizer_state_dict": optim.state_dict(),
    },
    checkpointpath,
)