In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

from train_ram import train
from agent import Agent
from config import DEVICE as device

In [2]:
# Build the LunarLander environment that every experiment cell below trains on.
env = gym.make('LunarLander-v2')

# Sizes handed to Agent(...): the number of entries in action_space (`.n`)
# and the length of the first axis of the observation space.
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

## Ablation Study

Ablation的本意是`切除`，Ablation Study就是通过控制变量法，去除掉系统的一部分后，观察系统的表现，来理解每个独立组成部分对于整个系统的重要性。这里，我们主要考察两个部分的重要性：
  * Baseline
  * Normalization

我们将在其他条件不变的情况下研究：
  * 同时去除Baseline和Normalization
  * 只去除Normalization
  * 只去除Baseline
  * 同时使用Baseline和Normalization

然后，通过它们的训练表现，来判断每个部分的意义。

### Remove Baseline and Normalization

In [3]:
# Ablation 1/4: critic baseline disabled (use_critic=False) and
# normalization disabled (normalize=False).
#
# NOTE(review): the saved output of this cell is
# `IndexError: too many indices for tensor of dimension 2`; the cause is not
# visible in this notebook -- TODO investigate in agent / train_ram.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=False,
    normalize=False,
)

# max_t is 1000 here so that this run differs from the other three ablation
# cells only in the ablated flags (it was 1500, which changed a second
# variable and broke the controlled comparison).
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the last title field repeats the
# `scale` value passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

IndexError: too many indices for tensor of dimension 2

### Only remove Normalization

In [None]:
# Ablation 2/4: critic baseline kept (use_critic=True), normalization
# disabled (normalize=False).
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=False,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the last title field repeats the
# `scale` value passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

### Only remove Baseline (Still have implicit Baseline)

In [None]:
# Ablation 3/4: critic baseline disabled (use_critic=False), normalization
# kept (normalize=True).
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=False,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the last title field repeats the
# `scale` value passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

### Apply both Baseline and Normalization

In [None]:
# Ablation 4/4: full configuration -- critic baseline (use_critic=True) and
# normalization (normalize=True) both enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the last title field repeats the
# `scale` value passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

## Tune Learning Rate
接下来，我们在小范围内调整学习率，来查看整个算法对学习率的敏感程度。值得一提的是，大部分的强化学习算法对于学习率都很敏感

### lr=0.001

In [None]:
# Learning-rate sweep: lr=0.001, with share, use_critic and normalize all
# enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.001,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the title reads lr back from the agent.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

### lr=0.01

In [None]:
# Learning-rate sweep: lr=0.01, with share, use_critic and normalize all
# enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.01,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the title reads lr back from the agent.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

### lr=0.05

In [None]:
# Learning-rate sweep: lr=0.05, with share, use_critic and normalize all
# enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.05,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the title reads lr back from the agent.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

## Network Structure

在介绍网络结构时我们说过，如果Actor和Critic有相同的前几层，可以认为前几层在同时学习两个task，主流的观念认为multi-task learning对于神经网络的训练整体是有帮助的，这里我们也给大家展示Actor和Critic完全分开的情况

In [None]:
# Network-structure experiment: share=False, i.e. the actor and critic are
# kept fully separate (per the section text), with baseline and
# normalization enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=False,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the last title field repeats the
# `scale` value passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

In [None]:
# NOTE(review): this cell uses exactly the same arguments as the cell
# directly above it (share=False, use_critic=True, normalize=True).
# Presumably a second run to gauge run-to-run variance -- confirm or remove.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=False,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# Plot both series returned by train(); the last title field repeats the
# `scale` value passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

## Tune Reward Scale
很多时候，Reward的大小和范围会对训练算法有影响，在之前的实验中我们将Reward放缩为了原来的0.01倍，下面我们使用不同的放缩率，再看算法的表现情况

### scale=0.1

In [None]:
# Reward-scale sweep: scale=0.1, with share, use_critic and normalize all
# enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.1,
)

# The last title field is a literal and must be kept equal to the `scale`
# argument passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.1
    )
)
plt.legend()
plt.show()

### scale=1

In [None]:
# Reward-scale sweep: scale=1, with share, use_critic and normalize all
# enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=1,
)

# The last title field is a literal and must be kept equal to the `scale`
# argument passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 1
    )
)
plt.legend()
plt.show()

### scale=0.001

In [None]:
# Reward-scale sweep: scale=0.001, with share, use_critic and normalize all
# enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.001,
)

# The last title field is a literal and must be kept equal to the `scale`
# argument passed to train() above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.001
    )
)
plt.legend()
plt.show()

## Tune Learning Frequency

然后我们查看如果利用更多样本来学习是否会提升效果

In [None]:
# Learning-frequency experiment: update_frequency=2, with share, use_critic
# and normalize all enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)

# NOTE(review): max_t is 1500 here, while the ablation, learning-rate and
# reward-scale cells use 1000 -- confirm this is intentional, otherwise the
# comparison against those runs varies two things at once.
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1500,
    update_frequency=2,
    scale=0.01,
)

# update_frequency is the only parameter varied in this section, so it is
# shown in the title; without it this figure's title is identical to the
# other update_frequency run and to the baseline run. The last two title
# fields are literals and must match the train() arguments above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}, update_frequency={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01, 2
    )
)
plt.legend()
plt.show()

In [None]:
# Learning-frequency experiment: update_frequency=4, with share, use_critic
# and normalize all enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    share=True,
    use_critic=True,
    normalize=True,
)

# NOTE(review): max_t is 1500 here, while the ablation, learning-rate and
# reward-scale cells use 1000 -- confirm this is intentional, otherwise the
# comparison against those runs varies two things at once.
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1500,
    update_frequency=4,
    scale=0.01,
)

# update_frequency is the only parameter varied in this section, so it is
# shown in the title; without it this figure's title is identical to the
# other update_frequency run and to the baseline run. The last two title
# fields are literals and must match the train() arguments above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Share={}, use_critic={}, normalize={}, lr={}, Scale={}, update_frequency={}'.format(
        agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01, 4
    )
)
plt.legend()
plt.show()

## MC or TD?

最后，我们将计算G的方式从Monte Carlo切换为Temporal Difference，查看算法的表现

In [None]:
# TD run: mode='TD' with separate networks (share=False), critic baseline
# and normalization enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    mode='TD',
    share=False,
    use_critic=True,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# The title starts with the mode: without it this figure carries exactly the
# same title as the earlier run with the same flags that did not pass
# mode='TD'. The 'TD' and 0.01 fields are literals and must match the
# arguments above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Mode={}, Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        'TD', agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

In [None]:
# TD run: mode='TD' with shared layers (share=True), critic baseline enabled,
# normalization disabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    mode='TD',
    share=True,
    use_critic=True,
    normalize=False,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# The title starts with the mode: without it this figure carries exactly the
# same title as the earlier run with the same flags that did not pass
# mode='TD'. The 'TD' and 0.01 fields are literals and must match the
# arguments above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Mode={}, Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        'TD', agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

In [None]:
# TD run: mode='TD' with separate networks (share=False), critic baseline
# disabled, normalization enabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    mode='TD',
    share=False,
    use_critic=False,
    normalize=True,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# The title starts with the mode so that TD figures cannot be confused with
# runs that did not pass mode='TD' but used the same flags. The 'TD' and
# 0.01 fields are literals and must match the arguments above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Mode={}, Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        'TD', agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()

In [None]:
# TD run: mode='TD' with separate networks (share=False), critic baseline
# and normalization both disabled.
agent = Agent(
    state_size,
    action_size,
    lr=0.005,
    gamma=0.99,
    device=device,
    mode='TD',
    share=False,
    use_critic=False,
    normalize=False,
)
rewards, average_log = train(
    agent,
    env,
    n_episode=2000,
    max_t=1000,
    scale=0.01,
)

# The title starts with the mode so that TD figures cannot be confused with
# runs that did not pass mode='TD' but used the same flags. The 'TD' and
# 0.01 fields are literals and must match the arguments above.
plt.figure(figsize=(17, 8))
plt.plot(rewards, label='episodic reward', color='black')
plt.plot(average_log, label='moving average', color='green')
plt.title(
    'Mode={}, Share={}, use_critic={}, normalize={}, lr={}, Scale={}'.format(
        'TD', agent.share, agent.use_critic, agent.normalize, agent.lr, 0.01
    )
)
plt.legend()
plt.show()