# 根據 CartPole 的遊戲，來實作 Policy Gradient
---

In [1]:
%matplotlib inline
import gym
import matplotlib.pyplot as plt
from  RLAgent_PolicyGradient import PolicyGradient
import numpy as np
import keyboard

## 建造遊戲環境
---
![](imgs/input_output.png "環境參數圖")
[原始連結](https://github.com/openai/gym/wiki/CartPole-v0)

In [2]:
# Create the CartPole (pole-balancing) environment.
env = gym.make("CartPole-v0")
# Fix the seed so runs are reproducible; plain policy gradient has high variance.
env.seed(1)
# Work with the raw environment underneath whatever wrappers gym.make() added
# (presumably this lifts the wrapper-imposed limits on an episode -- verify
# against the installed gym version).
env = env.unwrapped

# Report the action space, its size, the observation space, its length and its
# upper / lower bounds -- one item per line, in that order.
print("\n".join(str(item) for item in (
    env.action_space,
    env.action_space.n,
    env.observation_space,
    env.observation_space.shape[0],
    env.observation_space.high,
    env.observation_space.low,
)))

Discrete(2)
2
Box(4,)
4
[  4.80000000e+00   3.40282347e+38   4.18879020e-01   3.40282347e+38]
[ -4.80000000e+00  -3.40282347e+38  -4.18879020e-01  -3.40282347e+38]


## 測試遊戲
---
Example:  
<pre>array([ 0.10472821,  3.1417834 ,  1.96147299,  5.57941823]), 0.0, True, {}</pre> 
輸出會有四個東西：  
1. 代表 Observation 的值  
2. Reward (這裡注意，在還可以救起來之前，Reward 都是 1，且 IsDone 為 False) 
3. IsDone
4. Info：附加的除錯資訊（此範例中為空字典 `{}`，訓練時不會用到）  

而 **Action 是一個 Int => 0 or 1 !!**

In [3]:
# Reset the environment and render it (left disabled)
# print(env.reset())
# env.render()

In [4]:
# env.render()
# print(env.step(1))

In [5]:
# env.render(close=True)

## 開始跑結果
---

In [6]:
# Create the agent.
# The two positional arguments are read straight from the environment: the
# number of discrete actions and the length of the observation vector.
Agent = PolicyGradient(
    env.action_space.n,
    env.observation_space.shape[0],
    LearningRate = 0.02,
    RewardDecay = 0.99,  # presumably the discount applied to future rewards -- confirm in RLAgent_PolicyGradient
    #IsOutputGraph = True
)

In [7]:
# Training Part
def TrainModel(RenderThresold = 400, MaxEpisodes = 3000, IsRender = False):
    """Train the module-level ``Agent`` on the module-level ``env``.

    Episodes are played one at a time. Every step's observation, action and
    reward is handed to ``Agent.storeTransition``; when the episode ends an
    exponentially smoothed episode reward is printed and ``Agent.learn()`` is
    called before the next episode starts.

    Args:
        RenderThresold: Training stops as soon as the smoothed reward exceeds
            this value. Despite its name it does not switch rendering on.
        MaxEpisodes: Upper bound on the number of episodes played.
        IsRender: When true, every step is drawn with ``env.render()``.

    Returns:
        None. The function returns early once the threshold is exceeded, or
        falls off the end after ``MaxEpisodes`` episodes.
    """
    runningReward = 0
    for episode in range(MaxEpisodes):
        # Start every episode from a fresh environment state.
        observation = env.reset()

        # Play until the environment reports the episode is over.
        while True:
            if IsRender:
                env.render()

            # Ask the agent for an action and apply it.
            actionValue = Agent.chooseAction(observation)
            nextObservation, reward, IsDone, Info = env.step(actionValue)

            # Record the transition so it is available when learning.
            Agent.storeTransition(
                observation=observation,
                action=actionValue,
                reward=reward
            )

            if IsDone:
                # Episode return = sum of the rewards stored for this episode.
                episodeReward = np.sum(Agent.MemoryReward)
                if episode == 0:
                    # Seed the moving average with the first episode's return.
                    runningReward = episodeReward
                else:
                    runningReward = runningReward * 0.99 + episodeReward * 0.01
                print("Epilson " + format(episode + 1) + " Reward: " + format(runningReward))

                # Good enough: stop training altogether.
                # NOTE(review): Agent.learn() is skipped on this path, so the
                # transitions stored for this last episode stay in the agent --
                # presumably learn() is what clears them; verify before calling
                # TrainModel() a second time.
                if runningReward > RenderThresold:
                    return

                # Update the policy from the finished episode.
                Agent.learn()
                break

            observation = nextObservation

In [8]:
def RunModel():
    """Let the module-level ``Agent`` play ``env`` on screen until Q is pressed.

    Every step is rendered. The agent only chooses actions here; nothing is
    stored and ``Agent.learn()`` is never called. When an episode ends the
    environment is reset and play continues, so the loop runs until the user
    presses the ``q`` key.

    Returns:
        None.
    """
    observation = env.reset()

    while True:
        # Draw the current state.
        env.render()

        # Pick an action and apply it.
        actionValue = Agent.chooseAction(observation)
        nextObservation, reward, IsDone, Info = env.step(actionValue)

        # Stepping an environment after it reported done has undefined
        # results, so begin a new episode instead of carrying on.
        observation = env.reset() if IsDone else nextObservation

        # Pressing Q ends the demo.
        if keyboard.is_pressed("q"):
            break

In [9]:
# Train the model (returns once TrainModel's reward threshold or episode limit is reached)
TrainModel()

Epilson 1 Reward: 35.0
Epilson 2 Reward: 34.83
Epilson 3 Reward: 34.671699999999994
Epilson 4 Reward: 34.784983
Epilson 5 Reward: 34.73713316999999
Epilson 6 Reward: 34.57976183829999
Epilson 7 Reward: 34.403964219916986
Epilson 8 Reward: 34.449924577717816
Epilson 9 Reward: 34.44542533194064
Epilson 10 Reward: 34.33097107862123
Epilson 11 Reward: 34.37766136783502
Epilson 12 Reward: 34.323884754156666
Epilson 13 Reward: 34.3806459066151
Epilson 14 Reward: 34.85683944754895
Epilson 15 Reward: 34.698271053073455
Epilson 16 Reward: 34.75128834254272
Epilson 17 Reward: 34.79377545911729
Epilson 18 Reward: 34.81583770452612
Epilson 19 Reward: 34.93767932748086
Epilson 20 Reward: 35.09830253420605
Epilson 21 Reward: 35.31731950886399
Epilson 22 Reward: 35.33414631377535
Epilson 23 Reward: 35.1508048506376
Epilson 24 Reward: 35.359296802131226
Epilson 25 Reward: 35.38570383410992
Epilson 26 Reward: 35.93184679576882
Epilson 27 Reward: 35.81252832781114
Epilson 28 Reward: 35.85440304453302
Ep

In [10]:
# Optional: watch the model play in a render window
# Press Q to stop
RunModel()

In [11]:
# Shut down: close the agent's session, then close the environment.
# env.close() is the supported way to tear down the render window; the
# `close` keyword of render() is not available in later gym releases.
Agent.session.close()
env.close()