# 根據 CartPole 的遊戲，來實作 Policy Gradient
---

In [None]:
%matplotlib inline
import gym
import matplotlib.pyplot as plt
from  RLAgent_PolicyGradient import PolicyGradient
import numpy as np
import keyboard

## 建造遊戲環境
---
![](imgs/input_output.png "環境參數圖")
[原始連結](https://github.com/openai/gym/wiki/CartPole-v0)

In [None]:
# Build the pole-balancing environment.
env = gym.make("CartPole-v0")
# Seed before unwrapping so runs are reproducible; plain policy gradient
# has high variance from run to run.
env.seed(1)
# Work on the underlying environment object instead of the wrapped one
# (presumably to drop the limits the wrappers impose -- confirm).
env = env.unwrapped

# Dump the action/observation space description, one item per line:
# action space, number of actions, observation space, observation length,
# then the upper and lower bounds of the observation values.
print(
    env.action_space,
    env.action_space.n,
    env.observation_space,
    env.observation_space.shape[0],
    env.observation_space.high,
    env.observation_space.low,
    sep="\n",
)

## 測試遊戲
---
Example:  
<pre>array([ 0.10472821,  3.1417834 ,  1.96147299,  5.57941823]), 0.0, True, {}</pre> 
輸出會有四個東西：  
1. 代表 Observation 的值  
2. Reward (這裡注意，在還可以救起來之前，Reward 都是 1，且 IsDone 為 False) 
3. IsDone
4. Info 好像沒有用  

而 **Action 是一個 Int => 0 or 1 !!**

In [None]:
# Reset & render
# print(env.reset())
# env.render()

In [None]:
# env.render()
# print(env.step(1))

In [None]:
# env.render(close=True)

## 開始跑結果
---

In [None]:
# Create the agent.
# The two positional arguments are the number of actions and the length of
# the observation vector, both read from the environment built above.
Agent = PolicyGradient(
    env.action_space.n,
    env.observation_space.shape[0],
    LearningRate = 0.02,
    RewardDecay = 0.99,  # presumably the discount factor for future rewards -- confirm in RLAgent_PolicyGradient
    # IsOutputGraph = True
)

In [None]:
# Training Part
def TrainModel(RenderThresold = 400, MaxEpisodes = 3000, IsRender = False):
    """Train the module-level ``Agent`` on the module-level ``env``.

    Runs up to ``MaxEpisodes`` episodes. Every step of an episode is stored
    in the agent via ``Agent.storeTransition``; when the episode ends the
    running reward is updated and printed, and ``Agent.learn()`` is called.

    Args:
        RenderThresold: Training stops (the function returns) as soon as the
            running reward exceeds this value. Despite the name it does not
            switch rendering on.
        MaxEpisodes: Upper bound on the number of episodes to play.
        IsRender: When true, ``env.render()`` is called before every step.

    Returns:
        None.

    Note:
        When the threshold is reached the function returns *before*
        ``Agent.learn()`` runs, so the transitions of that final episode are
        left in the agent without a learning pass.
    """
    # Exponential moving average of the per-episode reward.
    runningReward = 0
    for episode in range(MaxEpisodes):
        # Start a fresh episode.
        observation = env.reset()

        while True:
            if IsRender:
                env.render()

            # Ask the agent for an action, then apply it to the environment.
            actionValue = Agent.chooseAction(observation)
            nextObservation, reward, IsDone, _ = env.step(actionValue)

            # Record the transition in the agent's memory.
            Agent.storeTransition(
                observation=observation,
                action=actionValue,
                reward=reward
            )

            if IsDone:
                # Sum of the rewards the agent currently has stored
                # (assumed to be this episode's rewards -- confirm that
                # Agent.learn() clears MemoryReward).
                episodeReward = np.sum(Agent.MemoryReward)

                # The first episode seeds the average; afterwards 99% of
                # the old value is blended with 1% of the new one.
                if episode == 0:
                    runningReward = episodeReward
                else:
                    runningReward = runningReward * 0.99 + episodeReward * 0.01
                # NOTE: the "Epilson" label is kept as-is so the printed
                # output stays unchanged; it is the 1-based episode number.
                print("Epilson " + format(episode + 1) + " Reward: " + format(runningReward))

                # Good enough: stop training altogether.
                if runningReward > RenderThresold:
                    return

                # Learn from the finished episode and move on to the next.
                Agent.learn()
                break

            observation = nextObservation

In [None]:
def RunModel():
    """Render the module-level ``Agent`` playing on ``env`` until Q is pressed.

    Each iteration draws the current frame, asks the agent for an action and
    applies it. When the environment reports that the episode is over, it is
    reset so that ``env.step()`` is never called on a finished episode.

    Returns:
        None.
    """
    observation = env.reset()

    while True:
        # Draw the current frame.
        env.render()

        # Let the agent pick an action and apply it; the reward and info
        # values are not needed while only displaying the model.
        actionValue = Agent.chooseAction(observation)
        observation, _, IsDone, _ = env.step(actionValue)

        # Start a new episode once this one has ended instead of stepping
        # past the terminal state.
        if IsDone:
            observation = env.reset()

        # Pressing Q ends the demonstration.
        if keyboard.is_pressed("q"):
            break

In [None]:
# Train the model
TrainModel()

In [None]:
# Show the trained model in action
# Press Q to stop
RunModel()

In [None]:
# Shut down: close the agent's session first, then the environment.
Agent.session.close()
# env.close() is used instead of env.render(close=True): the `close`
# keyword was removed from render() in later Gym releases, while close()
# is available on both old and new versions.
env.close()