Q-learning

In [None]:
import numpy as np
from myenv import MyEnv
import math

class CartPoleSolver:
    """Tabular Q-learning agent driving a ``MyEnv`` environment.

    Each observation is a pair ``(pole_angle, pole_velocity)``.  Both values
    are bucketed into ``interval_num`` bins and combined into a single row
    index of the Q-table; the table has one column per discrete action.

    NOTE(review): ``epsilon`` is never decayed anywhere in this class, so with
    the default ``epsilon=1.0`` the greedy branch of :meth:`decide_action`
    is effectively never taken -- confirm this is intended.
    """

    # Number of discrete actions: Q-table column count and random-draw range.
    _NUM_ACTIONS = 3

    def __init__(self, gamma=1.0, epsilon=1.0, alpha=0.01, episodes=10, batch_size=1000, interval_num=6):
        """Create the environment, the discretisation bins and the Q-table.

        Args:
            gamma: Discount factor applied to the best next-state value.
            epsilon: Exploration threshold of the epsilon-greedy policy.
            alpha: Learning rate of the Q-value update.
            episodes: Number of :meth:`run` calls performed by :meth:`solve`.
            batch_size: Number of environment steps taken per :meth:`run`.
            interval_num: Number of bins each continuous variable is split into.
        """
        self.env = MyEnv()
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # epsilon-greedy exploration parameter
        self.alpha = alpha  # learning rate
        self.episodes = episodes  # how many times solve() calls run()
        self.batch_size = batch_size  # environment steps per run()
        self.interval_num = interval_num  # bins per continuous variable

        # Interior bin edges only: dropping the two outer edges makes
        # np.digitize return a value in 0..interval_num-1 for any input,
        # including values outside the nominal range.
        self.pa_bin = np.linspace(-math.pi, math.pi, interval_num + 1)[1:-1]
        self.pv_bin = np.linspace(-math.pi * 15, math.pi * 15, interval_num + 1)[1:-1]

        # NOTE(review): get_state_index only produces indices below
        # interval_num**2, so the remaining rows are never read or written.
        # The original size is kept so the table shape does not change.
        self.q_table = np.random.uniform(
            low=0, high=1, size=(interval_num**4, self._NUM_ACTIONS)
        )

    def get_state_index(self, observation):
        """Map a continuous observation to its Q-table row index.

        Args:
            observation: Two-element sequence ``(pole_angle, pole_velocity)``.

        Returns:
            ``angle_bin * interval_num + velocity_bin``.
        """
        pole_angle, pole_v = observation
        angle_bin = np.digitize(pole_angle, bins=self.pa_bin)
        velocity_bin = np.digitize(pole_v, bins=self.pv_bin)
        return angle_bin * self.interval_num + velocity_bin

    def update_Q_table(self, observation, action, reward, next_observation):
        """Apply one Q-learning update for the given transition.

        Args:
            observation: Observation the action was taken from.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_observation: Observation reached after the action.
        """
        state_index = self.get_state_index(observation)
        next_state_index = self.get_state_index(next_observation)

        # Temporal-difference target bootstraps from the best next-state value.
        best_next_q = np.max(self.q_table[next_state_index])
        current_q = self.q_table[state_index, action]
        td_error = reward + self.gamma * best_next_q - current_q
        self.q_table[state_index, action] = current_q + self.alpha * td_error

    def decide_action(self, observation, episode=0):
        """Pick an action with an epsilon-greedy policy.

        Args:
            observation: Current observation.
            episode: Unused; retained so existing callers keep working.

        Returns:
            The greedy action when ``epsilon`` is not above a fresh uniform
            draw, otherwise a uniformly random action index.
        """
        state = self.get_state_index(observation)
        if self.epsilon <= np.random.uniform(0, 1):
            return np.argmax(self.q_table[state])
        return np.random.choice(self._NUM_ACTIONS)

    def run(self):
        """Reset the environment and learn online for ``batch_size`` steps.

        Every step renders the environment, prints the current observation,
        selects an action, and updates the Q-table from the transition.
        """
        observation = self.env.reset()
        for _step in range(self.batch_size):
            self.env.render()
            print(observation)
            action = self.decide_action(observation)
            # The last two values returned by step() are ignored
            # (presumably a done flag and an info object -- TODO confirm).
            next_observation, reward, _, _ = self.env.step(action)
            self.update_Q_table(observation, action, reward, next_observation)
            observation = next_observation

    def solve(self):
        """Call :meth:`run` ``episodes`` times, keeping the Q-table between runs."""
        for _ in range(self.episodes):
            self.run()

In [None]:
# Build a solver with the default hyper-parameters and execute a single run().
a = CartPoleSolver()
a.run()

In [4]:
import numpy as np

# Scratch check: subtracting an array from itself yields an all-zero array of the same shape.
a = np.array([[1,2], [2, 3]])
a-a

array([[0, 0],
       [0, 0]])