In [18]:
#Sarsa
import time
import numpy as np
import gymnasium as gym
class SarsaAgent(object):
    """Tabular SARSA agent with an epsilon-greedy behaviour policy.

    Parameters
    ----------
    obs_n : int
        Number of discrete states (rows of the Q-table).
    act_n : int
        Number of discrete actions (columns of the Q-table).
    learning_rate : float
        Step size applied to every temporal-difference update.
    gamma : float
        Discount factor applied to the bootstrapped next-step value.
    e_greedy : float
        Initial exploration probability (epsilon).
    epsilon_decay : float
        Amount subtracted from epsilon on every ``sample`` call while epsilon
        is still positive. The default keeps the previously hard-coded step.
    """

    def __init__(self, obs_n,act_n,learning_rate=0.01,gamma=0.9,e_greedy=0.005,epsilon_decay=0.0000001):
        self.act_n=act_n
        self.lr=learning_rate
        self.gamma=gamma
        self.epsilon=e_greedy
        self.epsilon_decay=epsilon_decay
        # Q-table: one row per state, one column per action, all zeros at start.
        self.Q=np.zeros((obs_n,act_n))

    def sample(self,obs):
        """Return an epsilon-greedy action for ``obs`` and decay epsilon once."""
        if self.epsilon > 0:
            # Clamp at zero so a large decay step cannot push epsilon negative.
            self.epsilon=max(0.0,self.epsilon-self.epsilon_decay)
        if np.random.uniform(0,1)<(1.0-self.epsilon):
            action=self.predict(obs)
        else:
            action=np.random.choice(self.act_n)
        return action

    def predict(self,obs):
        """Return a greedy action for ``obs``; ties are broken uniformly at random."""
        Q_list=self.Q[obs,:]
        maxQ=np.max(Q_list)

        # Collect every action that attains the maximum so ties are not biased
        # towards the lowest action index.
        action_list=np.where(Q_list==maxQ)[0]
        action=np.random.choice(action_list)
        return action

    def learn(self,obs,action,reward,next_obs,next_action,done):
        """Apply one SARSA update to ``Q[obs, action]``.

        The target is ``reward`` alone when ``done`` is true; otherwise it is
        ``reward + gamma * Q[next_obs, next_action]``, i.e. it bootstraps from
        the action that was actually selected for the next state.
        """
        predict_Q=self.Q[obs,action]
        if done:
            target_Q=reward
        else:
            target_Q=reward+self.gamma*self.Q[next_obs,next_action]
        self.Q[obs,action]=predict_Q+self.lr*(target_Q-predict_Q)
       
       
def run_episode(env,agent,render=False):
    """Run one SARSA training episode and return ``(total_reward, total_steps)``.

    Parameters
    ----------
    env : environment whose ``reset()`` returns a tuple with the observation
        first and whose ``step(action)`` returns a 5-tuple
        ``(obs, reward, terminated, truncated, info)``.
    agent : object providing ``sample(obs)`` and
        ``learn(obs, action, reward, next_obs, next_action, done)``.
    render : bool
        When true, ``env.render()`` is called after every step.
    """
    total_steps=0
    total_reward=0

    obs=env.reset()[0]
    action=agent.sample(obs)

    while True:
        next_obs,reward,done,truncated,info=env.step(action)
        # SARSA needs the next action before the update can be computed.
        next_action=agent.sample(next_obs)

        # Only true termination is passed as `done`: a truncated episode still
        # bootstraps from the next state-action value.
        agent.learn(obs,action,reward,next_obs,next_action,done)

        action=next_action
        obs=next_obs

        total_reward+=reward
        total_steps+=1

        if render:
            env.render()

        # Stop on truncation as well; otherwise a time-limited environment
        # would keep being stepped after its episode has ended.
        if done or truncated:
            break

    return total_reward, total_steps
   
def test_episode(env,agent):
    total_reward=0
    obs=env.reset()[0]
    #print(f"obs type: {type(obs)}, value: {obs}")
    while True:
        action=agent.predict(obs)
        next_obs,reward,done,tru,info=env.step(action)
        total_reward+=reward
        obs=next_obs
        time.sleep(0.1)
        env.render()
        if done:
            break
    return total_reward


In [19]:

# Build the CliffWalking environment and an agent sized to its discrete
# observation and action spaces.
env = gym.make("CliffWalking-v0")
agent = SarsaAgent(
    obs_n=env.observation_space.n,
    act_n=env.action_space.n,
    learning_rate=0.1,
    gamma=0.9,
    e_greedy=0.1,
)
is_render = False  # rendering stays off for the whole training run

# Training phase: 10000 episodes, logging the step count and return of each.
for episode in range(10000):
    ep_reward, ep_steps = run_episode(env, agent, render=is_render)
    print('Episode %s: steps=%s,reward=%.lf' % (episode, ep_steps, ep_reward))

# Evaluation phase: 10 episodes driven by the greedy policy.
for episode in range(10):
    total_reward = test_episode(env, agent)
    print('teset %d reward = %.lf' % (episode, total_reward))

Episode 0: steps=830,reward=-2216
Episode 1: steps=294,reward=-492
Episode 2: steps=186,reward=-285
Episode 3: steps=430,reward=-826
Episode 4: steps=297,reward=-594
Episode 5: steps=120,reward=-120
Episode 6: steps=159,reward=-159
Episode 7: steps=148,reward=-247
Episode 8: steps=146,reward=-146
Episode 9: steps=71,reward=-71
Episode 10: steps=101,reward=-200
Episode 11: steps=147,reward=-147
Episode 12: steps=103,reward=-202
Episode 13: steps=89,reward=-89
Episode 14: steps=93,reward=-93
Episode 15: steps=162,reward=-162
Episode 16: steps=38,reward=-38
Episode 17: steps=186,reward=-285
Episode 18: steps=114,reward=-114
Episode 19: steps=40,reward=-139
Episode 20: steps=114,reward=-114
Episode 21: steps=44,reward=-44
Episode 22: steps=152,reward=-251
Episode 23: steps=90,reward=-90
Episode 24: steps=55,reward=-154
Episode 25: steps=123,reward=-123
Episode 26: steps=58,reward=-58
Episode 27: steps=77,reward=-77
Episode 28: steps=44,reward=-44
Episode 29: steps=116,reward=-314
Episode 3

In [5]:
# Experiment note: decay step `epsilon -= 0.01`  ->  -13
# (presumably the step used in sample() and the resulting greedy test reward; TODO confirm)
# Show the learned action values for state 24.
agent.Q[24]

array([-7.18100419, -7.17570464, -7.19743283, -7.18179136])

In [8]:
# Experiment note: decay step `epsilon -= 0.001`  ->  -13
# (presumably the step used in sample() and the resulting greedy test reward; TODO confirm)
# Show the learned action values for state 24.
agent.Q[24]

array([-7.17896399, -7.17570464, -7.20064965, -7.18710019])

In [11]:
# Experiment note: decay step `epsilon -= 0.0001`  ->  -13
# (presumably the step used in sample() and the resulting greedy test reward; TODO confirm)
# Show the learned action values for state 24.
agent.Q[24]

array([-7.17955211, -7.17570464, -7.19729932, -7.21389891])

In [14]:
# Experiment note: decay step `epsilon -= 0.00001`  ->  -13
# (presumably the step used in sample() and the resulting greedy test reward; TODO confirm)
# Show the learned action values for state 24.
agent.Q[24]

array([-7.18147152, -7.17570464, -7.20344373, -7.19425   ])

In [17]:
# Experiment note: decay step `epsilon -= 0.000001`  ->  -15
# (presumably the step used in sample() and the resulting greedy test reward; TODO confirm)
# Show the learned action values for state 24.
agent.Q[24]

# This raises a question: would more training episodes let it converge (in theory)?

array([ -7.71232075,  -8.52679442, -12.23321864,  -8.26062919])

In [20]:
# Experiment note: decay step `epsilon -= 0.0000001`  ->  -17
# (presumably the step used in sample() and the resulting greedy test reward; TODO confirm)
# Show the learned action values for state 24.
agent.Q[24]

array([ -8.36048651, -11.47089318, -13.35743078,  -8.98157528])

In [23]:
# Environment check: report whether PyTorch can see a CUDA device.
# NOTE(review): no other code visible in this notebook uses torch.
import torch
torch.cuda.is_available()

False

In [2]:
import time
import numpy as np
import gymnasium as gym
class SarsaAgent(object):
    """Tabular epsilon-greedy agent that learns with the Q-learning update.

    Despite the class name, ``learn`` bootstraps from the maximum action value
    of the next state rather than from the action actually taken next, so the
    update is off-policy Q-learning, not SARSA. The name is kept so existing
    callers continue to work. NOTE(review): in a notebook kernel this
    definition shadows any class of the same name defined by an earlier cell.

    Parameters
    ----------
    obs_n : int
        Number of discrete states (rows of the Q-table).
    act_n : int
        Number of discrete actions (columns of the Q-table).
    learning_rate : float
        Step size applied to every temporal-difference update.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    e_greedy : float
        Initial exploration probability (epsilon).
    epsilon_decay : float
        Amount subtracted from epsilon on every ``sample`` call while epsilon
        is still positive. The default keeps the previously hard-coded step.
    """

    def __init__(self, obs_n,act_n,learning_rate=0.01,gamma=0.9,e_greedy=0.005,epsilon_decay=0.0000001):
        self.act_n=act_n
        self.lr=learning_rate
        self.gamma=gamma
        self.epsilon=e_greedy
        self.epsilon_decay=epsilon_decay
        # Q-table: one row per state, one column per action, all zeros at start.
        self.Q=np.zeros((obs_n,act_n))

    def sample(self,obs):
        """Return an epsilon-greedy action for ``obs`` and decay epsilon once."""
        if self.epsilon > 0:
            # Clamp at zero so a large decay step cannot push epsilon negative.
            self.epsilon=max(0.0,self.epsilon-self.epsilon_decay)
        if np.random.uniform(0,1)<(1.0-self.epsilon):
            action=self.predict(obs)
        else:
            action=np.random.choice(self.act_n)
        return action

    def predict(self,obs):
        """Return a greedy action for ``obs``; ties are broken uniformly at random."""
        Q_list=self.Q[obs,:]
        maxQ=np.max(Q_list)

        # Collect every action that attains the maximum so ties are not biased
        # towards the lowest action index.
        action_list=np.where(Q_list==maxQ)[0]
        action=np.random.choice(action_list)
        return action

    def learn(self,obs,action,reward,next_obs,done):
        """Apply one Q-learning update to ``Q[obs, action]``.

        The target is ``reward`` alone when ``done`` is true; otherwise it is
        ``reward + gamma * max_a Q[next_obs, a]``.
        """
        predict_Q=self.Q[obs,action]
        if done:
            target_Q=reward
        else:
            # Off-policy target: best available value in the next state.
            target_Q=reward+self.gamma*np.max(self.Q[next_obs,:])
        self.Q[obs,action]=predict_Q+self.lr*(target_Q-predict_Q)
       
       
def run_episode(env,agent,render=False):
    """Run one training episode and return ``(total_reward, total_steps)``.

    Parameters
    ----------
    env : environment whose ``reset()`` returns a tuple with the observation
        first and whose ``step(action)`` returns a 5-tuple
        ``(obs, reward, terminated, truncated, info)``.
    agent : object providing ``sample(obs)`` and
        ``learn(obs, action, reward, next_obs, done)``.
    render : bool
        When true, ``env.render()`` is called after every step.
    """
    total_steps=0
    total_reward=0

    obs=env.reset()[0]
    action=agent.sample(obs)

    while True:
        next_obs,reward,done,truncated,info=env.step(action)

        # The update does not need the next action, so learn first and only
        # then sample the action to execute on the following step.
        # Only true termination is passed as `done`; truncation still bootstraps.
        agent.learn(obs,action,reward,next_obs,done)
        next_action=agent.sample(next_obs)
        action=next_action
        obs=next_obs

        total_reward+=reward
        total_steps+=1

        if render:
            env.render()

        # Stop on truncation as well; otherwise a time-limited environment
        # would keep being stepped after its episode has ended.
        if done or truncated:
            break

    return total_reward, total_steps
   
def test_episode(env,agent):
    total_reward=0
    obs=env.reset()[0]
    #print(f"obs type: {type(obs)}, value: {obs}")
    while True:
        action=agent.predict(obs)
        next_obs,reward,done,tru,info=env.step(action)
        total_reward+=reward
        obs=next_obs
        time.sleep(0.1)
        env.render()
        if done:
            break
    return total_reward
def main(train_episodes=10000, test_episodes=10):
    """Train the agent on CliffWalking-v0, then run greedy test episodes.

    Parameters
    ----------
    train_episodes : int
        Number of training episodes; the final one is run with rendering on.
    test_episodes : int
        Number of greedy evaluation episodes run after training.
    """
    env=gym.make("CliffWalking-v0")
    agent=SarsaAgent(
            obs_n=env.observation_space.n,
            act_n=env.action_space.n,
            learning_rate=0.1,
            gamma=0.9,
            e_greedy=0.1)
    is_render=False

    for episode in range(train_episodes):
        # range() ends at train_episodes - 1, so the last episode must be
        # matched by that index; comparing against the episode count itself
        # can never be true.
        # NOTE(review): env.render() presumably only draws if the environment
        # was created with a render mode - confirm against the gymnasium docs.
        if episode == train_episodes - 1:
            is_render = True
        ep_reward,ep_steps=run_episode(env,agent,is_render)
        print('Episode %s: steps=%s,reward=%.lf'%(episode,ep_steps,ep_reward))

    for episode in range(test_episodes):
        total_reward=test_episode(env,agent)
        print('teset %d reward = %.lf'%(episode,total_reward))


# Run the full train-then-test routine only when executed as the main module.
if __name__=='__main__':
    main()


Episode 0: steps=908,reward=-2294
Episode 1: steps=124,reward=-124
Episode 2: steps=374,reward=-869
Episode 3: steps=335,reward=-830
Episode 4: steps=253,reward=-352
Episode 5: steps=61,reward=-61
Episode 6: steps=294,reward=-492
Episode 7: steps=59,reward=-59
Episode 8: steps=225,reward=-225
Episode 9: steps=71,reward=-71
Episode 10: steps=196,reward=-295
Episode 11: steps=142,reward=-142
Episode 12: steps=112,reward=-310
Episode 13: steps=99,reward=-99
Episode 14: steps=49,reward=-49
Episode 15: steps=91,reward=-91
Episode 16: steps=164,reward=-263
Episode 17: steps=73,reward=-73
Episode 18: steps=139,reward=-238
Episode 19: steps=149,reward=-248
Episode 20: steps=76,reward=-175
Episode 21: steps=87,reward=-87
Episode 22: steps=91,reward=-91
Episode 23: steps=123,reward=-123
Episode 24: steps=56,reward=-56
Episode 25: steps=290,reward=-686
Episode 26: steps=114,reward=-213
Episode 27: steps=59,reward=-158
Episode 28: steps=63,reward=-162
Episode 29: steps=121,reward=-121
Episode 30: 