In [2]:
pip install gym

Collecting gym
  Downloading https://files.pythonhosted.org/packages/77/48/c43b8a72b916cc70896aa431b0fc00d1481ae34e28dc55e2144f4c77916b/gym-0.17.1.tar.gz (1.6MB)
Collecting pyglet<=1.5.0,>=1.4.0 (from gym)
  Downloading https://files.pythonhosted.org/packages/70/ca/20aee170afe6011e295e34b27ad7d7ccd795faba581dd3c6f7cec237f561/pyglet-1.5.0-py2.py3-none-any.whl (1.0MB)
Building wheels for collected packages: gym
  Building wheel for gym (setup.py): started
  Building wheel for gym (setup.py): finished with status 'done'
  Stored in directory: C:\Users\TAN\AppData\Local\pip\Cache\wheels\c0\84\61\523b92d88787ae29689b3cc08cf445d8d8186d7fbe1acbf87b
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.17.1 pyglet-1.5.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import gym

env = gym.make("Taxi-v3").env

env.render()

+---------+
|R: | : :[34;1mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [3]:
env.reset()

484

In [4]:
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B:[43m [0m|
+---------+



In [5]:
state = env.encode(3,1,2,0)
print("State:", state)

env.s = state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [6]:
env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

# SARSA ALGORITHM FOR TAXI PROBLEM

In [25]:
import numpy as np
def eps_greedy(Q, s, eps=0.1):
    '''
    Epsilon greedy policy
    '''
    if np.random.uniform(0,1) < eps:
        # Choose a random action
        return np.random.randint(Q.shape[1])
    else:
        # Choose the action of a greedy policy
        return greedy(Q, s)


def greedy(Q, s):
    '''
    Greedy policy
    return the index corresponding to the maximum action-state value
    '''
    return np.argmax(Q[s])


def run_episodes(env, Q, num_episodes=100, to_print=False):
    '''
    Run some episodes to test the policy
    '''
    tot_rew = []
    state = env.reset()

    for _ in range(num_episodes):
        done = False
        game_rew = 0

        while not done:
            # select a greedy action
            next_state, rew, done, _ = env.step(greedy(Q, state))

            state = next_state
            game_rew += rew 
            if done:
                state = env.reset()
                tot_rew.append(game_rew)

    if to_print:
        print('Mean score: %.3f of %i games!'%(np.mean(tot_rew), num_episodes))

    return np.mean(tot_rew)


def SARSA(env, lr=0.01, num_episodes=100, eps=0.3, gamma=0.95, eps_decay=0.00005):
    nA = env.action_space.n
    nS = env.observation_space.n

    # Initialize the Q matrix
    # Q: matrix nS*nA where each row represent a state and each colums represent a different action
    Q = np.zeros((nS, nA))
    games_reward = []
    test_rewards = []

    for ep in range(num_episodes):
        state = env.reset()
        done = False
        tot_rew = 0

        # decay the epsilon value until it reaches the threshold of 0.01
        if eps > 0.01:
            eps -= eps_decay


        action = eps_greedy(Q, state, eps) 

        # loop the main body until the environment stops
        while not done:
            next_state, rew, done, _ = env.step(action) # Take one step in the environment

            # choose the next action (needed for the SARSA update)
            next_action = eps_greedy(Q, next_state, eps) 
            # SARSA update
            Q[state][action] = Q[state][action] + lr*(rew + gamma*Q[next_state][next_action] - Q[state][action])

            state = next_state
            action = next_action
            tot_rew += rew
            if done:
                games_reward.append(tot_rew)

        # Test the policy every 300 episodes and print the results
        if (ep % 300) == 0:
            test_rew = run_episodes(env, Q, 1000)
            #print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))
            print("Episode", ep, "Eps", eps, "Reward", test_rew, "State", state, "Action", action)
            test_rewards.append(test_rew)

    return Q,state,action


if __name__ == '__main__':
    env = gym.make('Taxi-v3')
    #Q_sarsa = SARSA(env, lr=.1, num_episodes=3000, eps=0.4, gamma=0.95, eps_decay=0.001)
    Q_sarsa = SARSA(env, lr=.1, num_episodes=5000, eps=0.9, gamma=0.9, eps_decay=0.001)

Episode 0 Eps 0.899 Reward -207.2 State 476 Action 1
Episode 300 Eps 0.5989999999999998 Reward -200.0 State 410 Action 0
Episode 600 Eps 0.2989999999999995 Reward -176.403 State 410 Action 0
Episode 900 Eps 0.009999999999999232 Reward -126.739 State 410 Action 0
Episode 1200 Eps 0.009999999999999232 Reward -73.274 State 410 Action 0
Episode 1500 Eps 0.009999999999999232 Reward -38.699 State 475 Action 0
Episode 1800 Eps 0.009999999999999232 Reward -25.358 State 0 Action 0
Episode 2100 Eps 0.009999999999999232 Reward -16.367 State 0 Action 0
Episode 2400 Eps 0.009999999999999232 Reward -9.648 State 475 Action 0
Episode 2700 Eps 0.009999999999999232 Reward -3.875 State 414 Action 3
Episode 3000 Eps 0.009999999999999232 Reward -2.138 State 85 Action 0
Episode 3300 Eps 0.009999999999999232 Reward 1.223 State 85 Action 0
Episode 3600 Eps 0.009999999999999232 Reward 2.915 State 0 Action 0
Episode 3900 Eps 0.009999999999999232 Reward 3.92 State 0 Action 0
Episode 4200 Eps 0.009999999999999232