### Chapter 2
#### E1

In [1]:
def Q_learning(gamma, alpha, epsilon, showReward=True, showQ=True, num_episodes=1000):
    """Train a tabular Q-learning agent on the Gym ``Taxi-v3`` environment.

    Actions are chosen epsilon-greedily and the table is updated with the
    off-policy target ``reward + gamma * max_a Q[next_state, a]``.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        alpha: Learning rate (step size) of the Q update.
        epsilon: Probability of sampling a random action instead of the greedy one.
        showReward: If true, print the episode's total reward every 100 episodes.
        showQ: If true, print the final Q table once training has finished.
        num_episodes: Number of training episodes to run (defaults to 1000).

    Returns:
        None. Results are only reported through ``print``.

    Note:
        Written against the classic Gym API, where ``env.reset()`` returns the
        state and ``env.step()`` returns ``(state, reward, done, info)``.
    """
    # Imports are kept local so this cell runs on its own in a fresh kernel.
    import gym
    import numpy as np
    import random

    env = gym.make("Taxi-v3")
    try:
        # Q[state, action] table, initialised to zero.
        Q = np.zeros([env.observation_space.n, env.action_space.n])

        for episode in range(num_episodes):
            done = False
            total_reward = 0
            state = env.reset()
            while not done:
                if random.uniform(0, 1) < epsilon:
                    action = env.action_space.sample()  # Explore state space
                else:
                    action = np.argmax(Q[state])  # Exploit learned values
                next_state, reward, done, info = env.step(action)

                # Off-policy TD target: bootstrap from the best next action.
                next_max = np.max(Q[next_state])
                old_value = Q[state, action]
                Q[state, action] = old_value + alpha * (reward + gamma * next_max - old_value)

                total_reward += reward
                state = next_state

            if showReward and episode % 100 == 0:
                print(f"Episode {episode} Total Reward: {total_reward}")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

    if showQ:
        print(Q)

    return

- print a live policy

In [2]:
# Baseline hyperparameters for a single Q-learning run.
gamma = 0.8    # discount factor
alpha = 0.2    # learning rate
epsilon = 0.1  # exploration probability of the epsilon-greedy policy

Q_learning(gamma=gamma, alpha=alpha, epsilon=epsilon)

Episode 0 Total Reward: -569
Episode 100 Total Reward: -245
Episode 200 Total Reward: -55
Episode 300 Total Reward: -59
Episode 400 Total Reward: 11
Episode 500 Total Reward: -15
Episode 600 Total Reward: 14
Episode 700 Total Reward: 10
Episode 800 Total Reward: -1
Episode 900 Total Reward: -13
[[ 0.          0.          0.          0.          0.          0.        ]
 [-3.19670474 -3.11377344 -3.32553749 -3.19420794 -1.64506284 -5.53472077]
 [-1.61313746 -2.13501361 -2.05284028 -1.42595097  3.1919773  -5.23663549]
 ...
 [-1.33321462 -1.36678377 -1.48131366 -1.50702631 -3.632      -3.66272   ]
 [-2.50206596 -2.4659207  -2.50206596 -2.12406715 -3.90706105 -3.77379377]
 [-0.392      -0.392      -0.392      10.17655029 -2.         -2.        ]]


In [5]:
# Compare Q-learning across exploration rates with the other hyperparameters fixed.
gamma = 0.8                    # discount factor
alpha = 0.2                    # learning rate
epsilons = [0.1, 0.01, 0.001]  # epsilon-greedy exploration rates to try

for epsilon in epsilons:
    print(f"epsilon:{epsilon}")
    # Report rewards only; the Q table is not printed in the sweep.
    Q_learning(gamma, alpha, epsilon, showReward=True, showQ=False)


epsilon:0.1
Episode 0 Total Reward: -578
Episode 100 Total Reward: 10
Episode 200 Total Reward: -51
Episode 300 Total Reward: -78
Episode 400 Total Reward: 10
Episode 500 Total Reward: -15
Episode 600 Total Reward: -11
Episode 700 Total Reward: -1
Episode 800 Total Reward: 10
Episode 900 Total Reward: -2
epsilon:0.01
Episode 0 Total Reward: -533
Episode 100 Total Reward: -209
Episode 200 Total Reward: -52
Episode 300 Total Reward: -71
Episode 400 Total Reward: -94
Episode 500 Total Reward: 0
Episode 600 Total Reward: 10
Episode 700 Total Reward: 9
Episode 800 Total Reward: -18
Episode 900 Total Reward: 3
epsilon:0.001
Episode 0 Total Reward: -560
Episode 100 Total Reward: -209
Episode 200 Total Reward: -66
Episode 300 Total Reward: -68
Episode 400 Total Reward: -56
Episode 500 Total Reward: -4
Episode 600 Total Reward: -16
Episode 700 Total Reward: 8
Episode 800 Total Reward: 6
Episode 900 Total Reward: 9


In [7]:
# Compare Q-learning across learning rates with the other hyperparameters fixed.
gamma = 0.8  # discount factor
alphas = [0.5, 0.2, 0.1, 0.01]  # learning rates to try
# Define `epsilon` in this cell so the sweep does not depend on a value left
# in the kernel by an earlier cell's loop.
epsilon = 0.1  # epsilon greedy

for alpha in alphas:
    print(f"alpha:{alpha}")
    Q_learning(gamma, alpha, epsilon, True, False)

alpha:0.5
Episode 0 Total Reward: -596
Episode 100 Total Reward: -142
Episode 200 Total Reward: 10
Episode 300 Total Reward: 11
Episode 400 Total Reward: 3
Episode 500 Total Reward: 12
Episode 600 Total Reward: 4
Episode 700 Total Reward: 5
Episode 800 Total Reward: 10
Episode 900 Total Reward: 10
alpha:0.2
Episode 0 Total Reward: -542
Episode 100 Total Reward: -452
Episode 200 Total Reward: -112
Episode 300 Total Reward: -200
Episode 400 Total Reward: 12
Episode 500 Total Reward: -99
Episode 600 Total Reward: -6
Episode 700 Total Reward: 11
Episode 800 Total Reward: 10
Episode 900 Total Reward: 10
alpha:0.1
Episode 0 Total Reward: -497
Episode 100 Total Reward: -151
Episode 200 Total Reward: -290
Episode 300 Total Reward: -169
Episode 400 Total Reward: -67
Episode 500 Total Reward: -151
Episode 600 Total Reward: -45
Episode 700 Total Reward: -84
Episode 800 Total Reward: -20
Episode 900 Total Reward: -116
alpha:0.01
Episode 0 Total Reward: -524
Episode 100 Total Reward: -245
Episode 2

### E2

In [13]:


def SARSA(gamma, alpha, epsilon, showReward=True, num_episodes=1000):
    """Train a tabular SARSA agent on the Gym ``Taxi-v3`` environment.

    SARSA is on-policy: the update bootstraps from the action that the
    epsilon-greedy policy actually selects in the next state, and that same
    action is the one executed on the following step.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        alpha: Learning rate (step size) of the Q update.
        epsilon: Probability of sampling a random action instead of the greedy one.
        showReward: If true, print the episode's total reward every 100 episodes.
        num_episodes: Number of training episodes to run (defaults to 1000).

    Returns:
        None. Results are only reported through ``print``.

    Note:
        Written against the classic Gym API, where ``env.reset()`` returns the
        state and ``env.step()`` returns ``(state, reward, done, info)``.
    """
    # Imports are kept local so this cell runs on its own in a fresh kernel.
    import gym
    import numpy as np
    import random

    env = gym.make("Taxi-v3")
    try:
        # Q[state, action] table, initialised to zero.
        Q = np.zeros([env.observation_space.n, env.action_space.n])

        def choose_action(state):
            """Pick an action for ``state`` with the epsilon-greedy rule."""
            if random.uniform(0, 1) < epsilon:
                return env.action_space.sample()  # Explore state space
            return np.argmax(Q[state])  # Exploit learned values

        for episode in range(num_episodes):
            done = False
            total_reward = 0
            current_state = env.reset()
            current_action = choose_action(current_state)
            while not done:
                next_state, reward, done, info = env.step(current_action)
                # The action used in the target is the one taken next (on-policy).
                next_action = choose_action(next_state)

                sarsa_value = Q[next_state, next_action]
                old_value = Q[current_state, current_action]
                Q[current_state, current_action] = old_value + alpha * (
                    reward + gamma * sarsa_value - old_value
                )

                total_reward += reward
                current_state = next_state
                current_action = next_action

            if showReward and episode % 100 == 0:
                print("Episode {} Total Reward: {}".format(episode, total_reward))
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

In [15]:
# Baseline hyperparameters for a single SARSA run.
gamma = 0.7    # discount factor
alpha = 0.2    # learning rate
epsilon = 0.1  # exploration probability of the epsilon-greedy policy

SARSA(gamma=gamma, alpha=alpha, epsilon=epsilon)

Episode 0 Total Reward: -695
Episode 100 Total Reward: -233
Episode 200 Total Reward: -168
Episode 300 Total Reward: -136
Episode 400 Total Reward: -91
Episode 500 Total Reward: 12
Episode 600 Total Reward: -67
Episode 700 Total Reward: -4
Episode 800 Total Reward: -236
Episode 900 Total Reward: -236


In [16]:
# Compare SARSA across exploration rates with the other hyperparameters fixed.
gamma = 0.8                    # discount factor
alpha = 0.2                    # learning rate
epsilons = [0.1, 0.01, 0.001]  # epsilon-greedy exploration rates to try

for epsilon in epsilons:
    print(f"epsilon:{epsilon}")
    SARSA(gamma=gamma, alpha=alpha, epsilon=epsilon)

epsilon:0.1
Episode 0 Total Reward: -695
Episode 100 Total Reward: -263
Episode 200 Total Reward: -134
Episode 300 Total Reward: -181
Episode 400 Total Reward: -200
Episode 500 Total Reward: -2
Episode 600 Total Reward: -134
Episode 700 Total Reward: -28
Episode 800 Total Reward: 11
Episode 900 Total Reward: -54
epsilon:0.01
Episode 0 Total Reward: -758
Episode 100 Total Reward: -209
Episode 200 Total Reward: -14
Episode 300 Total Reward: -39
Episode 400 Total Reward: -2
Episode 500 Total Reward: -13
Episode 600 Total Reward: 6
Episode 700 Total Reward: -44
Episode 800 Total Reward: 5
Episode 900 Total Reward: 0
epsilon:0.001
Episode 0 Total Reward: -740
Episode 100 Total Reward: -200
Episode 200 Total Reward: -103
Episode 300 Total Reward: -27
Episode 400 Total Reward: 1
Episode 500 Total Reward: 9
Episode 600 Total Reward: 4
Episode 700 Total Reward: 11
Episode 800 Total Reward: 13
Episode 900 Total Reward: 0


In [17]:
# Compare SARSA across learning rates with the other hyperparameters fixed.
gamma = 0.8  # discount factor
alphas = [0.5, 0.2, 0.1, 0.01]  # learning rates to try
# Define `epsilon` in this cell so the sweep does not depend on a value left
# in the kernel by an earlier cell's loop.
epsilon = 0.1  # epsilon greedy

for alpha in alphas:
    print(f"alpha:{alpha}")
    SARSA(gamma, alpha, epsilon)

alpha:0.5
Episode 0 Total Reward: -740
Episode 100 Total Reward: -200
Episode 200 Total Reward: -99
Episode 300 Total Reward: -67
Episode 400 Total Reward: 2
Episode 500 Total Reward: 7
Episode 600 Total Reward: 9
Episode 700 Total Reward: 6
Episode 800 Total Reward: 7
Episode 900 Total Reward: 8
alpha:0.2
Episode 0 Total Reward: -776
Episode 100 Total Reward: -200
Episode 200 Total Reward: -84
Episode 300 Total Reward: -344
Episode 400 Total Reward: -91
Episode 500 Total Reward: -11
Episode 600 Total Reward: 5
Episode 700 Total Reward: 8
Episode 800 Total Reward: 8
Episode 900 Total Reward: 5
alpha:0.1
Episode 0 Total Reward: -740
Episode 100 Total Reward: -200
Episode 200 Total Reward: -380
Episode 300 Total Reward: -27
Episode 400 Total Reward: -67
Episode 500 Total Reward: -146
Episode 600 Total Reward: -22
Episode 700 Total Reward: 11
Episode 800 Total Reward: 12
Episode 900 Total Reward: -16
alpha:0.01
Episode 0 Total Reward: -731
Episode 100 Total Reward: -299
Episode 200 Total 

### E3

### E4

In [23]:
import gym
# Render CartPole for 1000 steps while taking uniformly random actions.
env = gym.make('CartPole-v1')
try:
    env.reset()
    for _ in range(1000):
        env.render()
        # take a random action
        observation, reward, done, info = env.step(env.action_space.sample())
        if done:
            # Stepping a finished episode is not valid; start a new one instead.
            env.reset()
finally:
    # Always close the environment, even if the rollout is interrupted.
    env.close()

In [1]:
import gym
import numpy as np

def iterate_value_function(v_inp, gamma, env):
    """Apply one value-iteration backup to every state.

    Args:
        v_inp: Current state-value estimates, indexable by state id.
        gamma: Discount factor.
        env: Object exposing ``nS``, ``nA`` and ``P``, where ``P[state][action]``
            yields ``(prob, dst_state, reward, is_final)`` transition tuples.

    Returns:
        A new array of length ``env.nS`` holding, for each state, the largest
        expected one-step return over all actions.
    """
    n_states, n_actions = env.nS, env.nA
    updated = np.zeros(n_states)
    for state in range(n_states):
        action_values = np.zeros(n_actions)
        for act in range(n_actions):
            for prob, nxt, reward, terminal in env.P[state][act]:
                # Successors flagged as final contribute no future value.
                continuation = gamma * v_inp[nxt] * (not terminal)
                action_values[act] += prob * (reward + continuation)
        updated[state] = max(action_values)
    return updated

def build_greedy_policy(v_inp, gamma, env):
    """Derive the greedy policy implied by a state-value function.

    Args:
        v_inp: State-value estimates, indexable by state id.
        gamma: Discount factor.
        env: Object exposing ``nS``, ``nA`` and ``P``, where ``P[state][action]``
            yields ``(prob, dst_state, reward, is_final)`` transition tuples.

    Returns:
        A float array of length ``env.nS`` whose entry for each state is the
        index of the action with the highest expected one-step return.
    """
    new_policy = np.zeros(env.nS)
    for state_id in range(env.nS):
        profits = np.zeros(env.nA)
        for action in range(env.nA):
            for (prob, dst_state, reward, is_final) in env.P[state_id][action]:
                # Do not bootstrap from final successors, matching the masking
                # used by iterate_value_function.
                profits[action] += prob*(reward + gamma*v_inp[dst_state]*(not is_final))
        new_policy[state_id] = np.argmax(profits)
    return new_policy

# Run CartPole with a uniformly random policy and track a sliding window of
# episode rewards.
env = gym.make('CartPole-v1')

goal_average_steps = 195           # window mean that counts as success
max_number_of_steps = 500          # step cap per episode
num_consecutive_iterations = 100   # size of the sliding window
num_episodes = 100                 # number of episodes to run
last_time_steps = np.zeros(num_consecutive_iterations)

for episode in range(num_episodes):
    # Initialise the environment.
    observation = env.reset()
    episode_reward = 0

    for t in range(max_number_of_steps):
        # Draw the CartPole.
        env.render()

        # Pick an action at random.
        action = np.random.choice([0, 1])

        # Execute the action and collect the feedback.
        observation, reward, done, info = env.step(action)
        episode_reward += reward

        if not done:
            continue

        # The mean is printed before this episode is pushed into the window.
        print('%d Episode finished after %d time steps / mean %f'
              % (episode, t + 1, last_time_steps.mean()))
        # Drop the oldest entry and append this episode's reward.
        last_time_steps = np.append(last_time_steps[1:], episode_reward)
        break

    # Success once the mean over the last 100 episodes reaches 195 or more.
    if goal_average_steps <= last_time_steps.mean():
        print('Episode %d train agent successfuly!' % episode)
        break
env.close()

0 Episode finished after 26 time steps / mean 0.000000
1 Episode finished after 14 time steps / mean 0.260000
2 Episode finished after 30 time steps / mean 0.400000
3 Episode finished after 14 time steps / mean 0.700000
4 Episode finished after 21 time steps / mean 0.840000
5 Episode finished after 10 time steps / mean 1.050000
6 Episode finished after 38 time steps / mean 1.150000
7 Episode finished after 14 time steps / mean 1.530000
8 Episode finished after 11 time steps / mean 1.670000
9 Episode finished after 16 time steps / mean 1.780000
10 Episode finished after 20 time steps / mean 1.940000
11 Episode finished after 16 time steps / mean 2.140000
12 Episode finished after 21 time steps / mean 2.300000
13 Episode finished after 20 time steps / mean 2.510000
14 Episode finished after 20 time steps / mean 2.710000
15 Episode finished after 30 time steps / mean 2.910000
16 Episode finished after 21 time steps / mean 3.210000
17 Episode finished after 27 time steps / mean 3.420000
18