<a href="https://colab.research.google.com/github/ACPSYan/Georgia/blob/main/Taxi_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# value iteration


import numpy as np
import gym
from gym import wrappers
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

np.random.seed(42)

def run_episode(env, policy, gamma=1.0, render=False):
    """ Plays one episode under the given policy and reports its
    discounted return.
    args:
    env: gym environment; reset() is expected to return the first
         observation and step() a 4-tuple (obs, reward, done, info).
    policy: indexable by observation; the entry is cast to int and used
            as the action.
    gamma: discount factor, applied as gamma ** t to the reward of step t.
    render: boolean, when true env.render() is called before every step.
    returns:
    total reward: sum over the episode of gamma ** t * reward_t.
    """
    state = env.reset()
    discounted_return = 0
    t = 0
    finished = False
    while not finished:
        if render:
            env.render()
        action = int(policy[state])
        state, step_reward, finished, _ = env.step(action)
        discounted_return += (gamma ** t) * step_reward
        t += 1
    return discounted_return


def evaluate_policy(env, policy, gamma=1.0, n=2000):
    """ Runs the policy for n episodes.
    returns:
    list holding the total (discounted) reward of every episode, in the
    order the episodes were played; averaging is left to the caller.
    """
    scores = []
    for _ in range(n):
        episode_score = run_episode(env, policy, gamma=gamma, render=False)
        scores.append(episode_score)
    return scores

def extract_policy(v, gamma=1.0):
    """ Builds the greedy policy for value-function v.

    Reads the module-level ``env`` (it is not a parameter): env.nS,
    env.action_space.n and the transition table env.P[s][a].
    returns:
    float array of length env.nS holding the arg-max action per state.
    """
    n_states = env.nS
    n_actions = env.action_space.n
    greedy = np.zeros(n_states)
    for state in range(n_states):
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            # each transition is a tuple of (probability, next state, reward, done)
            action_values[action] = sum(
                prob * (reward + gamma * v[nxt])
                for prob, nxt, reward, _ in env.P[state][action]
            )
        # ties resolve to the lowest action index (np.argmax)
        greedy[state] = np.argmax(action_values)
    return greedy

def value_iteration(env, gamma=1.0):
    """ Value-iteration algorithm.
    args:
    env: environment exposing nS (number of states), nA (number of
         actions) and P[s][a], an iterable of
         (probability, next state, reward, done) tuples.
    gamma: discount factor applied to the value of the next state.
    returns:
    array of length env.nS with the state values after the last sweep.

    Sweeps are synchronous (every backup reads the previous sweep's
    values) and stop once the summed absolute change is <= 1e-20 or after
    10000 sweeps, whichever comes first.
    """
    v = np.zeros(env.nS)  # initialize value-function
    max_iterations = 10000
    eps = 1e-20
    for i in range(max_iterations):
        prev_v = np.copy(v)
        for s in range(env.nS):
            # Bellman optimality backup; the next-state value must be
            # discounted by gamma, otherwise the argument has no effect.
            # NOTE(review): the `done` flag is ignored, so states reached
            # by a terminal transition are still bootstrapped from.
            q_sa = [
                sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][a])
                for a in range(env.nA)
            ]
            v[s] = max(q_sa)
        if np.sum(np.fabs(prev_v - v)) <= eps:
            print('Value-iteration converged at iteration# %d.' % (i + 1))
            break
    return v

# Gamma sweep: for every discount factor, solve Taxi-v3 with value
# iteration, record how long the solve took, and score the greedy policy.
gamma = [0.001, 0.01, 0.1, 0.2, 0.4, 0.8, 1.0]
policy_average_score = []
time_iteration = []
env_name = 'Taxi-v3'
for i in gamma:
    print("gamma = " + str(i))
    env = gym.make(env_name)
    env.reset()

    # only the value-iteration call is timed
    start = datetime.now()
    optimal_v = value_iteration(env, i)
    end = datetime.now()
    time_iteration.append((end - start).total_seconds())

    policy = extract_policy(optimal_v, i)
    # the same gamma is handed to the evaluation as its discount factor
    policy_scores = evaluate_policy(env, policy, i, n=3000)
    mean_score = np.mean(policy_scores)
    policy_average_score.append(mean_score)

    print('Policy average score = ', mean_score)

# one figure per metric, both plotted against gamma
for ylabel, series, color in (
    ('Time', time_iteration, 'r'),
    ('Policy_Score', policy_average_score, 'b'),
):
    fig = plt.figure()
    ax = fig.add_subplot(111, xlabel='gamma', ylabel=ylabel)
    ax.plot(gamma, series, 'o-', color=color)
    plt.show()


In [None]:
# policy iteration

import numpy as np
import gym
from gym import wrappers
from gym.envs.toy_text.frozen_lake import generate_random_map
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
np.random.seed(42)


def run_episode(env, policy, gamma=1.0, render=False):
    """ Plays a single episode with the given policy and returns the
    discounted sum of rewards (reward of step t weighted by gamma ** t).

    env.reset() is expected to return the first observation and
    env.step() a 4-tuple (obs, reward, done, info); env.render() is
    called before each step when render is true.
    """
    observation = env.reset()
    episode_return = 0
    steps_taken = 0
    episode_over = False
    while not episode_over:
        if render:
            env.render()
        chosen_action = int(policy[observation])
        observation, reward, episode_over, _ = env.step(chosen_action)
        episode_return += gamma ** steps_taken * reward
        steps_taken += 1
    return episode_return


def evaluate_policy(env, policy, gamma=1.0, n=100):
    """ Runs the policy for n episodes (rendering off) and returns the
    list of per-episode total rewards.
    """
    scores = []
    for _ in range(n):
        scores.append(run_episode(env, policy, gamma, False))
    return scores

def extract_policy(v, gamma=1.0):
    """ Derives the greedy policy from value-function v.

    Uses the module-level ``env`` (not passed in): env.nS, env.nA and the
    transition table env.P[s][a] of (probability, next state, reward,
    done) tuples. Returns a float array with one action index per state.
    """
    n_states, n_actions = env.nS, env.nA
    policy = np.zeros(n_states)
    for state in range(n_states):
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            expected = 0
            for prob, next_state, reward, _ in env.P[state][action]:
                expected += prob * (reward + gamma * v[next_state])
            action_values[action] = expected
        # np.argmax keeps the first maximum, so ties go to the lowest index
        policy[state] = np.argmax(action_values)
    return policy

def compute_policy_v(env, policy, gamma=1.0, max_iterations=10000):
    """ Iteratively evaluate the value-function under policy.
    Alternatively, we could formulate a set of linear equations in terms of v[s]
    and solve them to find the value function.

    args:
    env: environment exposing nS and P[s][a], an iterable of
         (probability, next state, reward, done) tuples.
    policy: indexable by state; each entry is cast to int and used as the
            action (so a float-valued policy array is accepted).
    gamma: discount factor applied to the value of the next state.
    max_iterations: upper bound on the number of sweeps. Without it the
            loop never ends when the values do not settle, e.g. gamma=1.0
            with a policy that cycles through non-zero rewards.
    returns:
    array of length env.nS with the state values after the last sweep;
    if the cap is reached the values are not converged.
    """
    v = np.zeros(env.nS)
    eps = 1e-10
    for _ in range(max_iterations):
        prev_v = np.copy(v)
        for s in range(env.nS):
            policy_a = int(policy[s])
            # synchronous backup: read only the previous sweep's values
            v[s] = sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][policy_a])
        if np.sum(np.fabs(prev_v - v)) <= eps:
            # value converged
            break
    return v

def policy_iteration(env, gamma=1.0):
    """ Policy-Iteration algorithm.
    args:
    env: environment exposing nS and nA; it is forwarded to
         compute_policy_v for the evaluation step.
    gamma: discount factor, forwarded unchanged to both the evaluation
           and the improvement step.
    returns:
    the policy in effect when two consecutive policies agree, or the last
    improved policy after 100 rounds.
    """
    policy = np.random.choice(env.nA, size=(env.nS))  # initialize a random policy
    max_iterations = 100
    # gamma is deliberately not reassigned here: the caller's discount
    # factor must reach both steps below.
    for i in range(max_iterations):
        old_policy_v = compute_policy_v(env, policy, gamma)
        new_policy = extract_policy(old_policy_v, gamma)
        if np.all(policy == new_policy):
            print('Policy-Iteration converged at step %d.' % (i + 1))
            break
        policy = new_policy
    return policy

# Policy-iteration experiment on Taxi-v3 with gamma = 1.0.
env_name = 'Taxi-v3'
gamma = 1.0
# assigned here but not read anywhere in this cell
policy_average_score = 0
time_iteration = 0
print("gamma = " + str(gamma))
env = gym.make(env_name)

# only the policy-iteration call is timed
start = datetime.now()
optimal_policy = policy_iteration(env, gamma=gamma)
end = datetime.now()
elapsed = end - start
print(elapsed.total_seconds())

scores = evaluate_policy(env, optimal_policy, gamma=gamma)
print('Policy average score = ', np.mean(scores))





In [3]:
# q-learning

import numpy as np
import gym
import random
from datetime import datetime

def _train_q_table(env, num_episodes, max_steps, learning_rate,
                   discount_rate, decay_rate):
    """Learn a tabular Q-function with epsilon-greedy Q-learning and return it."""
    state_size = env.observation_space.n
    action_size = env.action_space.n
    qtable = np.zeros((state_size, action_size))

    epsilon = 1
    for episode in range(num_episodes):
        state = env.reset()

        for _ in range(max_steps):
            # exploration-exploitation tradeoff
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(qtable[state, :])

            new_state, reward, done, info = env.step(action)

            # Q-learning update: move Q(s, a) towards the one-step target
            td_target = reward + discount_rate * np.max(qtable[new_state, :])
            qtable[state, action] = qtable[state, action] + learning_rate * (
                td_target - qtable[state, action]
            )

            state = new_state
            if done:
                break

        # exponential decay, recomputed from the episode index just finished
        epsilon = np.exp(-decay_rate * episode)

    return qtable


def _watch_agent(env, qtable, max_steps):
    """Play one greedy episode from qtable, rendering and printing the running score."""
    state = env.reset()
    rewards = 0

    for step in range(max_steps):
        print("TRAINED AGENT")
        print("Step {}".format(step + 1))

        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        rewards += reward
        env.render()
        print(f"score: {rewards}")
        state = new_state

        if done:
            break

    return rewards


def main(*, num_episodes=1000, max_steps=99, learning_rate=0.9,
         discount_rate=0.8, decay_rate=0.005):
    """Train a Q-learning agent on Taxi-v3, then replay one greedy episode.

    args (all keyword-only; defaults reproduce the original constants):
    num_episodes: number of training episodes.
    max_steps: step limit per episode, for training and for the replay.
    learning_rate: step size of the Q-table update.
    discount_rate: discount factor of the one-step target.
    decay_rate: rate of the exponential epsilon decay.

    Prints the training time in seconds, waits for Enter, then renders
    the replay. The environment is closed even if a step raises.
    """
    # create Taxi environment
    env = gym.make('Taxi-v3')
    try:
        start = datetime.now()
        qtable = _train_q_table(
            env, num_episodes, max_steps, learning_rate, discount_rate, decay_rate
        )
        print(f"Training completed over {num_episodes} episodes")
        end = datetime.now()
        print((end - start).total_seconds())
        input("Press Enter to watch trained agent...")

        # watch trained agent
        _watch_agent(env, qtable, max_steps)
    finally:
        env.close()


# Run the Q-learning demo only when this code is executed as the main
# program, not when it is imported.
if __name__ == "__main__":
    main()



  and should_run_async(code)
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Training completed over 1000 episodes
1.734853
Press Enter to watch trained agent...
TRAINED AGENT
Step 1


If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(


score: -1
TRAINED AGENT
Step 2
score: -2
TRAINED AGENT
Step 3
score: -3
TRAINED AGENT
Step 4
score: -4
TRAINED AGENT
Step 5
score: -5
TRAINED AGENT
Step 6
score: -6
TRAINED AGENT
Step 7
score: -7
TRAINED AGENT
Step 8
score: -8
TRAINED AGENT
Step 9
score: -9
TRAINED AGENT
Step 10
score: -10
TRAINED AGENT
Step 11
score: -11
TRAINED AGENT
Step 12
score: 9
