# Manage Imports

In [1]:
import numpy as np
import gym
import random

# Initialize Environment

In [2]:
# Create the Gym "Taxi-v2" environment. The resulting `env` object is what the
# later cells reset, step through, sample random actions from, and render.
env = gym.make("Taxi-v2")

def has_won(reward, winning_reward=20):
    """Tell whether a step reward marks a successfully finished episode.

    Args:
        reward: Reward returned by the most recent ``env.step`` call.
        winning_reward: Reward value that counts as a win. Defaults to 20,
            the value this notebook uses for the Taxi environment.

    Returns:
        True if ``reward`` equals ``winning_reward``, otherwise False.
    """
    return reward == winning_reward

# Create Q-Table

Create an $M \times N$ matrix, with 
- $M$ = size of state space / observation space (one row per state)
- $N$ = size of action space (one column per action)

In [3]:
# Read the number of states and the number of actions from the environment.
observation_space_size = env.observation_space.n
action_space_size = env.action_space.n

print("Action Space: {}, Observation Space: {}".format(action_space_size, observation_space_size))

# One row per state, one column per action; every estimate starts at zero.
qtable = np.zeros(shape=(observation_space_size, action_space_size))

Action Space: 6, Observation Space: 500


# Setup Hyper Parameters

In [4]:
# Episode counts for the training cell and for the greedy play cell.
total_training_episodes, total_play_episodes = 10000, 10000

# Weight of each Q-value correction, and the per-episode step cap.
learning_rate = 0.8
max_steps = 9999

# Discounting rate applied to the best Q-value of the next state.
gamma = 0.95

# Exploration rate: starts at the maximum and is never allowed below the minimum.
min_epsilon, max_epsilon = 0.1, 1.0
epsilon = max_epsilon

# Exponential decay rate for the exploration probability
# (epsilon is multiplied by 1 - decay_rate after every training episode).
decay_rate = 0.0001
print(decay_rate)

0.0001


# Output Setup

In [5]:
# Logging detail used by the training and play cells:
#   0 = only the periodic progress rows
#   1 = additionally report winning episodes plus the Q-table and board after each episode
#   2 = additionally print a banner and a progress row for every episode
#   3 = additionally print every single step and the Q-table after each update
verbosity = 0
# A progress row is printed after every this many training episodes.
log_every_n_episode = 1000
# When True, the training loop draws the environment after every step.
render = False

# The Q-Learning Algorithm

In [6]:
# Running totals across all training episodes; they feed the progress table.
total_training_reward = 0
successful_training_episodes = 0
# Start exploration from its maximum each time this cell is run.
epsilon = max_epsilon

print("Episode\t| Steps survived\t| Epsilon \t| Average Reward\t| Successful Episodes")

for episode in range(total_training_episodes):
    if (verbosity > 1):
        print("===================================")
        print("Starting Episode {}".format(episode))

    state = env.reset()
    reward = 0
    done = False

    total_reward = 0
    step = 0

    # An episode ends when the environment reports `done` or the step cap is hit.
    while (not done and step < max_steps):
        # step 1: choose action (explore or exploit)
        explore = random.random() < epsilon

        # step 2: take action
        action = env.action_space.sample() if explore else np.argmax(qtable[state])
        new_state, reward, done, info = env.step(action)

        total_reward += reward

        if (verbosity > 2):
            # The first field is the step counter (previously the new state was printed twice).
            print("Step: {}, Epsilon: {}, Observation: {}, Reward: {}".format(step, epsilon, new_state, reward))

        if (render):
            env.render()

        # step 3: update q-table
        # Q'(s, a) := Q(s, a) + lr * (R(s, a) + gamma * max(Q(s', a')) - Q(s, a))
        td_target = reward + gamma * np.max(qtable[new_state])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        if (verbosity > 2):
            print(qtable)

        state = new_state
        step += 1

    total_training_reward += total_reward

    # `reward` still holds the reward of the episode's last step here.
    if (has_won(reward)):
        successful_training_episodes += 1

    if (verbosity > 1 or (episode + 1) % log_every_n_episode == 0 or (verbosity > 0 and has_won(reward))):
        # Winning rows are wrapped in ANSI bold/green escape codes.
        print("{}{}\t| {}\t\t\t| {:1.4f}\t| {:1.4f}\t\t| {} ({:3.2f}%){}".format("\033[1m\033[92m" if has_won(reward) else "", episode + 1, step - 1, epsilon, total_training_reward / (episode + 1), successful_training_episodes, successful_training_episodes / (episode + 1) * 100, ", SOLVED!!!\033[0m" if has_won(reward) else ""))

    if (verbosity > 0):
        print("Q-Table:")
        print(qtable)
        print("Board on GameOver:")
        env.render()

    # Shrink epsilon by a constant factor per episode, clamped to [min_epsilon, max_epsilon].
    epsilon = np.clip((1 - decay_rate) * epsilon, min_epsilon, max_epsilon)

Episode	| Steps survived	| Epsilon 	| Average Reward	| Successful Episodes
[1m[92m1000	| 106			| 0.9049	| -669.1570		| 249 (24.90%), SOLVED!!![0m
[1m[92m2000	| 81			| 0.8188	| -541.9055		| 1033 (51.65%), SOLVED!!![0m
[1m[92m3000	| 30			| 0.7409	| -437.9863		| 2004 (66.80%), SOLVED!!![0m
[1m[92m4000	| 47			| 0.6704	| -363.0335		| 3004 (75.10%), SOLVED!!![0m
[1m[92m5000	| 33			| 0.6066	| -309.0734		| 4004 (80.08%), SOLVED!!![0m
[1m[92m6000	| 22			| 0.5489	| -268.2493		| 5004 (83.40%), SOLVED!!![0m
[1m[92m7000	| 14			| 0.4966	| -236.9654		| 6004 (85.77%), SOLVED!!![0m
[1m[92m8000	| 24			| 0.4494	| -212.0327		| 7004 (87.55%), SOLVED!!![0m
[1m[92m9000	| 21			| 0.4066	| -191.6902		| 8004 (88.93%), SOLVED!!![0m
[1m[92m10000	| 20			| 0.3679	| -174.8373		| 9004 (90.04%), SOLVED!!![0m


# Final Q-Table

In [7]:
# Fraction (not percentage) of training episodes that were counted as successful.
training_success_rate = successful_training_episodes / total_training_episodes
summary_line = "Successful Episodes: {}, Success Rate: {}".format(successful_training_episodes, training_success_rate)
print(summary_line)

# Show the Q-table as it stands after training.
print(qtable)


Successful Episodes: 9004, Success Rate: 0.9004
[[ 0.          0.          0.          0.          0.          0.        ]
 [ 5.20997639  6.53681725  5.20997639  6.53681725  7.93349184 -2.46318275]
 [ 7.93349184  9.40367562  7.93349184  9.40367562 10.9512375   0.40367562]
 ...
 [10.9512375  12.58025    10.9512375   9.40367562  1.9512375   1.9512375 ]
 [ 5.20997639  6.53681725  5.20997639  6.53681725 -3.79002361 -3.79002361]
 [16.1        14.295      16.1        18.          7.1         7.1       ]]


# Let the Agent Play the Game with our Q-Table now

In [8]:
# Counters for the evaluation run: total wins, current win streak, longest win streak.
successful_episodes = 0
consecutive_successes = 0
consecutive_successes_record = 0

for episode in range(total_play_episodes):
    state = env.reset()
    # Defined up front so the win check below works even if no step is taken.
    reward = 0
    done = False
    step = 0

    # Always exploit: take the best-known action for the current state.
    # The step cap mirrors the training loop, so a policy that never reaches
    # a terminal state cannot hang this cell.
    while (not done and step < max_steps):
        action = np.argmax(qtable[state])
        state, reward, done, info = env.step(action)
        step += 1

    if (verbosity > 0):
        print("\n{}Episode finished after {} steps. {}".format("\033[1m\033[92m" if has_won(reward) else "", step, "SOLVED!!!\033[0m" if has_won(reward) else ""))
        env.render()

    if (has_won(reward)):
        successful_episodes += 1
        consecutive_successes += 1
        consecutive_successes_record = max(consecutive_successes_record, consecutive_successes)
    else:
        # A failed episode breaks the current streak.
        consecutive_successes = 0

print("Successful Episodes: {}, Success Rate: {:3.2f}%, Most Consecutive Successes: {}".format(successful_episodes, successful_episodes / total_play_episodes * 100, consecutive_successes_record))

Successful Episodes: 10000, Success Rate: 100.00%, Most Consecutive Successes: 10000


# Successful Q-Tables: