In [1]:
!pip install cmake gym scipy



DEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.


In [36]:
import pandas as pd
import seaborn as sns
import itertools
import matplotlib
import numpy as np
import sys
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.preprocessing import LabelBinarizer


In [37]:
import sys
from contextlib import closing
from io import StringIO
from gym import utils
from gym.envs.toy_text import discrete
import numpy as np

MAP = [
    "+---------+",
    "|R: | : :G|",
    "| : | : : |",
    "| : : : : |",
    "| | : | : |",
    "|Y| : |B: |",
    "+---------+",
]


# taxi_row, taxi_col, pass_idx, dest_idx = 3, 
# taxi_row, taxi_col, pass_idx, dest_idx = self.decode(self.s)


class TaxiEnv(discrete.DiscreteEnv):
    """
    The Taxi Problem
    from "Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition"
    by Tom Dietterich
    Description:
    There are four designated locations in the grid world indicated by R(ed), G(reen), Y(ellow), and B(lue). When the episode starts, the taxi starts off at a random square and the passenger is at a random location. The taxi drives to the passenger's location, picks up the passenger, drives to the passenger's destination (another one of the four specified locations), and then drops off the passenger. Once the passenger is dropped off, the episode ends.
    Observations:
    There are 500 discrete states since there are 25 taxi positions, 5 possible locations of the passenger (including the case when the passenger is in the taxi), and 4 destination locations. 
    Passenger locations:
    - 0: R(ed)
    - 1: G(reen)
    - 2: Y(ellow)
    - 3: B(lue)
    - 4: in taxi
    Destinations:
    - 0: R(ed)
    - 1: G(reen)
    - 2: Y(ellow)
    - 3: B(lue)
    Actions:
    There are 6 discrete deterministic actions:
    - 0: move south
    - 1: move north
    - 2: move east
    - 3: move west
    - 4: pickup passenger
    - 5: drop off passenger
    Rewards:
    There is a default per-step reward of -1,
    except for delivering the passenger, which is +20,
    or executing "pickup" and "drop-off" actions illegally, which is -10.
    Rendering:
    - blue: passenger
    - magenta: destination
    - yellow: empty taxi
    - green: full taxi
    - other letters (R, G, Y and B): locations for passengers and destinations
    state space is represented by:
        (taxi_row, taxi_col, passenger_location, destination)
    """
    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self):
        self.desc = np.asarray(MAP, dtype='c')

        self.locs = locs = [(0, 0), (0, 4), (4, 0), (4, 3)]

        num_states = 500
        num_rows = 5
        num_columns = 5
        max_row = num_rows - 1
        max_col = num_columns - 1
        initial_state_distrib = np.zeros(num_states)
        num_actions = 6
        P = {state: {action: []
                     for action in range(num_actions)} for state in range(num_states)}
        for row in range(num_rows):
            for col in range(num_columns):
                for pass_idx in range(len(locs) + 1):  # +1 for being inside taxi
                    for dest_idx in range(len(locs)):
                        state = self.encode(row, col, pass_idx, dest_idx)
                        if pass_idx < 4 and pass_idx != dest_idx:
                            initial_state_distrib[state] += 1
                        for action in range(num_actions):
                            # defaults
                            new_row, new_col, new_pass_idx = row, col, pass_idx
                            reward = -1  # default reward when there is no pickup/dropoff
                            done = False
                            taxi_loc = (row, col)

                            if action == 0:
                                new_row = min(row + 1, max_row)
                            elif action == 1:
                                new_row = max(row - 1, 0)
                            if action == 2 and self.desc[1 + row, 2 * col + 2] == b":":
                                new_col = min(col + 1, max_col)
                            elif action == 3 and self.desc[1 + row, 2 * col] == b":":
                                new_col = max(col - 1, 0)
                            elif action == 4:  # pickup
                                if (pass_idx < 4 and taxi_loc == locs[pass_idx]):
                                    new_pass_idx = 4
                                else:  # passenger not at location
                                    reward = -10
                            elif action == 5:  # dropoff
                                if (taxi_loc == locs[dest_idx]) and pass_idx == 4:
                                    new_pass_idx = dest_idx
                                    done = True
                                    reward = 20
                                elif (taxi_loc in locs) and pass_idx == 4:
                                    new_pass_idx = locs.index(taxi_loc)
                                else:  # dropoff at wrong location
                                    reward = -10
                            new_state = self.encode(
                                new_row, new_col, new_pass_idx, dest_idx)
                            P[state][action].append(
                                (1.0, new_state, reward, done))
        initial_state_distrib /= initial_state_distrib.sum()
        discrete.DiscreteEnv.__init__(
            self, num_states, num_actions, P, initial_state_distrib)

    def encode(self, taxi_row, taxi_col, pass_loc, dest_idx):
        # (5) 5, 5, 4
        i = taxi_row
        i *= 5
        i += taxi_col
        i *= 5
        i += pass_loc
        i *= 4
        i += dest_idx
        return i

    def decode(self, i):
        out = []
        out.append(i % 4)
        i = i // 4
        out.append(i % 5)
        i = i // 5
        out.append(i % 5)
        i = i // 5
        out.append(i)
        assert 0 <= i < 5
        return reversed(out)

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout

        out = self.desc.copy().tolist()
        out = [[c.decode('utf-8') for c in line] for line in out]
        taxi_row, taxi_col, pass_idx, dest_idx = self.decode(self.s)

        def ul(x): return "_" if x == " " else x
        if pass_idx < 4:
            out[1 + taxi_row][2 * taxi_col + 1] = utils.colorize(
                out[1 + taxi_row][2 * taxi_col + 1], 'yellow', highlight=True)
            pi, pj = self.locs[pass_idx]
            out[1 + pi][2 * pj + 1] = utils.colorize(out[1 + pi][2 * pj + 1], 'blue', bold=True)
        else:  # passenger in taxi
            out[1 + taxi_row][2 * taxi_col + 1] = utils.colorize(
                ul(out[1 + taxi_row][2 * taxi_col + 1]), 'green', highlight=True)

        di, dj = self.locs[dest_idx]
        out[1 + di][2 * dj + 1] = utils.colorize(out[1 + di][2 * dj + 1], 'magenta')
        outfile.write("\n".join(["".join(row) for row in out]) + "\n")
        if self.lastaction is not None:
            outfile.write("  ({})\n".format(["South", "North", "East", "West", "Pickup", "Dropoff"][self.lastaction]))
        else:
            outfile.write("\n")

        # No need to return anything for human
        if mode != 'human':
            with closing(outfile):
                return outfile.getvalue()
            

In [45]:
import gym
env = TaxiEnv()
state = env.encode(2, 2, 0, 3)
print("State:", state)
env.s = state
env.render()

State: 243
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+



In [46]:
labels = [0, 1, 2, 3, 4, 5]
lb = LabelBinarizer()
lb.fit(labels)
lb.transform([1])

array([[0, 1, 0, 0, 0, 0]])

In [132]:
numEpisodes = 1000
discountFactor = 1
alpha = 0.01
nA = env.action_space.n

w = np.zeros((nA,100))
epRewards = np.zeros(numEpisodes)

In [133]:
epRewards.shape

(1000,)

In [134]:
w.shape

(6, 100)

In [135]:
env.observation_space.sample()

71

In [136]:
state = 243 # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

State: 243
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+



In [137]:
observation_examples = np.array([list(env.decode(env.observation_space.sample())) for x in range(1000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

StandardScaler()

In [138]:
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

StandardScaler()

In [139]:
featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=20)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=20)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=20)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=20)),
        ("rbf5", RBFSampler(gamma=0.5, n_components=20)),
        ])

featurizer.fit(scaler.transform(observation_examples))

FeatureUnion(transformer_list=[('rbf1', RBFSampler(gamma=5.0, n_components=20)),
                               ('rbf2', RBFSampler(gamma=2.0, n_components=20)),
                               ('rbf3', RBFSampler(n_components=20)),
                               ('rbf4', RBFSampler(gamma=0.5, n_components=20)),
                               ('rbf5',
                                RBFSampler(gamma=0.5, n_components=20))])

In [140]:
observation_examples

array([[1, 3, 3, 0],
       [2, 4, 1, 0],
       [3, 0, 4, 3],
       ...,
       [0, 1, 3, 1],
       [4, 4, 4, 1],
       [1, 2, 4, 3]])

In [125]:
# state_feature = list(env.decode(232))
# # state_feature
# scaler.transform([state_feature])
# A = featurizer.transform(scaler.transform([state_feature]))
# A.shape

(1, 500)

In [None]:
featurizer

In [141]:
def featurize_state(state):
    scaled = scaler.transform([state])
    #featurized = featurizer.transform(scaled)
    return scaled

def policy(state, weight, epsilon=0.5):
    A = np.ones(nA,dtype=float) * epsilon/nA
    best_action =  np.argmax([state.dot(w[a]) for a in range(nA)])
    A[best_action] += (1.0-epsilon)
    sample = np.random.choice(nA,p=A)
    return sample

In [142]:
env.observation_space.sample()

124

In [None]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# For plotting metrics
all_epochs = []
all_penalties = []
all_rewards = []

for e in range(numEpisodes):
    state = env.reset()

    epochs, penalties, cum_reward, = 0, 0, 0
    done = False
    
    count = 0 
    while not done:
        
        state_feature = list(env.decode(state))
        state_feature = featurize_state(state_feature)
        action = policy(state_feature,w)

        next_state, reward, done, info = env.step(action) 
        next_state_feature = list(env.decode(next_state))
        next_state_feature = featurize_state(next_state_feature)
        
        next_action = policy(next_state_feature,w)
        
        epRewards[e] += reward
        
        target = reward + discountFactor * next_state_feature.dot(w[next_action])
        td_error = state_feature.dot(w[action]) - target

        dw = (td_error).dot(state_feature)

        state = next_state
        
        w[action] -= alpha * dw

        cum_reward += reward
        epochs += 1
        
        count +=1
        if count >=5000:
            break
        
    all_epochs.append(epochs)
    all_penalties.append(penalties)
    all_rewards.append(cum_reward)
    
    clear_output(wait=True)
    print("Episode: {}".format(e))

print("Training finished.\n")


Episode: 2


In [144]:
data_tuples = list(zip(list(range(1, 999)), all_epochs, all_rewards))
learning_df = pd.DataFrame(data_tuples, columns=['episode','epochs_to_complete', 'rewards'])
learning_df

Unnamed: 0,episode,epochs_to_complete,rewards
0,1,9168,-22845
1,2,2772,-6297
2,3,16666,-41350
3,4,3184,-7663
4,5,11708,-29174
5,6,10233,-25044
6,7,3345,-8346
7,8,5520,-13716
8,9,61607,-153287
9,10,589,-1783


In [95]:
w

array([[-1.10152032e-01, -9.71559971e-01,  5.85536051e+00,
        -5.99425533e+02],
       [-1.31932947e-01, -9.87866252e-01,  5.80903221e+00,
        -5.99468542e+02],
       [-1.43683196e-01, -9.93196132e-01,  5.75525249e+00,
        -5.99705071e+02],
       [-1.35183773e-01, -9.38246897e-01,  5.91348655e+00,
        -5.99483197e+02],
       [-4.60403215e-01, -5.82930322e-01,  4.67784438e+00,
        -6.03225298e+02],
       [ 7.83535948e-01, -1.21879389e+00,  6.21780156e+00,
        -6.03082166e+02]])

In [97]:
epochs = 0
penalties, reward = 0, 0

frames = [] # for animation
rewards = []

state = 243
env.s = state

done = False

while not done:
    
    state_feature = list(env.decode(state))
    state_feature = featurize_state(state_feature)
    action = policy(state_feature,w)
    
    state, reward, done, info = env.step(action)

    if reward == -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )
    rewards.append(reward)

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))


KeyboardInterrupt: 

In [96]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print("Timestep: {}".format(i + 1))
        print("State: {}".format(frame['state']))
        print("Action: {}".format(frame['action']))
        print("Reward: {}".format(frame['reward']))
        sleep(0.1)
        
print_frames(frames)

NameError: name 'frames' is not defined

In [63]:
featurize_state([state])

array([[-0.11456669,  0.28164057, -0.09517834,  0.19615317, -0.269186  ,
        -0.12565423,  0.27832016,  0.25468262, -0.17450584,  0.28181043,
         0.28269382,  0.27171563, -0.12153166, -0.12417229,  0.24794124,
        -0.26318235,  0.17458221,  0.28274556,  0.10750719, -0.2749872 ,
        -0.27676149,  0.06463112,  0.17141615, -0.27738014,  0.05040022]])