# Solving TAXI Environment using QTable

## Imports

In [None]:
# Full imports
import gym
import collections
import os
import datetime

# Aliased imports
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Partial Import
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from tqdm.notebook import tqdm, trange
from IPython.display import clear_output


In [5]:
from tqdm.notebook import tqdm, trange

# Smoke test for the notebook progress bar: looping over an empty body is
# enough to surface a missing-ipywidgets ImportError before training starts
for _ in trange(1000):
    pass

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

## Bootstrap

In [None]:
# Remember to run "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/" if using Linux
# To drop NUMA errors in the terminal: "for a in /sys/bus/pci/devices/*; do echo 0 | sudo tee -a $a/numa_node; done"


# List the GPU devices TensorFlow can see, to confirm a GPU is available
tf.config.list_physical_devices('GPU')

In [None]:
# Make videos folder
# Install asciinema: "apt-get install asciinema"
%mkdir videos
# Start from a clean logs folder: delete any previous one, then recreate it
%rm -rf ./logs
%mkdir logs

# Load the TensorBoard notebook extension
%load_ext tensorboard

## QLearning

### Intro

We have a 5x5 grid, where the pick-up and drop-off locations can each be one of {R, G, B, Y}. Since the passenger may also be inside the taxi, we need to account for one additional passenger state:
* States of the taxi position: $5\times5$
* States of the drop-off locations: $4$
* States of the pick-up/passenger: $4$ (R, G, B, Y) + $1$ (on taxi) 

Then we have $5\times5\times4\times(4+1)=500$ total states.

### DQN

Implementation of a basic DQN model using experience replay and target network:
* **Experience Replay**: Updating the NN in an online manner using the sequential states introduces correlation in the training process, making it unstable. To reduce this problem we'll save past experiences and update the network using random minibatches, allowing us to revisit rare occurrences and learn more from individual experiences.

* **Target Network**: Directly updating Q'(s, a) using as the "real" value (Q(s, a)) the one provided by the Bellman equation is risky. Even though there is just one step of difference when using the Bellman equation, which makes the estimation relatively accurate, distinguishing between Q(s, a) and the Bellman estimation proves difficult for a NN. In addition, altering the value of Q(s, a) might indirectly affect the value of Q(s', a'). To address this issue we'll keep a copy of the main NN, use it to compute Q(s', a'), and update it only periodically.

#### Constants

In [None]:
# Environment identifier passed to gym.make
DEFAULT_ENV = "Taxi-v3"
BATCH_SIZE = 32

# Optimizer step size and reward discount factor
LEARNING_RATE = 1e-4
GAMMA = 0.99

# Epsilon-greedy schedule: epsilon starts at EPS_START, is multiplied by
# EPS_DECAY on every step and is floored at EPS_MIN.
# EPS_DECAY must lie in (0, 1); the previous value 999985 was missing its
# leading "0." and would have made epsilon grow instead of shrink.
EPS_START = 1.0
EPS_DECAY = 0.999985
EPS_MIN = 0.02

# Replay buffer capacity, minibatch size and target-network sync period
REPLAY_BUFFER_SIZE = 10000
TRAINING_BATCH_SIZE = 32
SYNC_TARGET_FRAMES = 1000


#### Environment

In [None]:

# Create the environment named by DEFAULT_ENV
env = gym.make(DEFAULT_ENV)
# Start an episode, then take a single step with action index 3 so the
# notebook displays what one transition returns
env.reset()
env.step(3)

#### Model

In [None]:
# Define basic model using the embedding layer
# We have discrete values, so each state index is looked up in an embedding
# table instead of being fed to the network as a raw number
dqn = tf.keras.Sequential([
    # Input: one integer state id per sample
    tf.keras.layers.Embedding(env.observation_space.n, env.action_space.n, input_length=1),

    tf.keras.layers.Dense(32),

    # Output: one Q-value per action. The width is taken from the environment
    # rather than hard-coded, so the model follows the action space.
    tf.keras.layers.Dense(env.action_space.n),
    # Collapse the length-1 sequence axis so the output is (batch, n_actions)
    tf.keras.layers.Flatten()
])

# Print model summary (summary() prints by itself and returns None, so
# wrapping it in print() would add a stray "None" line)
dqn.summary()

In [None]:
# One recorded environment transition
Experience = collections.namedtuple(
    "Experience",
    field_names=["state", "action", "reward", "new_state", "done"],
)


class ExperienceReplayBuffer:
    """Bounded FIFO store of Experience tuples.

    Once ``buffer_size`` entries are held, appending a new transition drops
    the oldest one.
    """

    def __init__(self, buffer_size):
        self.max_buffer_size = buffer_size
        self.buffer = collections.deque(maxlen=buffer_size)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.buffer)

    def append(self, state, action, reward, new_state, done):
        """Record one transition at the newest end of the buffer."""
        transition = Experience(state, action, reward, new_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a list of five tuples, one per Experience field (states,
        actions, rewards, new states, done flags), each of length
        ``batch_size``.
        """
        # Distinct positions: sampling is done without replacement
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)

        # Transpose the chosen rows into per-field columns
        chosen = [self.buffer[position] for position in picked]
        return list(zip(*chosen))

In [None]:
from functools import reduce


class Agent:
    """Epsilon-greedy DQN agent with experience replay and a target network.

    The agent owns the environment interaction (``step``) and the learning
    update (``train``). The caller decides when the target network is synced
    by passing ``sync=True`` to ``train``.

    Args:
        env: Gym environment using the 5-tuple ``step`` API
            ``(obs, reward, terminated, truncated, info)``.
        model: Keras model mapping a batch of state ids shaped ``(batch, 1)``
            to Q-values shaped ``(batch, n_actions)``.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        eps_i: Initial exploration rate.
        eps_f: Lower bound for the exploration rate.
        eps_d: Multiplicative decay applied to the exploration rate per step.
        batch_size: Number of transitions sampled per training update.
        buffer_size: Capacity of the replay buffer.
        sync_target_frames: Stored for reference only; this class does not
            read it, syncing is driven by the ``sync`` argument of ``train``.
    """

    def __init__(
        self,
        env,
        model,
        lr=0.001,
        gamma=0.7,
        eps_i=1,
        eps_f=0.02,
        eps_d=0.9998,
        batch_size=32,
        buffer_size=10000,
        sync_target_frames=1000,
    ):
        # Save env
        self.env = env

        # Save model
        self.model = model

        # Save hyperparams
        self.lr = lr
        self.gamma = gamma
        self.eps = self.eps_i = eps_i
        self.eps_f = eps_f
        self.eps_d = eps_d
        self.batch_size = batch_size
        self.sync_target_frames = sync_target_frames

        # Gradients are applied manually in _train_step, so the model is not compiled
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        # Define target model. clone_model only copies the architecture and
        # creates fresh weights, so copy the online weights explicitly to make
        # both networks start out identical.
        self.target_model = tf.keras.models.clone_model(model)
        self.target_model.set_weights(model.get_weights())

        # Init experience buffer
        self.buffer = ExperienceReplayBuffer(buffer_size)

        # Def loss
        self.loss_fn = tf.keras.losses.MeanSquaredError()

        # Init state
        self._reset()

    def _reset(self):
        """Start a new episode and remember its initial state."""
        self.state, _ = self.env.reset()

    def step(self):
        """Take one epsilon-greedy step in the environment and record it.

        Returns:
            tuple: ``(reward, done)`` where ``done`` is True when the episode
            ended on this step, either by reaching a terminal state or by
            being truncated (e.g. a time limit). The environment has already
            been reset when ``done`` is True.
        """
        # Decay epsilon
        # This will allow us to transition from exploration to exploitation
        # as the model performs better
        self.eps = max(self.eps * self.eps_d, self.eps_f)

        # Perform action using e-greedy
        if np.random.random() < self.eps:
            # Sample from the agent's own environment, not a global one
            action = self.env.action_space.sample()
        else:
            q_values = self.model(np.array(self.state).reshape(-1, 1))
            action = np.argmax(q_values)

        # Apply the action
        state_new, reward, terminated, truncated, _ = self.env.step(action)

        # Save experience. Only a true terminal state is stored as "done":
        # a truncated episode still has a meaningful next-state value, so the
        # target must keep bootstrapping from it.
        self.buffer.append(self.state, action, reward, state_new, terminated)

        # Update state
        self.state = state_new

        # The episode is over on termination or truncation; in both cases the
        # environment has to be reset before stepping again
        done = terminated or truncated
        if done:
            self._reset()

        return reward, done

    @tf.function
    def _train_step(self, curr_states, actions, next_states, rewards, done_mask):
        """Run one gradient update on the online network.

        Args:
            curr_states: State ids of the sampled transitions.
            actions: Index of the action taken in each transition.
            next_states: State ids reached after each action.
            rewards: Float rewards of the transitions.
            done_mask: Boolean tensor, True where the transition was terminal.

        Returns:
            The scalar loss of this update.
        """
        # Get max Q from target network; terminal transitions have no future value
        next_q_values_all = self.target_model(next_states)
        next_q_values_max = tf.reduce_max(next_q_values_all, axis=1)
        next_q_values_max = tf.where(done_mask, tf.zeros_like(next_q_values_max), next_q_values_max)

        # Compute expected Q values (Bellman target), treated as a constant
        expected_q = tf.stop_gradient(rewards + self.gamma * next_q_values_max)

        # Only the forward pass and the loss need to be recorded on the tape
        with tf.GradientTape() as tape:
            # Get Q values for the performed actions
            curr_q_values_all = self.model(curr_states)
            curr_q_values_sel = tf.gather(curr_q_values_all, actions, batch_dims=1)

            # Compute loss (y_true first, y_pred second)
            loss = self.loss_fn(expected_q, curr_q_values_sel)

        # Update weights
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        return loss

    def train(self, sync=False):
        """Run one training update from a random replay minibatch.

        Nothing happens (including the target sync) until the replay buffer
        has been filled to its capacity.

        Args:
            sync: When True, copy the online weights into the target network
                after the update.
        """
        # If we don't have enough experience yet we don't train
        if len(self.buffer) < self.buffer.max_buffer_size:
            return

        # Get sample from buffer, one column per Experience field. Each column
        # is converted with its own dtype; stacking them into a single array
        # would coerce states, actions, rewards and flags to one common dtype.
        states, actions, rewards, new_states, dones = self.buffer.sample(self.batch_size)

        # As tensors; states use the same (batch, 1) layout as in step()
        current_states = tf.constant(np.asarray(states, dtype=np.int64).reshape(-1, 1))
        next_states = tf.constant(np.asarray(new_states, dtype=np.int64).reshape(-1, 1))
        actions = tf.constant(np.asarray(actions, dtype=np.int64))
        rewards = tf.constant(np.asarray(rewards, dtype=np.float32))
        done_mask = tf.constant(np.asarray(dones, dtype=bool))

        # Train
        self._train_step(current_states, actions, next_states, rewards, done_mask)

        # Sync with target
        if sync:
            self.target_model.set_weights(self.model.get_weights())

In [None]:
# Constants
DEFAULT_ENV = "Taxi-v3"
BATCH_SIZE = 128

LEARNING_RATE = 1e-3
GAMMA = 0.7
EPS_START = 1.0
EPS_DECAY = 0.999965
EPS_MIN = 0.02

REPLAY_BUFFER_SIZE = 10000
SYNC_TARGET_FRAMES = 200
N_EPISODES = 10000


# Agent. Hyperparameters are passed by keyword so a change in the order of
# the constructor's parameters cannot silently swap them.
agent = Agent(
    env,
    dqn,
    lr=LEARNING_RATE,
    gamma=GAMMA,
    eps_i=EPS_START,
    eps_f=EPS_MIN,
    eps_d=EPS_DECAY,
    batch_size=BATCH_SIZE,
    buffer_size=REPLAY_BUFFER_SIZE,
    sync_target_frames=SYNC_TARGET_FRAMES,
)
n_frames = 0  # total environment steps taken across all episodes
stats = {}  # episode index -> mean reward of that episode

for e in (tbar := trange(N_EPISODES)):
    done = False
    rewards = []

    while not done:
        # Take step
        n_frames += 1
        r, done = agent.step()

        # Add rewards
        rewards.append(r)

        # Train, syncing the target network every SYNC_TARGET_FRAMES steps
        agent.train(sync=(n_frames % SYNC_TARGET_FRAMES == 0))

    # Mean reward of the episode, computed once for both the bar and the stats
    mean_reward = np.array(rewards).mean()

    # Update bar
    tbar.set_description(f"Mean reward: {mean_reward: 0.3}, Epsilon: {agent.eps: 0.3}")
    tbar.refresh()

    # Update stats
    stats[e] = mean_reward



In [None]:
# Turn the per-episode mean rewards into a frame indexed by episode number
stats_pd = pd.DataFrame.from_dict(data=stats, orient="index")

# Chart the mean reward over the course of training
stats_pd.plot()