In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from IPython.display import clear_output
from IPython import display
import gym
import pandas as pd
import gym_minigrid

import seaborn as sns
import matplotlib.pylab as plt
import numpy as np
from tqdm import tqdm
from collections import defaultdict, deque
from random import random
from sklearn.preprocessing import MinMaxScaler
from minigrid_wrappers import CoordsObsWrapper, FourDirectionsActionWrapper, RewardWrapper

from matplotlib.colors import Normalize
# Shared normaliser for Q-value plots: linearly rescales values so that -1 maps to 0 and 1 maps to 1.
# Used by plot_q_value_func before the values are turned into colours.
normalize_qs = Normalize(vmin = -1, vmax = 1)

# Utils

In [None]:
from matplotlib import pyplot as plt
from IPython.display import clear_output
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np


def render(env, mode='rgb'):
    """Show the current frame of ``env`` in the notebook output area.

    Args:
        env: Environment whose ``render(mode=..., highlight=False)`` returns
            an image that ``plt.imshow`` can display.
        mode: Render mode forwarded unchanged to ``env.render``.
    """
    frame = env.render(mode=mode, highlight=False)
    plt.imshow(frame, interpolation='nearest')
    # wait=True defers clearing until new output arrives.
    clear_output(wait=True)
    # A very short pause gives the backend a chance to draw the frame.
    plt.pause(0.00001)


def draw_triangle(ax, p1=[1, 1], p2=[2, 2.5], p3=[3, 1], c='blue', alpha=0.3):
    """Draw one filled triangle on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        p1, p2, p3: The three ``[x, y]`` vertices of the triangle.
        c: Fill colour, used for both the vertex markers and the patch.
        alpha: Opacity of the filled patch.
    """
    vertices = np.array([p1, p2, p3])
    # Near-invisible markers (s=0.001) on the vertices; presumably present so the
    # axes limits take the triangle into account -- TODO confirm.
    ax.scatter(vertices[:, 0], vertices[:, 1], s=0.001, color=[c, c, c])
    ax.add_patch(plt.Polygon(vertices[:3, :], color=c, alpha=alpha))


def draw_square(ax,
                lb_x,
                lb_y,
                size,
                alpha=1,
                l_color='green',  # left
                r_color='yellow',  # right
                b_color='red',  # down
                t_color='blue'):  # up
    """Draw four action triangles inside one grid cell.

    The cell is the square whose corner with the smallest coordinates is
    ``(lb_x, lb_y)`` and whose side length is ``size``. One triangle is drawn
    per action (left, right, down, up), each pointing from a small central
    square towards the corresponding cell edge.

    Args:
        ax: Matplotlib axes to draw on.
        lb_x, lb_y: Coordinates of the cell corner with the smallest x and y.
        size: Side length of the cell.
        alpha: Opacity passed to every triangle.
        l_color, r_color, b_color, t_color: Colour of the left / right /
            down / up triangle. A number is looked up in the ``'RdYlGn'``
            colormap; a string is used directly as a Matplotlib colour name.
    """
    tr_x = lb_x + size  # opposite corner x
    tr_y = lb_y + size  # opposite corner y

    md_x = lb_x + size/2  # middle x
    md_y = lb_y + size/2  # middle y

    # Side length of the central square the triangle bases sit on.
    width = 10

    md_x_plus = md_x + width/2
    md_x_minus = md_x - width/2

    md_y_plus = md_y + width/2
    md_y_minus = md_y - width/2

    # plt.get_cmap works across Matplotlib versions; cm.get_cmap was removed in 3.9.
    colormap = plt.get_cmap('RdYlGn')

    def _resolve(value):
        # The string defaults ('green', ...) are colour names, not colormap
        # inputs; only numeric values are mapped through the colormap.
        return value if isinstance(value, str) else colormap(value)

    color_right = _resolve(r_color)
    color_down = _resolve(b_color)
    color_left = _resolve(l_color)
    color_up = _resolve(t_color)

    # Each apex sits 2 units inside the cell edge it points at.
    draw_triangle(ax, [lb_x+2, md_y], [md_x_minus, md_y_minus+2],
                  [md_x_minus, md_y_plus-2], c=color_left, alpha=alpha)  # left
    draw_triangle(ax, [tr_x-2, md_y], [md_x_plus, md_y_minus+2],
                  [md_x_plus, md_y_plus-2], c=color_right, alpha=alpha)  # right
    draw_triangle(ax, [md_x, tr_y-2], [md_x_minus+2, md_y_plus],
                  [md_x_plus-2, md_y_plus], c=color_down, alpha=alpha)  # down
    draw_triangle(ax, [md_x, lb_y+2], [md_x_minus+2, md_y_minus],
                  [md_x_plus-2, md_y_minus], c=color_up, alpha=alpha)  # up

In [None]:
def plot_return(ax, returns):
    """Plot per-episode returns and their 10-episode rolling mean on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        returns: Sequence with one return value per episode.
    """
    label_calls = (
        (ax.set_xlabel, 'Episodes'),
        (ax.set_ylabel, 'Return'),
        (ax.set_title, 'Return versus Episodes'),
    )
    for apply_label, text in label_calls:
        apply_label(text)
    ax.set_ylim(-2.5, 1.5)

    # Raw returns as a thin dashed line ...
    raw_curve = np.array(returns)
    ax.plot(raw_curve, linestyle='dashed', color='silver', lw=1.5)
    # ... overlaid with the rolling mean (NaN for the first 9 episodes).
    smoothed_curve = pd.Series(returns).rolling(window=10).mean().values
    ax.plot(smoothed_curve, lw=2)

def plot_epsilon(ax, epsilon_history):
    """Plot the exploration rate used in each episode on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        epsilon_history: Sequence with one epsilon value per episode.
    """
    ax.set_ylim(0, 1)
    label_calls = (
        (ax.set_xlabel, 'Episodes'),
        (ax.set_ylabel, 'Epsilon'),
        (ax.set_title, 'Epsilon versus Episodes'),
    )
    for apply_label, text in label_calls:
        apply_label(text)
    ax.plot(epsilon_history)

def plot_q_value_func(ax, table, env, alpha = 1, img = None):
    """Overlay a Q-table on top of the rendered grid.

    Args:
        ax: Matplotlib axes to draw on.
        table: Mapping from a two-component state key to a length-4 array of
            action values; the key components are labelled ``y`` and ``x``
            and the values ``right, down, left, up``, in that order.
        env: Environment; ``env.grid.render(tile_size=32)`` supplies the
            background when ``img`` is not given.
        alpha: Opacity of the Q-value triangles.
        img: Optional pre-rendered background image.
    """
    action_cols = ['right', 'down', 'left', 'up']

    # One row per state: the two key components followed by the four action values.
    q_frame = pd.DataFrame(table).T.reset_index()
    q_frame.columns = ['y', 'x'] + action_cols
    q_frame = q_frame.sort_values(by=['y', 'x'])

    # Each grid tile is 32 pixels wide, matching tile_size below.
    q_frame['lb_x'] = q_frame['x']*32
    q_frame['lb_y'] = q_frame['y']*32

    q_frame[action_cols] = normalize_qs(q_frame[action_cols])

    background = env.grid.render(tile_size=32) if img is None else img
    ax.imshow(background, interpolation='nearest')

    for cell in q_frame.itertuples(index=False):
        draw_square(ax,
                    cell.lb_x,
                    cell.lb_y,
                    32,
                    alpha=alpha,
                    r_color=cell.right,
                    b_color=cell.down,
                    l_color=cell.left,
                    t_color=cell.up)

# Getting Started with OpenAI Gym

## Interface

### Attributes
`action_space`

`observation_space`

### Methods
`def step`

`def reset`

`def render`

### Wrappers

## Gym Minigrid 
https://github.com/maximecb/gym-minigrid

## Example Minigrid Environments
<ol>
    <li>MiniGrid-Empty-8x8-v0</li>
    <li>MiniGrid-DistShift1-v0</li>
    <li>MiniGrid-DistShift2-v0</li>
    <li>MiniGrid-LavaCrossingS9N3-v0</li>
</ol>

Actions in the basic Minigrid environment:
<ol>
<li>Turn left</li>
<li>Turn right</li>
<li>Move forward</li>
<li>Pick up an object</li>
<li>Drop the object being carried</li>
<li>Toggle (open doors, interact with objects)</li>
<li>Done (task completed, optional)</li>
</ol>

In [None]:
env = gym.make("MiniGrid-DistShift1-v0") # create the raw (unwrapped) MiniGrid environment by its registered id

In [None]:
env.action_space # display the action space of the environment

In [None]:
env.observation_space # display the observation space of the environment

In [None]:
# Stack the project wrappers (imported from minigrid_wrappers) around the environment.
# Each wrapper takes the previous environment and returns the wrapped one.
env = RewardWrapper(env)  # presumably reshapes the reward signal -- see minigrid_wrappers to confirm
env = CoordsObsWrapper(env)  # agent coordinates become the observation
env = FourDirectionsActionWrapper(env)  # action space reduced to four movement directions

In [None]:
import time
# Random-policy rollout: take 1000 uniformly sampled actions and draw every frame.
obs = env.reset() # reset and get an initial observation

for _ in range(1000):
    action = env.action_space.sample() # sample a random action from the action space
    obs, reward, done, _ = env.step(action) # perform the action and receive an observation, reward, and if the episode is done
    render(env) # render the environment
    if done: # if the episode has terminated
        obs = env.reset() # reset the environment and keep the fresh initial observation
    # Uncomment to slow the animation down:
    # time.sleep(0.5)

# Policy Evaluation

## Monte Carlo Policy Evaluation

TODO: Explain Monte Carlo policy evaluation (add notes, pictures, and equations).

In [None]:
import warnings
# Silence every Python warning from here on (this affects the whole kernel session, not only this cell).
warnings.filterwarnings('ignore')


def MCPE(episodes, env, gamma=1.0):
    """Every-visit Monte Carlo evaluation of the uniform-random policy.

    Runs ``episodes`` episodes with actions drawn from
    ``env.action_space.sample()``, accumulates the return observed after each
    visit to a state, and averages those returns per state. Every 50th
    episode the current estimate is drawn as a heatmap over the rendered grid.

    Args:
        episodes: Number of episodes to sample.
        env: Environment exposing ``height``, ``width``, ``reset()``,
            ``step(action)``, ``action_space.sample()`` and
            ``grid.render(tile_size=...)``. States are used directly as
            indices into a ``(height, width)`` array, so they are assumed to
            be index tuples -- TODO confirm against the observation wrapper.
        gamma: Discount factor applied to future rewards. The default of 1.0
            reproduces the undiscounted return.

    Returns:
        ``(height, width)`` array of value estimates rounded to 3 decimals.
        Cells that were never visited are NaN.
    """
    returns = np.zeros((env.height, env.width))
    v_count = np.zeros((env.height, env.width))

    def _current_estimate():
        # Average only where a visit happened; unvisited cells stay NaN
        # without triggering a 0/0 RuntimeWarning.
        means = np.divide(returns, v_count,
                          out=np.full_like(returns, np.nan),
                          where=v_count > 0)
        return np.round(means, 3)

    for i in range(episodes):
        rewards = []
        states = []
        state = env.reset()
        done = False

        # Roll out one episode, recording the state each reward was earned from.
        while not done:
            states.append(state)
            state, reward, done, _ = env.step(env.action_space.sample())
            rewards.append(reward)

        # Walk the episode backwards so G is the return from each state onwards.
        G = 0
        for state, reward in zip(states[::-1], rewards[::-1]):
            G = reward + gamma*G
            v_count[state] += 1
            returns[state] += G

        if (i+1) % 50 == 0:
            # Plot the estimate that already includes the episode just finished.
            value_function = _current_estimate()
            clear_output(wait=True)
            fig, ax = plt.subplots(figsize=(8,5))
            ax = sns.heatmap(value_function, cmap='RdYlGn', alpha = 0.8, zorder=2,
                             annot = True, fmt='.3f', ax=ax)
            img = env.grid.render(tile_size=32)
            # Stretch the grid image underneath the heatmap cells.
            ax.imshow(img,
                      aspect = ax.get_aspect(),
                      extent = ax.get_xlim() + ax.get_ylim(),
                      interpolation='nearest', zorder = 1)
            plt.pause(0.00001)
            # Release the figure so repeated plots do not pile up in memory.
            plt.close(fig)

    return _current_estimate()

In [None]:
# Evaluate the random policy on DistShift2 (no RewardWrapper is applied in this cell).
env = gym.make("MiniGrid-DistShift2-v0") # create environment
env = CoordsObsWrapper(env) # wrap to get coordinates of agent as observation
env = FourDirectionsActionWrapper(env) # wrap to simplify action space (4 directional movement)
value_function = MCPE(2000, env) # sample 2000 episodes; returns the per-cell value estimates

## Epsilon Greedy

In [None]:
def e_greedy(env, qtable, state, epsilon):
    """Pick an action epsilon-greedily.

    Args:
        env: Environment; ``env.action_space.sample()`` supplies the
            exploratory action.
        qtable: Mapping from state to an array of action values.
        state: Key of the current state in ``qtable``.
        epsilon: Probability of taking an exploratory action.

    Returns:
        The sampled action when exploring, otherwise the index of the
        largest action value for ``state``.
    """
    explore = random() < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(qtable[state])

## Monte Carlo Control

In [None]:
from collections import namedtuple
transition = namedtuple('transition', 'state action')

def MC_Control(episodes, env, epsilon=0.1):
    """Every-visit Monte Carlo control with an epsilon-greedy policy.

    Each episode is generated by acting epsilon-greedily on the current
    Q-table. Afterwards every visited (state, action) pair has its value moved
    towards the undiscounted return observed from that step onwards, so the
    value is the running mean of all returns seen for the pair. The Q-table is
    plotted every 100th episode.

    Args:
        episodes: Number of episodes to run.
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.n``.
        epsilon: Exploration probability passed to ``e_greedy``.

    Returns:
        ``defaultdict`` mapping each state to an array of action values.
    """
    n_actions = env.action_space.n
    q_table = defaultdict(lambda: np.zeros(shape=(n_actions,)))
    # Visit counts allow a running mean without storing every past return.
    visit_counts = defaultdict(lambda: np.zeros(shape=(n_actions,)))

    for ep in tqdm(range(episodes)):
        episode = []
        rewards = []
        state = env.reset()
        done = False
        while not done:
            action = e_greedy(env, q_table, state, epsilon)
            episode.append((state, action))
            state, reward, done, _ = env.step(action)
            rewards.append(reward)

        # Walk the episode backwards so G is the return from each step onwards.
        G = 0
        for (state, action), reward in zip(episode[::-1], rewards[::-1]):
            G += reward
            visit_counts[state][action] += 1
            # Incremental form of mean(all returns observed for this pair).
            q_table[state][action] += (
                (G - q_table[state][action]) / visit_counts[state][action])

        if ep % 100 == 0:
            clear_output(wait = True)
            fig, ax = plt.subplots()
            plot_q_value_func(ax, q_table, env)
            plt.pause(0.00001)
            # Release the figure so repeated plots do not pile up in memory.
            plt.close(fig)
    return q_table

In [None]:
# Learn a Q-table on DistShift2 with Monte Carlo control (no RewardWrapper is applied in this cell).
env = gym.make("MiniGrid-DistShift2-v0") # create environment
env = CoordsObsWrapper(env) # wrap to get coordinates of agent as observation
env = FourDirectionsActionWrapper(env) # wrap to simplify action space (4 directional movement)
q_table = MC_Control(2000, env) # run 2000 episodes; returns the learned Q-table

In [None]:
def demo(epsilon, env, qtable):
    """Play five rendered episodes following ``qtable`` epsilon-greedily.

    Args:
        epsilon: Exploration probability passed to ``e_greedy``.
        env: Environment to act in.
        qtable: Mapping from state to an array of action values.
    """
    state = env.reset()
    for episode_idx in range(5):
        print(episode_idx)
        finished = False
        while not finished:
            chosen = e_greedy(env, qtable, state, epsilon)
            state, _reward, finished, _ = env.step(chosen)
            render(env)
        # Start the next episode from a fresh initial state.
        state = env.reset()


# epsilon=0 means the demo always follows the greedy action.
demo(0, env, q_table)

In [None]:
# Slicing refresher for the backwards episode walk used in the Monte Carlo code above.
mylist = list(range(1, 7))
for view in (
    mylist,        # the full list
    mylist[:-1],   # everything except the last element
    mylist[::-1],  # a reversed copy
):
    print(view)

## Temporal Difference Policy Evaluation

# Control

In [None]:


class Agent:
    """
    Base class for tabular epsilon-greedy agents.

    Owns the wrapped environment, the Q-table (a ``defaultdict`` mapping a
    state to an array with one value per action), the linear epsilon schedule
    and the plotting / demo helpers. Subclasses implement ``train``.
    """
    def __init__(self,
                 env_name,
                 learning_rate=0.1,
                 eps_start = 1,
                 eps_end= 0.1,
                 eps_end_episode = 100,
                 episodes=1000,
                 gamma=0.95):
        """
        Args:
            env_name: Gym id of the environment to create.
            learning_rate: Step size stored for the subclasses' updates.
            eps_start: Epsilon used for episode 0.
            eps_end: Lower bound epsilon decays to.
            eps_end_episode: Episode index at which epsilon reaches ``eps_end``.
            episodes: Number of training episodes.
            gamma: Discount factor stored for the subclasses' updates.
        """
        # Action labels, in the order of the Q-table columns.
        self.action_space = ['right', 'down', 'left', 'up']
        self.env_name = env_name
        self.env = self.create_env(env_name)
        self.learning_rate = learning_rate
        self.mode = 'rgb'
        self.size = 32

        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_end_episode = eps_end_episode

        self.gamma = gamma
        self.episodes = episodes
        num_actions = self.env.action_space.n
        # Unseen states start with all-zero action values.
        self.table = defaultdict(lambda: np.zeros(shape=(num_actions,)))

        self.episode_return_history = []
        self.epsilon_history = []

    def get_epsilon(self, episode):
        """Return epsilon for ``episode``: linear from ``eps_start`` to ``eps_end``.

        The line reaches ``eps_end`` at ``eps_end_episode`` and is clamped
        there afterwards. A non-positive ``eps_end_episode`` means no decay
        phase, so ``eps_end`` is returned directly instead of dividing by zero.
        """
        if self.eps_end_episode <= 0:
            return self.eps_end
        slope = (self.eps_end - self.eps_start) / self.eps_end_episode
        return max(self.eps_end, slope*episode + self.eps_start)

    def create_env(self, env_name):
        """Create ``env_name`` and wrap it with the three project wrappers."""
        env = gym.make(env_name)
        env = RewardWrapper(env)
        env = CoordsObsWrapper(env)
        env = FourDirectionsActionWrapper(env)
        return env

    def e_greedy(self, state, epsilon):
        """Sample a random action with probability ``epsilon``, else the greedy one."""
        if random() < epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.table[state])

    def train(self):
        """Run the learning loop; implemented by subclasses."""
        raise NotImplementedError

    def log_metrics(self, episode_return, epsilon):
        """Append one episode's return and epsilon to the history lists."""
        self.episode_return_history.append(episode_return)
        self.epsilon_history.append(epsilon)

    def plot(self):
        """Redraw the training dashboard: returns, epsilon and the Q-table."""
        clear_output(wait = True)
        fig = plt.figure(figsize=(15,10))
        ax1 = fig.add_subplot(2, 2, 1)
        ax2 = fig.add_subplot(2, 2, 2)
        ax3 = fig.add_subplot(2, 1, 2)
        plot_return(ax1, self.episode_return_history)
        plot_epsilon(ax2, self.epsilon_history)
        plot_q_value_func(ax3, self.table, self.env)
        plt.pause(0.000001)
        # Release the figure so repeated plots do not pile up in memory.
        plt.close(fig)

    def demo_plot(self, state):
        """Draw the grid with the Q-table overlay next to a bar chart of ``state``'s Q-values."""
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,5),
                                       gridspec_kw={'width_ratios': [2, 1]})
        img = self.env.render(mode='rgb', highlight=False)
        plot_q_value_func(ax1, self.table, self.env, alpha = 0.4, img = img)

        # Fix the bar-chart range explicitly on its own axes rather than
        # relying on whichever axes pyplot currently considers active.
        ax2.set_ylim(-0.1, 1)
        ax2.set_ylabel('Q-Values')
        ax2.set_title('Q-Values')
        sns.barplot(x=self.action_space, y=self.table[state], ax = ax2)
        clear_output(wait = True)
        plt.pause(0.5)
        # Release the figure so repeated plots do not pile up in memory.
        plt.close(fig)

    def demo(self, epsilon, random_start = True):
        """Play five episodes with the learned table, plotting every step.

        Args:
            epsilon: Exploration probability used while acting.
            random_start: Forwarded to ``self.env.reset(random_start=...)``;
                the wrapped environment is assumed to accept this keyword --
                verify against minigrid_wrappers.
        """
        state = self.env.reset(random_start = random_start)
        for i in range(5):
            print(i)
            done = False
            while not done:
                self.demo_plot(state)
                action = self.e_greedy(state, epsilon)
                state, reward, done, _ = self.env.step(action)
            state = self.env.reset(random_start = random_start)

## SARSA

Training a SARSA agent

TODO: add notes, pictures, and equations for SARSA.

In [None]:
class AgentSarsa(Agent):
    """Tabular SARSA agent (on-policy TD control).

    Construction is inherited unchanged from ``Agent``.
    """

    def update_sarsa(self, action, state, next_state, next_action, reward, done):
        """Apply one SARSA update to ``self.table[state][action]``.

        The target is ``reward + gamma * Q(next_state, next_action)``, or just
        ``reward`` when ``done`` is true. ``next_state`` and ``next_action``
        are not read in the terminal case.
        """
        v_estimate = 0 if done else self.gamma*self.table[next_state][next_action]
        self.table[state][action] += self.learning_rate*(
            reward + v_estimate - self.table[state][action])

    def train(self):
        """Train for ``self.episodes`` episodes, logging metrics after each one.

        The training dashboard is redrawn every 10th episode.
        """
        for ep in range(self.episodes):
            epsilon = self.get_epsilon(ep)
            # Every episode starts fresh, and its first action already uses
            # this episode's epsilon.
            state = self.env.reset()
            action = self.e_greedy(state, epsilon)
            episode_return = 0
            done = False

            while not done:
                next_state, reward, done, _ = self.env.step(action)
                episode_return += reward
                # No follow-up action exists after a terminal transition; binding
                # None keeps the name defined even if the very first step ends
                # the episode.
                next_action = None if done else self.e_greedy(next_state, epsilon)
                self.update_sarsa(action, state, next_state,
                                  next_action, reward, done)
                state = next_state
                action = next_action

            self.log_metrics(episode_return, epsilon)

            if ep % 10 == 0:
                self.plot()

In [None]:
agent_sarsa = AgentSarsa(env_name='MiniGrid-DistShift1-v0', episodes=100) # train for 100 episodes; other settings keep their defaults
agent_sarsa.train()

In [None]:
# epsilon=0: the demo always takes the greedy action of the learned table.
agent_sarsa.demo(0)

## Q-learning

Training a Q-learning agent

TODO: add some notes and theory about Q-learning.

In [None]:
class AgentQL(Agent):
    """Tabular Q-learning agent (off-policy TD control).

    Construction is inherited unchanged from ``Agent``.
    """

    def update_q_table(self, action, state, next_state, reward, done):
        """Apply one Q-learning update to ``self.table[state][action]``.

        The target is ``reward + gamma * max_a Q(next_state, a)``, or just
        ``reward`` when ``done`` is true.
        """
        v_estimate = 0 if done else self.gamma*np.max(self.table[next_state])
        self.table[state][action] += self.learning_rate*(
            reward + v_estimate - self.table[state][action])

    def train(self):
        """Train for ``self.episodes`` episodes, logging metrics after each one.

        The training dashboard is redrawn every 10th episode.
        """
        for ep in range(self.episodes):
            epsilon = self.get_epsilon(ep)
            # All per-episode state is initialised here, at the top of the episode.
            state = self.env.reset()
            episode_return = 0
            done = False

            while not done:
                action = self.e_greedy(state, epsilon)
                next_state, reward, done, _ = self.env.step(action)
                episode_return += reward
                self.update_q_table(action, state, next_state, reward, done)
                state = next_state

            self.log_metrics(episode_return, epsilon)

            if ep % 10 == 0:
                self.plot()

In [None]:
agent_q = AgentQL(env_name='MiniGrid-DistShift1-v0') # all hyper-parameters keep their defaults (episodes=1000)
agent_q.train()

In [None]:
# epsilon=0: the demo always takes the greedy action of the learned table.
agent_q.demo(0)

In [None]:
# Compare the distributions of episode returns of the two agents.
# The histories can differ in length (the agents may be trained for a different
# number of episodes); wrapping them in Series lets pandas align them by index
# and pad the shorter one with NaN instead of raising on unequal lengths.
dis_df = pd.DataFrame({
    'SARSA': pd.Series(agent_sarsa.episode_return_history),
    'Q-Learning': pd.Series(agent_q.episode_return_history),
})
sns.displot(dis_df, kind='kde')
plt.show()