## Set up

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
sns.set_style("darkgrid")
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [9]:
# Windy grid
rewardSize = -1
gamma = 0.95 # discount rate

alpha = 0.85 # learning rate
epsilon = 0.9 # e-greedy 
episodeNum = 100 # number of episodes 
maxSteps = 100

In [10]:
import gym
import gym_gridworlds
from lib.envs.windy_gridworld import WindyGridworldEnv

env = WindyGridworldEnv()

### Utilities

In [11]:
#Function to choose the next action 
@deprecate
def choose_action(state, Q): 
    action=None
    if np.random.uniform(0, 1) < epsilon: 
        action = env.action_space.sample()
    else: 
        action = np.argmax(Q[state, :]) 
    return action 

def createEpsilonGreedyPolicy(Q, epsilon, num_actions): 
    """ 
    Creates an epsilon-greedy policy based 
    on a given Q-function and epsilon. 
       
    Returns a function that takes the state 
    as an input and returns the probabilities 
    for each action in the form of a numpy array  
    of length of the action space(set of possible actions). 
    """
    def policyFunction(state): 
   
        Action_probabilities = np.ones(num_actions, 
                dtype = float) * epsilon / num_actions 
                  
        best_action = np.argmax(Q[state]) 
        Action_probabilities[best_action] += (1.0 - epsilon) 
        return Action_probabilities 
   
    return policyFunction 

## Monte Carlo

![title](img/montecarlo.png)

In [12]:
def Monte_Carlo():
    #Initializing the Q-matrix and Returns matrix
    Q = np.zeros((env.observation_space.n, env.action_space.n)) 
    policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n)
    
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    
    for i in range(episodeNum):
        episode = [] # current episode
        state = env.reset()
        for t in range(maxSteps):
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p = action_probs) 
            
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
            
            sa_in_episode = set([(tuple(x[0]), x[1]) for x in episode])
            
    for state, action in sa_in_episode:
        sa_pair = (state, action)
        first_occur_index = next(i for i, x in enumerate(episode) if (x[0] == state and x[1] == action))
        G = np.sum([gamma**i * x[2] for i,x in enumerate(episode[first_occur_index:])])
        returns_sum[sa_pair] += G
        returns_count[sa_pair] += 1.0
        
        Q[state][action] = returns_sum[sa_pair]/returns_count[sa_pair]
                
    return Q, policy
            

## SARSA(0)

In [13]:
def SARSA_0():
    #Initializing the Q-matrix
    Q = np.zeros((env.observation_space.n, env.action_space.n)) 
    #Initializing the reward  
    reward=0

    # Starting the SARSA learning 
    for episode in range(episodeNum): 
        t = 0
        state1 = env.reset()
        action1 = choose_action(state1, Q) 


        while t < maxSteps: 
            #Getting the next state 
            state2, reward, done, info = env.step(action1) 

            #Choosing the next action 
            action2 = choose_action(state2, Q) 
            
            #Learning the Q-value 
            predict = Q[state1, action1] 
            target = reward + gamma * Q[state2, action2] 
            Q[state1, action1] = Q[state1, action1] + alpha * (target - predict)

            state1 = state2 
            action1 = action2 

            #Updating the respective vaLues 
            t += 1
            reward += rewardSize

            #If at the end of learning process 
            if done: 
                break

    return Q
SARSA_0()

  action1 = choose_action(state1, Q)
  action2 = choose_action(state2, Q)


array([[-18.63634439, -18.66231957, -17.7394141 , -17.52124722],
       [-18.66486933, -17.95019646, -18.34405074, -18.56481229],
       [-18.57295776, -18.33697067, -17.76028073, -18.59881721],
       [-18.69490531, -18.58561998, -18.33855152, -18.50015535],
       [-18.54511456, -17.11211974, -18.53469865, -18.6088891 ],
       [-17.66784954, -17.31736788, -18.08484055, -18.54456387],
       [-17.64174285, -17.5491478 , -16.65484569, -18.12914468],
       [-17.53999577, -14.82364745, -17.75473842, -16.76915042],
       [-13.77701632, -15.8570239 , -13.48483142, -17.68957685],
       [-17.32282071, -13.79366307, -10.94864536, -16.22185247],
       [-18.23736134, -18.52720879, -17.66632354, -17.92425355],
       [-18.50386551, -18.36199495, -18.58138025, -18.45141088],
       [-18.61884328, -17.74119842, -18.14032902, -18.09073347],
       [-18.53101441, -17.86059127, -18.31546796, -18.70789389],
       [-17.98603628, -17.09812781, -17.74893768, -18.19343595],
       [-16.97639474, -16

## SARSA(n)

In [14]:
#Function to learn the Q-value 
def SARSA_update(state, state_n, reward, action, action_n): 
    predict = Q[state, action] 
    target = reward + gamma * Q[state2, action2] 
    
    Q[state, action] = Q[state, action] + alpha * (target - predict) 
    
def SARSA_update(state1, state_n, n, action, action_n):
    predict = Q[state1, action1]
    target = gamma**n * Q[state_n, action_n]
    n -= 1
    for i in range(n):
        state_new, reward, done, info = env.step(action1)
        action_new = choose_action(state_new)
        action1 = action_new
        target += gamma**i * reward
        
    Q[state, action] = Q[state, action] + alpha * (target - predict) 

![title](https://miro.medium.com/max/1400/1*-v5wbqLYCvzrOQE2Zmv05g.png)

In [15]:
def SARSA_n(n):
    Q = np.zeros((env.observation_space.n, env.action_space.n)) 
    #Initializing the reward  
    total_reward=0
    max_reward = 0

    # Starting the SARSA learning 
    for episode in range(episodeNum): 
        t = 0
        tau = 0
        T = sys.maxsize
        stored_actions = {}
        stored_rewards = {}
        stored_states = {}
        reward_episode = 0
        
        stored_states[0] = state = env.reset()
        stored_actions[0] = choose_action(state, Q)
        
        
        while (tau < T - 1):
            if (t < T):
                action_t = choose_action(state=stored_states[t % n], Q=Q)
                stored_states[(t+1) % n], stored_rewards[(t+1) % n], done, info = env.step(action_t) 
                state_t = stored_states[(t+1) % n]
                
                reward_episode += stored_rewards[(t+1) % n]
                total_reward += stored_rewards[(t+1) % n]
                
                if (done):
                    T = t + 1
                else:
                    stored_actions[(t+1) % n] = choose_action(stored_states[t % n], Q)
                
                    
            tau = t - n + 1
            G = 0
            if (tau >= 0):
                for i in range(tau+1, min(tau+n, T)+1):
                    G += gamma ** (i - tau - 1) * stored_rewards[i % n] 
                if (tau + n < T):
                    G += gamma ** n * Q[stored_states[(tau + n) % n]][stored_actions[(tau + n) % n]]
                current_Q = Q[stored_states[tau % n]][stored_actions[tau % n]]
                Q[stored_states[tau % n]][stored_actions[tau % n]] += alpha * (G - current_Q)
                
            t += 1
                
        max_reward = max(reward_episode, max_reward)
        
    return Q, max_reward, total_reward/episodeNum
                    

In [24]:
Q, max_reward, average_reward = SARSA_n(10)

  stored_actions[0] = choose_action(state, Q)
  action_t = choose_action(state=stored_states[t % n], Q=Q)
  stored_actions[(t+1) % n] = choose_action(stored_states[t % n], Q)


In [25]:
print(Q)

[[-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -19.99999999 -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-19.99999997 -20.         -20.         -19.99999996]
 [-19.53165902 -16.22968511 -18.36298961   0.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.    

## Q-leanring

![title](https://www.cse.unsw.edu.au/~cs9417ml/RL1/images/qalg.gif)

In [17]:
def Q_learning():
    Q = np.zeros((env.observation_space.n, env.action_space.n)) 
    policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n)
    for episde in range(episodeNum):
        state = env.reset()
        t = 0
        while (t < maxSteps):
            action_probs = policy(state)
            # choose action according to  
            # the probability distribution 
            action = np.random.choice(np.arange(len(action_probs)), p = action_probs) 
            
            next_state, reward, done, info = env.step(action)
            
            next_best_action = np.argmax(Q[next_state, :])
            target = reward + gamma * Q[next_state, next_best_action]
            Q[state, action] += alpha * (target - Q[state, action])
            
            if (done):
                break
            
            state = next_state
            t += 1

## n-step Q-Learning

In [18]:
def Q_learning_n(n: int):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n)
    #Initializing the reward  
    total_reward=0
    max_reward = 0

    # Starting the SARSA learning 
    for episode in range(episodeNum): 
        t = 0
        tau = 0
        T = sys.maxsize
        stored_actions = {}
        stored_rewards = {}
        stored_states = {}
        reward_episode = 0
        
        stored_states[0] = state = env.reset()
        stored_actions[0] = choose_action(state, Q)
        
        
        while (tau < T - 1):
            if (t < T):
                action_probs = policy(stored_states[t % n])
                action_t = np.random.choice(np.arange(len(action_probs)), p = action_probs) 
            
                
                stored_states[(t+1) % n], stored_rewards[(t+1) % n], done, info = env.step(action_t) 
                state_t = stored_states[(t+1) % n]
                
                reward_episode += stored_rewards[(t+1) % n]
                total_reward += stored_rewards[(t+1) % n]
                
                if (done):
                    T = t + 1
                else:
                    stored_actions[(t+1) % n] = np.argmax(Q[state_t, :])
                
                    
            tau = t - n + 1
            G = 0
            if (tau >= 0):
                for i in range(tau+1, min(tau+n, T)+1):
                    G += gamma ** (i - tau - 1) * stored_rewards[i % n] 

                if (tau + n < T):
                    G += gamma ** n * Q[stored_states[(tau + n) % n]][stored_actions[(tau + n) % n]]
                current_Q = Q[stored_states[tau % n]][stored_actions[tau % n]]
                Q[stored_states[tau % n]][stored_actions[tau % n]] += alpha * (G - current_Q)
                
            t += 1
                
        max_reward = max(reward_episode, max_reward)
        
    return Q, max_reward, total_reward/episodeNum

In [19]:
Q, max_reward, average_reward = Q_learning_n(10)

  stored_actions[0] = choose_action(state, Q)


In [20]:
print(Q)

[[-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-20.         -20.         -20.         -20.        ]
 [-19.99999925 -19.99999887 -19.99999887 -19.99999887]
 [-19.53165902 -19.28923628 -19.28923628 -19.28923628]
 [-20.         -20.         -20.         -20.        ]
 [-20.    