In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym

#### Creating a class for Agent

In [2]:
class Agent(object):
    """Tabular agent for an environment with discrete state and action spaces.

    Stores the environment handle, the learning hyper-parameters, a Q-table
    initialised to zeros and two lists used to log training progress.
    """

    def __init__(self, env, gamma=0.98, alpha=0.5):
        """Initialise the agent's parameters.

        Args:
            env: Environment exposing ``observation_space.n`` and
                ``action_space.n``.
            gamma: Discount factor used to take future rewards into account.
            alpha: Learning rate used to update the Q-table.
        """
        self.env = env

        # Number of possible states and number of possible actions.
        self.state_size = env.observation_space.n
        self.action_size = env.action_space.n

        # Discount factor and learning rate.
        self.gamma, self.alpha = gamma, alpha

        # Array of the possible actions the agent can take: 0 .. action_size-1.
        self.action_space = np.arange(self.action_size)

        # Q-table with one row per state and one column per action.
        self.qtable = np.zeros((self.state_size, self.action_size))

        # Training logs: average reward per episode and length of each episode.
        self.reward_history, self.episode_lengths = [], []

#### Epsilon Greedy Policy

![image.png](attachment:image.png)

In [3]:
#Using the epsilon greedy policy to choose the most appropriate action from the action space
def epsilon_greedy(self,state,epsilon=0.2):
    """Sample an action for ``state`` from an epsilon-greedy distribution.

    Every action gets probability ``epsilon / action_size``; the action with
    the largest Q-value in row ``state`` of the Q-table gets an extra
    ``1 - epsilon`` on top of that.
    """
    # Exploration mass, shared equally by all actions.
    probabilities = np.full(self.action_size, epsilon/self.action_size, dtype=float)
    # Remaining mass goes to the greedy action (np.argmax picks the first index on ties).
    best_action = np.argmax(self.qtable[state,:])
    probabilities[best_action] += 1-epsilon
    return np.random.choice(self.action_space,p=probabilities)

#### Discounted Rewards

In [4]:
#discount rewards of an entire episode
def discounted_rewards(self,rewards):
    """Turn the per-step rewards of one episode into discounted returns.

    ``rewards`` is a 1-D sequence holding the reward obtained at each timestep.
    Entry ``t`` of the result equals ``rewards[t]`` plus ``gamma`` times entry
    ``t + 1`` (the last entry is just the last reward).
    """
    n_steps = len(rewards)
    returns = np.zeros((n_steps))
    running_return = 0
    # Walk backwards so each step can reuse the return of the step after it.
    step = n_steps - 1
    while step >= 0:
        running_return = self.gamma*running_return + rewards[step]
        returns[step] = running_return
        step -= 1
    return returns

#### Updating Q_Table

In [5]:
#Updating the Q-table entry for a visited state-action pair (train() applies this at every timestep of an episode)
def update_qtable(self,state,action,reward_discounted):
    """Move ``qtable[state, action]`` a fraction ``alpha`` of the way toward ``reward_discounted``."""
    current_estimate = self.qtable[state,action]
    # Gap between the observed discounted return and the current estimate.
    error = reward_discounted - current_estimate
    self.qtable[state,action] = current_estimate + self.alpha*error

#### Defining Monte Carlo Method

![MonteCarlo.PNG](attachment:MonteCarlo.PNG)

In [6]:
def monte_carlo_episode(self):
    """Play one complete episode with the epsilon-greedy policy and record it.

    The environment is reset, then actions chosen by ``self.epsilon_greedy``
    are applied until the environment reports the episode is over.

    Returns:
        numpy.ndarray with one row per timestep; row ``t`` holds
        ``[state, action, reward]`` for that timestep.
    """
    episode=[]
    #Starting from the initial state
    reset_result=self.env.reset()
    #Newer gym releases return (observation, info) from reset(); older ones return only the observation
    state_now=reset_result[0] if isinstance(reset_result,tuple) else reset_result
    while True:
        #Choosing the action as per the constant epsilon greedy policy
        action=self.epsilon_greedy(state_now)
        step_result=self.env.step(action)
        if len(step_result)==5:
            #Newer gym API: (obs, reward, terminated, truncated, info)
            state_next,reward,terminated,truncated,_=step_result
            done=terminated or truncated
        else:
            #Older gym API: (obs, reward, done, info)
            state_next,reward,done,_=step_result
        episode.append([state_now,action,reward])
        state_now=state_next
        #If the episode is over, stop; otherwise continue from the next state
        if done:
            break
    #Returning the buffer only after the loop, so it covers every timestep of the episode
    return np.array(episode)


#### Function to Train the Agent 

In [7]:
#Training the agent through a series of episodes
def train(self,num_episodes=10000,render=True):
    """Train the agent by running episodes and updating the Q-table after each one.

    For every episode: generate it with ``self.monte_carlo_episode``, convert
    its rewards to discounted returns, update the Q-table entry of every
    (state, action) pair in the episode, then log the episode's length and
    mean reward. A progress message is printed every 100 episodes.

    Args:
        num_episodes: Number of episodes to run.
        render: When true (the default), call ``self.env.render()`` at the
            start of every episode. Pass ``False`` to silence the rendering.
    """
    #Iterating over a number of episodes
    for i in range(num_episodes):
        #Rendering the agent's own environment, not a module-level global
        if render:
            self.env.render()
        #Generating the episode
        episode=self.monte_carlo_episode()
        #Keeping track of the length of the episode
        self.episode_lengths.append(len(episode[:,0]))
        #Storing the rewards for the episode, and discounting them
        rewards=episode[:,2].copy()
        rewards=self.discounted_rewards(rewards)
        #Updating only the Q-values of the (state, action) pairs seen in the episode
        for k in range(len(episode[:,0])):
            self.update_qtable(int(episode[k,0]),int(episode[k,1]),float(rewards[k]))
        #Logging once per episode (not once per timestep)
        self.reward_history.append(np.mean(episode[:,2]))
        if (i+1)%100==0:
            print("Average reward claimed by the agent in episode {} : {}".format(i+1,self.reward_history[-1]))
            print("Length of episode {} : {}".format(i+1,self.episode_lengths[-1]))

## Combining all these methods

In [8]:
class Agent(object):
    """Tabular Monte Carlo agent for environments with discrete spaces.

    The agent generates whole episodes with an epsilon-greedy policy and, after
    each episode, moves the Q-value of every (state, action) pair in it toward
    the discounted return observed from that timestep onward.
    """

    def __init__(self,env,gamma=0.98,alpha=0.5):
        """Initialise the agent's parameters.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n``, ``reset()``, ``step(action)`` and
                ``render()``.
            gamma: Discount factor used to take future rewards into account.
            alpha: Learning rate used to update the Q-table.
        """
        self.env=env
        # Number of possible states / actions.
        self.state_size=env.observation_space.n
        self.action_size=env.action_space.n
        self.gamma=gamma
        self.alpha=alpha
        # Array of the possible actions: 0 .. action_size-1.
        self.action_space=np.arange(self.action_size)
        # One row per state, one column per action, initialised to zero.
        self.qtable=np.zeros((self.state_size,self.action_size))
        # Training logs: mean reward and length of each episode.
        self.reward_history=[]
        self.episode_lengths=[]


    def epsilon_greedy(self,state,epsilon=0.2):
        """Sample an action for ``state`` from an epsilon-greedy distribution.

        Every action gets probability ``epsilon / action_size``; the action
        with the largest Q-value for ``state`` gets an extra ``1 - epsilon``.
        """
        qvalues=self.qtable[state,:]
        A=np.zeros((self.action_size)) + epsilon/self.action_size
        # np.argmax returns the first maximal index when several actions tie.
        greedy_action=np.argmax(qvalues)
        A[greedy_action]+=1-epsilon
        return np.random.choice(self.action_space,p=A)


    def discounted_rewards(self,rewards):
        """Return the discounted return at every timestep of an episode.

        ``rewards`` is a 1-D sequence of the rewards obtained at each timestep;
        entry ``t`` of the result is ``rewards[t] + gamma * result[t+1]``.
        """
        current_reward=0
        discounted_rewards=np.zeros((len(rewards)))
        # Accumulate from the last timestep backwards.
        for t in reversed(range(len(rewards))):
            current_reward = self.gamma*current_reward + rewards[t]
            discounted_rewards[t]=current_reward
        return discounted_rewards

    def update_qtable(self,state,action,reward_discounted):
        """Move ``qtable[state, action]`` a fraction ``alpha`` toward ``reward_discounted``."""
        self.qtable[state,action]+=self.alpha*(reward_discounted-self.qtable[state,action])


    def _reset_env(self):
        """Reset the environment and return only the initial observation."""
        result=self.env.reset()
        # Newer gym releases return (observation, info); older ones return the observation.
        return result[0] if isinstance(result,tuple) else result

    def _step_env(self,action):
        """Apply ``action`` and return ``(next_state, reward, done)``."""
        result=self.env.step(action)
        if len(result)==5:
            # Newer gym API: (obs, reward, terminated, truncated, info).
            state_next,reward,terminated,truncated,_=result
            return state_next,reward,(terminated or truncated)
        # Older gym API: (obs, reward, done, info).
        state_next,reward,done,_=result
        return state_next,reward,done

    def monte_carlo_episode(self):
        """Play one complete episode and return it as an array.

        Returns:
            numpy.ndarray with one ``[state, action, reward]`` row per timestep.
        """
        episode=[]
        state_now=self._reset_env()
        while True:
            action=self.epsilon_greedy(state_now)
            state_next,reward,done=self._step_env(action)
            episode.append([state_now,action,reward])
            state_now=state_next
            if done:
                break
        return np.array(episode)


    def train(self,num_episodes=10000,render=True):
        """Train the agent over ``num_episodes`` episodes.

        After each episode the Q-table is updated for every timestep, and the
        episode's length and mean reward are appended to the logs. A progress
        message is printed every 100 episodes.

        Args:
            num_episodes: Number of episodes to run.
            render: When true (the default), call ``self.env.render()`` at the
                start of every episode. Pass ``False`` to silence the rendering.
        """
        for i in range(num_episodes):
            # Render the agent's own environment rather than a module-level global.
            if render:
                self.env.render()
            episode=self.monte_carlo_episode()
            self.episode_lengths.append(len(episode[:,0]))
            rewards=episode[:,2].copy()
            rewards=self.discounted_rewards(rewards)
            for k in range(len(episode[:,0])):
                self.update_qtable(int(episode[k,0]),int(episode[k,1]),float(rewards[k]))
            self.reward_history.append(np.mean(episode[:,2]))
            if (i+1)%100==0:
                print("Average reward claimed by the agent in episode {} : {}".format(i+1,self.reward_history[-1]))
                print("Length of episode {} : {}".format(i+1,self.episode_lengths[-1]))


In [9]:
# Create the FrozenLake-v1 environment, build an agent around it with the
# default gamma/alpha, and train with the default number of episodes.
# `env` is kept as a module-level name on purpose: Agent.train() as originally
# written calls the global `env.render()`.
env=gym.make("FrozenLake-v1")
agent=Agent(env=env)
agent.train()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FF

SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0m

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
Average reward claimed by the agent in episode 1100 : 0.0
Length of episode 1100 : 3
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFF

SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFF[41mH[0m
HFFG
Average reward claimed by the agent in episode 1400 : 0.0
Length of episode 1400 : 49
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F

SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Average reward claimed by the agent in episode 1700 : 0.0
Length of episode 1700 : 21
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)


SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFF

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHF

SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Average reward claimed by the agent in episode 3300 : 0.0
Length of episode 3300 : 19
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SF

SFFF
F[41mH[0mFH
FFFH
HFFG
Average reward claimed by the agent in episode 3600 : 0.0
Length of episode 3600 : 4
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Righ

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFF

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
Average reward claimed by the agent in episode 4900 : 0.0
Length of episode 4900 : 5
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HF

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF

SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
Average reward claimed by the agent in episode 5500 : 0.0
Length of episode 5500 : 13
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  

SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (

SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFF

Average reward claimed by the agent in episode 6300 : 0.0
Length of episode 6300 : 17
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
 

SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
Average reward claimed by the agent in episode 7600 : 0.0
Length of episode 7600 : 9
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
S

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
Average reward claimed by the agent in episode 7900 : 0.0
Length of episode 7900 : 2
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)


Average reward claimed by the agent in episode 8200 : 0.0
Length of episode 8200 : 8
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HF

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
Average reward claimed by the agent in episode 8500 : 0.0
Length of episode 8500 : 7
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHFH


SFFF
FHFH
FFFH
HFF[41mG[0m
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFF

SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0

SFFF
FHF[41mH[0m
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Average reward claimed by the agent in episode 9800 : 0.0
Length of episode 9800 : 10
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Left)
S