# Set up

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

# Shared CliffWalking environment instance; the cells below read it as the global `env`.
# NOTE(review): the later cells unpack four values from env.step() and use env.nS / env.P,
# which presumably requires a classic (pre-0.26) gym release -- confirm the installed version.
env = gym.make('CliffWalking-v0')

In [4]:
#begin a fresh episode and remember the initial state
state = env.reset()
#apply at most five randomly sampled actions
for _step in range(5):
  #show where the agent currently stands
  print(f"Current state, {state}")
  env.render()
  #sample a random action from the action space
  action = env.action_space.sample()
  #advance the environment: next state, reward and termination flag
  state, reward, done, _ = env.step(action)
  #report the transition as three separate lines
  print(
    "Action: {}".format(action),
    "Reward: {}".format(reward),
    "************************************",
    sep="\n",
  )
  #this environment only terminates once the goal is reached
  if done:
    print("Done.")
    break

Current state, 36
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Action: 3
Reward: -1
************************************
Current state, 36
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Action: 0
Reward: -1
************************************
Current state, 24
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

Action: 1
Reward: -1
************************************
Current state, 25
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  x  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

Action: 2
Reward: -100
************************************
Current state, 36
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C

# Defining an agent

The next step is to define a base class for our agents. We will later derive from this class to implement Value Iteration, Policy Iteration and Monte Carlo control agents. The base class itself only provides simple functionality: it acts randomly and can evaluate itself over one episode.

In [None]:
class Agent :
  """Base agent: acts randomly and can measure the return of one episode.

  Subclasses override `act` to implement an actual policy.

  Attributes:
    env: environment the agent interacts with; it must provide `reset()`,
      `step(action)` returning `(state, reward, done, info)` and
      `action_space.sample()`.
    gamma: discount factor (stored for subclasses; unused by this class).
  """

  def __init__(self,env,discount_factor):
    self.env = env
    self.gamma = discount_factor
  
  def act(self, state):
    """Return the action to take in `state`; the base agent picks randomly."""
    return self.env.action_space.sample() #returns a random action

  def evaluate(self, n_steps=1000):
    """Run one episode with the agent's policy and return the summed reward.

    Args:
      n_steps: maximum number of steps before the episode is cut off.

    Returns:
      The undiscounted sum of rewards collected during the episode.
    """
    # use the agent's own environment, not whatever global `env` exists
    state = self.env.reset()
    episode_reward = 0

    for _step in range(n_steps):
      state, reward, done, _info = self.env.step(self.act(state))
      episode_reward += reward
      # stop on the termination flag returned by *this* step
      if done:
        break
    return episode_reward

#sanity check: roll out one episode with the random-action base agent
random_agent = Agent(env, discount_factor=0.99)
episode_reward = random_agent.evaluate()
print("Episode return {}".format(episode_reward))

Episode return -9514


# Value Iteration Agent

In this section you will implement an agent that solves the environment using Value Iteration.

In [13]:
#one zero-initialised value per environment state
stateValue = [0] * env.nS
len(stateValue)

48

In [None]:
class ValueAgent(Agent):
  """Agent that plans with Value Iteration on a tabular environment.

  The environment must expose `env.nS`, `env.nA` and a transition model
  `env.P[state][action]` that is a list of
  `(prob, next_state, reward, done)` tuples.

  The last state index is treated as the terminal state and its value is
  kept at 0.
  NOTE(review): that matches the goal at index 47 in the rendered 4x12 grid;
  confirm before reusing the class with another environment.
  """

  def __init__(self,env,discount_factor,theta):
    super().__init__(env,discount_factor)
    #theta is an approximation error threshold
    self.theta = theta
    self.V = np.random.rand(self.env.observation_space.n)
    #set terminal state to 0
    self.V[-1] = 0 

  @staticmethod
  def _action_values(env, V, state, discount):
    """Return the expected one-step return of every action available in `state`."""
    action_values = []
    for action in range(env.nA):
      action_value = 0
      # sum over every possible outcome so stochastic transitions are handled
      for prob, next_state, reward, _done in env.P[state][action]:
        action_value += prob * (reward + discount * V[next_state])
      action_values.append(action_value)
    return action_values

  def update_V(self, V, state):
    """Apply one Bellman optimality backup to `V[state]`.

    Args:
      V: indexable table of state values; it is modified in place.
      state: index of the state to back up.

    Returns:
      The same `V` object, with `V[state]` updated and `V[-1]` reset to 0.
    """
    V[state] = max(self._action_values(self.env, V, state, self.gamma))
    # the terminal state keeps value 0 even if it was the state just backed up
    V[-1] = 0
    return V

  def learning(self, max_iterations=100000):
    """Sweep over all states until the values change by less than `theta`.

    Args:
      max_iterations: upper bound on the number of full sweeps.

    Returns:
      The learned value table `self.V`.
    """
    for _sweep in range(max_iterations):
      delta = 0.0
      for state in range(self.env.nS):
        old_value = self.V[state]
        self.update_V(self.V, state)
        # track the largest absolute change seen in this sweep
        delta = max(delta, abs(old_value - self.V[state]))
      #condition for the loop to stop
      if delta < self.theta:
        break
    return self.V

  def act(self, state):
    """Return the greedy action for `state` under the current value table."""
    return int(np.argmax(self._action_values(self.env, self.V, state, self.gamma)))

  #define best policy
  @staticmethod
  def get_policy(env,stateValue, lmbda=0.9):
    """Return the greedy action for every state.

    Args:
      env: environment providing `nS`, `nA` and `P`.
      stateValue: table of state values indexed by state.
      lmbda: discount factor used in the one-step lookahead.

    Returns:
      A list with one action index per state.
    """
    policy = []
    for state in range(env.nS):
      action_values = ValueAgent._action_values(env, stateValue, state, lmbda)
      policy.append(int(np.argmax(np.asarray(action_values))))
    return policy 

  #evaluate best policy
  @staticmethod
  def get_score(env, policy, episodes=1000, max_steps=1000):
    """Roll out `policy` and report how the episodes ended.

    An episode counts as a miss when it terminates with reward 0 or when it
    has not terminated after `max_steps` steps; any other terminated episode
    contributes its length to the step list.

    Args:
      env: environment providing `reset()` and `step(action)`.
      policy: sequence mapping each state index to an action.
      episodes: number of episodes to run.
      max_steps: step budget per episode, so a non-terminating policy
        cannot loop forever.

    Returns:
      A tuple `(steps_list, misses)`.
    """
    misses = 0
    steps_list = []
    for _episode in range(episodes):
      observation = env.reset()
      for steps in range(1, max_steps + 1):
        action = policy[observation]
        observation, reward, done, _ = env.step(action)
        if done:
          if reward == 0:
            misses += 1
          else:
            steps_list.append(steps)
          break
      else:
        # step budget exhausted without reaching a terminal state
        misses += 1
    return steps_list, misses

In [None]:
#evaluation
episodes = 10
value_agent = ValueAgent(env,0.99,0.001)
#learning, get_policy and get_score are defined on ValueAgent, not at module level
stateValue = value_agent.learning(max_iterations=100000)
#extract the greedy policy with the same discount the agent was built with
policy = ValueAgent.get_policy(env, stateValue, lmbda=value_agent.gamma)
steps_list, misses = ValueAgent.get_score(env, policy, episodes=episodes)

#only average when at least one episode recorded a step count
if steps_list:
  print('Recorded episodes took an average of {:.0f} steps'.format(np.mean(steps_list)))
print('Missed episodes: {:.2f} % of the time'.format((misses/episodes) * 100))

In [None]:
#from group discussion

class ValueAgent(Agent):
  """Value Iteration agent (version from the group discussion).

  Note: this reuses the name `ValueAgent`, so running this cell replaces any
  class of that name defined earlier in the notebook.

  The environment must expose `env.nS`, `env.nA` and a transition model
  `env.P[state][action]` that is a list of
  `(prob, next_state, reward, done)` tuples. The last state index is treated
  as the terminal state and its value is kept at 0.
  """

  def __init__(self,env,discount_factor,theta):
    super().__init__(env,discount_factor)
    #theta is an approximation error threshold
    self.theta = theta
    self.V = np.random.rand(self.env.observation_space.n)
    #set terminal state to 0
    self.V[-1] = 0 

  def _action_values(self, state):
    """Return the expected one-step return of every action in `state`."""
    values = []
    for action in range(self.env.nA):
      expected = 0.0
      # sum over all outcomes instead of only the first listed transition
      for prob, next_state, reward, done in self.env.P[state][action]:
        # a terminal transition contributes its reward but no future value
        future = 0.0 if done else self.V[next_state]
        expected += prob * (reward + self.gamma * future)
      values.append(expected)
    return values

  def act(self, state): 
    """Return the action with the highest expected return in `state`."""
    #the comparison includes the immediate reward, not only the next state's value
    #no action choice in the terminal state
    return int(np.argmax(self._action_values(state)))

  def iterate(self, max_iterations=None):
    """Run value iteration until the largest change drops below `theta`.

    Args:
      max_iterations: optional cap on the number of sweeps; `None` (the
        default) keeps sweeping until convergence.

    Returns:
      The number of sweeps performed.
    """
    sweeps = 0
    while max_iterations is None or sweeps < max_iterations:
      sweeps += 1
      delta = 0.0
      for state in range(self.env.nS):
        v = self.V[state]
        # Bellman optimality backup: best expected return over all actions
        self.V[state] = max(self._action_values(state))
        # keep the terminal state's value at 0 throughout the sweep
        self.V[-1] = 0
        delta = max(delta, abs(v - self.V[state]))
      if delta < self.theta:
        break
    return sweeps

agent = ValueAgent(env, discount_factor=0.99, theta=0.001)
#run value iteration on the agent's value table
agent.iterate()
#roll out one episode with the trained agent and report its return
episode_reward = agent.evaluate()
print("Episode return {}".format(episode_reward))

# Policy Iteration Agent
Follow the same procedure for implementing a policy iteration agent

In [None]:
#code here

# Monte Carlo control agent
Follow the same procedure for implementing a Monte Carlo control agent

In [None]:
#code here

# Change the environment to be non-deterministic

