[View in Colaboratory](https://colab.research.google.com/github/AvivSham/Reinforcement-Learning/blob/master/Solving_FrozenLake_using_Value&Policy_Iterations.ipynb)

# Solving FrozenLake Env using Value and Policy Iteration Algorithms
#### You can import this code to solve other OpenAI Gym environments - do not forget to change the name of the environment!
---

In [0]:
#@title Installing OpenAI gym & importing dependencies
!pip install gym
from time import time
import numpy as np
import gym

In [0]:
#@title Evaluate the policy efficiency
def runPolicy(env, policy):
  """Play a single episode by following `policy` and return its total reward.

  Args:
    env: object providing `reset()` (returns the initial state) and
      `step(action)` (returns a `(state, reward, done, info)` 4-tuple).
    policy: sequence indexed by state whose entries are action indices.

  Returns:
    The sum of the rewards collected until `step` reports `done`.
  """
  state = env.reset()
  done = False
  totalReward = 0

  while not done:
    # Policies stored in float arrays (the NumPy default dtype) hold actions
    # such as 2.0; cast so the environment always receives a plain int.
    action = int(policy[state])
    state, reward, done, _ = env.step(action)
    totalReward += reward

  return totalReward

def evaluatePolicy(env, policy, iterations):
  """Return the mean episode reward of `policy` over `iterations` runs.

  Each run is one call to `runPolicy(env, policy)`; the results are summed
  and divided by `iterations`.
  """
  episodeRewards = (runPolicy(env, policy) for _ in range(iterations))
  return sum(episodeRewards) / iterations

In [0]:
#@title Common functions for value iteration and policy iteration - calculating both policies and state values
# Convergence threshold used by computeStateValues: the sweep loop stops once
# the summed absolute change of all state values is at most this number.
eps = 1e-10

def constructGridPolicy(env, values, gamma):
  """Build the greedy policy with respect to the given state values.

  Args:
    env: wrapper whose `env.env` exposes `nS` (number of states), `nA`
      (number of actions) and `P`, where `P[s][a]` iterates over
      `(probability, next_state, reward, _)` tuples.
    values: sequence indexed by state holding the current state values.
    gamma: discount factor applied to the value of the next state.

  Returns:
    Integer NumPy array of length `nS`; entry `s` is the action with the
    highest expected one-step return (first one wins on ties, per argmax).
  """
  model = env.env
  # Integer dtype: the entries are action indices, so they must stay usable
  # as indices / discrete actions instead of becoming floats like 2.0.
  policy = np.zeros(model.nS, dtype=int)
  for s in range(model.nS):
    returns = [sum(p * (r + gamma * values[ns])
                   for p, ns, r, _ in model.P[s][a])
               for a in range(model.nA)]
    policy[s] = np.argmax(returns)

  return policy
    

def computeStateValues(env, gamma, policy = None, selectBest = False, tolerance = None):
  """Iterate the Bellman backup until the state values stop changing.

  Exactly one of `policy` / `selectBest` must be supplied:
    * `policy` given: each state is backed up with the action `policy[s]`.
    * `selectBest=True`: each state is backed up with the maximum over all
      actions.

  Args:
    env: wrapper whose `env.env` exposes `nS`, `nA` and `P`, where `P[s][a]`
      iterates over `(probability, next_state, reward, _)` tuples.
    gamma: discount factor applied to the value of the next state.
    policy: optional sequence indexed by state holding the action to take.
    selectBest: when True, use the best action per state instead of a policy.
    tolerance: stop once the summed absolute change between two sweeps is at
      most this; when None the module-level `eps` is used.

  Returns:
    NumPy array of length `nS` with the converged state values.

  Raises:
    ValueError: if neither or both of `policy` and `selectBest` are given.
  """
  # Raising a bare string is not valid in Python 3 (it surfaces as an
  # unrelated TypeError), so wrap the messages in a proper exception.
  if policy is None and not selectBest:
    raise ValueError('When running computeStateValues specifying policy or selectBest = True is necessary')
  if policy is not None and selectBest:
    raise ValueError('You cannot use policy and selectBest at the same time')

  if tolerance is None:
    tolerance = eps

  model = env.env
  values = np.zeros(model.nS)
  while True:
    # Every state is updated from the previous sweep's values.
    nextValues = values.copy()
    for s in range(model.nS):
      if policy is not None:
        # Bellman expectation backup for the action the policy prescribes
        nextValues[s] = sum(p * (r + gamma * values[ns])
                            for p, ns, r, _ in model.P[s][policy[s]])
      else:
        # Bellman optimality backup: best action per state
        nextValues[s] = max(sum(p * (r + gamma * values[ns])
                                for p, ns, r, _ in model.P[s][a])
                            for a in range(model.nA))
    diff = np.fabs(nextValues - values).sum()
    values = nextValues
    if diff <= tolerance:
      break

  return values

In [0]:
#@title Value-iteration algorithm implementation
def valueIteration (env, gamma):
  """Return the greedy policy derived from best-action state values.

  Runs `computeStateValues` with `selectBest=True` and passes the resulting
  values to `constructGridPolicy`, using the same `gamma` for both steps.
  """
  optimalValues = computeStateValues(env, gamma, selectBest=True)
  return constructGridPolicy(env, optimalValues, gamma)

In [0]:
#@title Policy-iteration algorithm implementation

# Initializing the policy grid
def initPolicy(env):
  """Return a random starting policy: one action index drawn per state.

  Reads the action count `nA` and state count `nS` from `env.env` and samples
  `nS` values from `range(nA)` with `np.random.choice`.
  """
  model = env.env
  numActions, numStates = model.nA, model.nS
  return np.random.choice(numActions, size=numStates)

# Policy update every iteration
def policyIteration(env, gamma):
  """Alternate policy evaluation and greedy improvement until the policy is stable.

  Starts from the random policy returned by `initPolicy`, then repeats:
    1. evaluate the *current* policy with `computeStateValues(policy=...)`;
    2. build the greedy policy for those values with `constructGridPolicy`;
  and stops when the greedy policy equals the current one.

  Args:
    env: environment wrapper passed through to the helper functions.
    gamma: discount factor used for both evaluation and improvement.

  Returns:
    The first policy that is unchanged by a greedy improvement step.
  """
  policy = initPolicy(env)
  while True:
    # Policy evaluation must use the policy being improved; evaluating with
    # selectBest=True would ignore it and silently turn this into value
    # iteration run twice.
    stateValues = computeStateValues(env, gamma, policy = policy)
    # Policy improvement: act greedily with respect to those values.
    nextPolicy = constructGridPolicy(env, stateValues, gamma)
    if np.all(policy == nextPolicy):
      break

    policy = nextPolicy

  return policy

In [0]:
#@title Environment solving function
def solveEnv (env, methods, envName, evaluateIterations = None):
  """Train a policy with each method, then print its timing and average reward.

  Args:
    env: environment handed to every solver and to `evaluatePolicy`.
    methods: iterable of `(name, f, gamma)` tuples; `f(env, gamma)` must
      return a policy.
    envName: label printed in the header line.
    evaluateIterations: number of evaluation episodes per policy. When None,
      a module-level `evaluateIterations` is used if one exists, else 1000.
  """
  if evaluateIterations is None:
    # The notebook's main cell sets this as a global; keep honouring it, but
    # do not fail with NameError when the functions are imported on their own.
    evaluateIterations = globals().get('evaluateIterations', 1000)

  print(f'Solving environment {envName}')
  for name, f, gamma in methods:
    startTime = time()
    policy = f(env, gamma)
    endTime = time()
    print(f'It took {endTime - startTime} to train the policy with {name} algorithm , Gamma = {gamma}')

    policyScore = evaluatePolicy(env, policy, evaluateIterations)
    print(f'The averaged policy reward is: {policyScore}')

In [19]:
#@title Main code - solving FrozenLake 4x4

# You can import the code and use the functions without the main code part

if __name__ == '__main__':

  # Number of episodes used to average each trained policy's reward
  evaluateIterations = 1000
  # (label, solver, gamma) triples consumed by solveEnv
  methods = [
    ('Value Iteration', valueIteration, 0.9),
    ('Policy Iteration', policyIteration, 0.9),
    ('Value Iteration', valueIteration, 0.98),
    ('Policy Iteration', policyIteration, 0.98),
    ('Value Iteration', valueIteration, 1),
    ('Policy Iteration', policyIteration, 1),
  ]

  # Kept inside the guard: these lines need `methods`, which only exists
  # when the file is run as the main program.
  frozenLake4x4 = gym.make('FrozenLake-v0')
  solveEnv(frozenLake4x4, methods, 'Frozen Lake 4x4')

Solving environment Frozen Lake 4x4
It took 0.025897502899169922 to train the policy with Value Iteration algorithm , Gamma = 0.9
The averaged policy reward is: 0.749
It took 0.05134749412536621 to train the policy with Policy Iteration algorithm , Gamma = 0.9
The averaged policy reward is: 0.754
It took 0.07741785049438477 to train the policy with Value Iteration algorithm , Gamma = 0.98
The averaged policy reward is: 0.741
It took 0.14547014236450195 to train the policy with Policy Iteration algorithm , Gamma = 0.98
The averaged policy reward is: 0.749
It took 0.15343689918518066 to train the policy with Value Iteration algorithm , Gamma = 1
The averaged policy reward is: 0.723
It took 0.3515298366546631 to train the policy with Policy Iteration algorithm , Gamma = 1
The averaged policy reward is: 0.741


In [0]:
#@title Compare to FrozenLake 8x8

if __name__ == '__main__':
  # Reuses `methods` from the main cell, which is only defined under the same
  # guard, so this cell is guarded as well.
  frozenLake8x8 = gym.make('FrozenLake8x8-v0')
  # Pass the variable created above (the original referenced an undefined,
  # differently-capitalised name).
  solveEnv(frozenLake8x8, methods, 'Frozen Lake 8x8')