In [2]:
#Library for environments
import gym
from gym.envs.registration import register
from gym.envs.toy_text.frozen_lake import generate_random_map

#Librairies to represent the output
from IPython.display import clear_output
import time
import matplotlib.pyplot as plt

#Essential libraries for computation
import numpy as np
import random
import tensorflow.compat.v1 as tf
from collections import deque

In [3]:
# TensorFlow config to GPU
print(tf.__version__)

gpus = tf.config.list_physical_devices('GPU')
if gpus: 
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=15292)]
    )

logical_gpus = tf.config.list_logical_devices('GPU')
print(logical_gpus)
print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")


from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

print()
print()

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.6.0
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
1 Physical GPU, 1 Logical GPUs
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7179976304974824060
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 16034824192
locality {
  bus_id: 1
  links {
  }
}
incarnation: 3933162300187828700
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5"
]


Num GPUs Available:  1


In [19]:
random_map = generate_random_map(size=8, p=0.3)
environment = gym.make("FrozenLake-v1", is_slippery=False, desc=random_map)
environment.reset()
environment.render()


[41mS[0mFFHHHHF
FHFHHHHH
HHFFHFHH
HHHFHFFF
HHHFFFHH
HHHFHHHH
FHHFFHHF
HHHHFFFG


In [20]:
gamma = 0.99 # discount factor
theta = 0.000001 # threshold for convergence

def argmax(env, V, pi, action,s, gamma):
    e = np.zeros(env.env.nA)
    for a in range(env.env.nA):                         # iterate for every action possible 
        q=0
        P = np.array(env.env.P[s][a])                   
        (x,y) = np.shape(P)                             # for Bellman Equation 
        
        for i in range(x):                              # iterate for every possible states
            s_= int(P[i][1])                            # S' - Sprime - possible succesor states
            p = P[i][0]                                 # Transition Probability P(s'|s,a) 
            r = P[i][2]                                 # Reward
            
            q += p*(r+gamma*V[s_])                      # calculate action_ value q(s|a)
            e[a] = q
            
    m = np.argmax(e) 
    action[s]=m                                           # Take index which has maximum value 
    pi[s][m] = 1                                        # update pi(a|s) 

    return pi

In [21]:
def bellman_optimality_update(env, V, s, gamma):  # update the stae_value V[s] by taking 
    pi = np.zeros((env.env.nS, env.env.nA))       # action which maximizes current value
    e = np.zeros(env.env.nA)                       
                                            # STEP1: Find 
    for a in range(env.env.nA):             
        q=0                                 # iterate for all possible action
        P = np.array(env.env.P[s][a])
        (x,y) = np.shape(P)
        
        for i in range(x):
            s_= int(P[i][1])
            p = P[i][0]
            r = P[i][2]
            q += p*(r+gamma*V[s_])
            e[a] = q
            
    m = np.argmax(e)
    pi[s][m] = 1
    
    value = 0
    for a in range(env.env.nA):
        u = 0
        P = np.array(env.env.P[s][a])
        (x,y) = np.shape(P)
        for i in range(x):
            
            s_= int(P[i][1])
            p = P[i][0]
            r = P[i][2]
            
            u += p*(r+gamma*V[s_])
            
        value += pi[s,a] * u
  
    V[s]=value
    return V[s]

In [22]:
def value_iteration(env, gamma, theta):
    V = np.zeros(env.env.nS)                                       # initialize v(0) to arbitory value, my case "zeros"
    while True:
        delta = 0
        for s in range(env.env.nS):                       # iterate for all states
            v = V[s]
            bellman_optimality_update(env, V, s, gamma)   # update state_value with bellman_optimality_update
            delta = max(delta, abs(v - V[s]))             # assign the change in value per iteration to delta  
        if delta < theta:                                       
            break                                         # if change gets to negligible 
                                                          # --> converged to optimal value         
    pi = np.zeros((env.env.nS, env.env.nA)) 
    action = np.zeros((env.env.nS))
    for s in range(env.env.nS):
        pi = argmax(env, V, pi,action, s, gamma)         # extract optimal policy using action value 
        
    return V, pi,action                                          # optimal value funtion, optimal policy

In [23]:
V, pi, action = value_iteration(environment, gamma, theta)
V

array([0.87752102, 0.88638487, 0.89533825, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.86874581, 0.        ,
       0.90438208, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.91351725, 0.92274469,
       0.        , 0.90438208, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.93206535, 0.        , 0.91351725,
       0.90438208, 0.89533825, 0.        , 0.        , 0.        ,
       0.94148015, 0.93206535, 0.92274469, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.95099005, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.96059601, 0.970299  , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.9801    , 0.99      , 1.        , 0.        ])

In [24]:
a= np.reshape(action,(8,8))
print(a)                          # discrete action to take in given state

[[2. 2. 1. 0. 0. 0. 0. 0.]
 [3. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 2. 1. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 2. 1. 0. 0. 1.]
 [0. 0. 0. 0. 2. 2. 2. 0.]]


In [38]:
e=0
for i_episode in range(100):
    c = environment.reset()
    reward = 0
    done = False
    for t in range(10000):
        c, reward, done, info = environment.step(action[c])
        # print(reward)
        if done:
            if reward == 1:
                e +=1
                reward = 0
                done = False
            break
print(e)
print(" agent succeeded to reach goal {} out of 100 Episodes using this policy ".format(e+1))
environment.close()

100
 agent succeeded to reach goal 101 out of 100 Episodes using this policy 
