

> Import libraries to use



In [1]:
import numpy as np

>  # Introduction to numpy (Skip if you already are familiar)

>> Creating a 1D array

In [2]:
a = np.array([1,2,3,4])
print(a)

[1 2 3 4]


>> Creating a 2D array


In [3]:
a = np.array([[1,2],[3,4]])
print(a)

[[1 2]
 [3 4]]


>> Creating an array full of zeros


In [4]:
a = np.zeros(shape=(10))
print(a)
a = np.zeros(shape=(5,2))
print(a)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


>> Infinity in numpy

In [5]:
print(np.inf)

inf


>> Max and Argmax

In [6]:
a = np.array([2,1,4,3])
print(np.max(a))
print(np.argmax(a))

4
2


>> From list to Numpy

In [7]:
l = [1,2,3,4]
print(l)
print(np.asarray(l))

[1, 2, 3, 4]
[1 2 3 4]


>> Random in numpy

In [8]:
# Array of Random integers ranging from 1 to 10 (with any size you want)
a = np.random.randint(low=1, high=10, size=(5,2))
print(a)

# Array of random elements of a list with any size you want
a = np.random.choice([0,1,2], size=(2,))

[[4 9]
 [8 8]
 [9 6]
 [4 7]
 [9 3]]


>> Shapes in numpy

In [9]:
a = np.random.randint(low=1, high=5, size=(4,2))
print(a.shape)
print(a)

# Reshape a to a vector of shape = (8,1)
a = a.reshape((8,1))
print(a.shape)
print(a)

(4, 2)
[[4 1]
 [2 3]
 [4 2]
 [4 3]]
(8, 1)
[[4]
 [1]
 [2]
 [3]
 [4]
 [2]
 [4]
 [3]]


# Pre-defined utilities

In [10]:

int_to_char = {
    0 : 'u',
    1 : 'r',
    2 : 'd',
    3 : 'l'
}

policy_one_step_look_ahead = {
    0 : [-1,0],
    1 : [0,1],
    2 : [1,0],
    3 : [0,-1]
}

def policy_int_to_char(pi,n):

    pi_char = ['']

    for i in range(n):
        for j in range(n):

            if i == 0 and j == 0 or i == n-1 and j == n-1:

                continue

            pi_char.append(int_to_char[pi[i,j]])

    pi_char.append('')

    return np.asarray(pi_char).reshape(n,n)

# 1- Policy evaluation

In [11]:
def policy_evaluation(n, pi, v, Gamma, threshhold):
    new_values = v.copy()  # Crée une copie des valeurs existantes

    for row in range(n):
        for col in range(n):
            # Ignorer les états terminaux
            if (row == 0 and col == 0) or (row == n - 1 and col == n - 1):
                continue

            current_value = v[row, col]
            action = pi[row, col]

            # Calcul des indices du prochain état
            next_row = row + policy_one_step_look_ahead[action][0]
            next_col = col + policy_one_step_look_ahead[action][1]

            # Vérification des limites pour éviter les dépassements
            if next_row < 0 or next_row >= n or next_col < 0 or next_col >= n:
                next_row, next_col = row, col

            # Calcul de la nouvelle valeur pour l'état actuel
            reward = -1
            new_values[row, col] = reward + Gamma * v[next_row, next_col]

    return new_values  # Retourne la nouvelle fonction de valeur


# 2- Policy improvement

In [12]:
def policy_improvement(n, pi, v, Gamma):
    """Update the policy to be greedy with respect to the current value function."""
    print("policy impr")
    policy_stable = True
    new_pi = np.copy(pi)
    for i in range(n):
        for j in range(n):
            if pi[i, j] == -1:
                continue  # skip terminal states
            action_values = []
            for action in range(4):
                di, dj = policy_one_step_look_ahead[action]
                next_i = max(0, min(i + di, n - 1))
                next_j = max(0, min(j + dj, n - 1))
                reward = -1
                action_value = reward + Gamma * v[next_i, next_j]
                action_values.append(action_value)
            best_action = np.argmax(action_values)
            if best_action != pi[i, j]:
                policy_stable = False
                new_pi[i, j] = best_action
    return new_pi, policy_stable

# 3- Policy Initialization

In [13]:
def policy_initialization(n):
    print("init pi")
    """Initialize a random policy for all states except terminal states."""
    pi = np.random.choice([0, 1, 2, 3], size=(n, n))
    pi[0, 0] = -1  # terminal state
    pi[n-1, n-1] = -1  # terminal state
    return pi

# 4- Policy Iteration algorithm

In [14]:
def policy_iteration(n, Gamma, threshhold):
    pi = policy_initialization(n=n)
    v = np.zeros((n, n))
    while True:
        print("iteration pi",pi)
        v = policy_evaluation(n=n, pi=pi, v=v, Gamma=Gamma, threshhold=threshhold)
        pi, policy_stable = policy_improvement(n=n, pi=pi, v=v, Gamma=Gamma)
        if policy_stable:
            break
    return pi, v

# Main Code to Test

In [15]:
n = 4

Gamma = [0.8,0.9,1]

threshhold = 1e-4

for _gamma in Gamma:
    pi, v = policy_iteration(n=n, Gamma=_gamma, threshhold=threshhold)
    pi_char = policy_int_to_char(pi=pi, n=n)
    print(f"\nGamma = {_gamma}\n")
    print(pi_char)
    print("\n", v)





init pi
iteration pi [[-1  0  0  0]
 [ 1  0  0  2]
 [ 3  3  3  1]
 [ 3  2  2 -1]]
policy impr
iteration pi [[-1  3  0  0]
 [ 0  0  0  0]
 [ 0  0  0  2]
 [ 0  0  1 -1]]
policy impr
iteration pi [[-1  3  3  0]
 [ 0  0  0  2]
 [ 0  0  1  2]
 [ 0  1  1 -1]]
policy impr
iteration pi [[-1  3  3  2]
 [ 0  0  0  2]
 [ 0  0  1  2]
 [ 0  1  1 -1]]
policy impr

Gamma = 0.8

[['' 'l' 'l' 'd']
 ['u' 'u' 'u' 'd']
 ['u' 'u' 'r' 'd']
 ['u' 'r' 'r' '']]

 [[ 0.   -1.   -1.8  -2.44]
 [-1.   -1.8  -2.44 -1.8 ]
 [-1.8  -2.44 -1.8  -1.  ]
 [-2.44 -1.8  -1.    0.  ]]
init pi
iteration pi [[-1  2  1  3]
 [ 2  1  2  3]
 [ 1  3  2  1]
 [ 1  3  2 -1]]
policy impr
iteration pi [[-1  3  0  0]
 [ 0  0  0  0]
 [ 0  0  0  2]
 [ 0  0  1 -1]]
policy impr
iteration pi [[-1  3  3  0]
 [ 0  0  0  2]
 [ 0  0  1  2]
 [ 0  1  1 -1]]
policy impr
iteration pi [[-1  3  3  2]
 [ 0  0  0  2]
 [ 0  0  1  2]
 [ 0  1  1 -1]]
policy impr

Gamma = 0.9

[['' 'l' 'l' 'd']
 ['u' 'u' 'u' 'd']
 ['u' 'u' 'r' 'd']
 ['u' 'r' 'r' '']]

 [[ 0.