In [67]:
import numpy as np
import matplotlib.pyplot as plt
import random 

class GridWorld(object):
    def __init__(self, gridSize, items):
        self.step_reward = -0.02 #chaque pas diminue la reward
        self.m = gridSize[0] #init ligne
        self.n = gridSize[1] #init colone
        self.grid = np.zeros(gridSize) #grille de 0
        self.items = items #ajout des différents items

        self.state_space = list(range(self.m * self.n)) #tout les état possible
        self.state_space.remove(5) #on retire l'état block

        self.action_space = {'U': -self.m, 'D': self.m, 'L': -1, 'R': 1} #direction (pas compirs le -self + self)
        self.actions = ['U', 'D', 'L', 'R']#action up down left right

        self.P = self.int_P() #probabilité de chaque déplacement en fonction de l'état actuel
        #format de P : (State actuel, direction) : (state futur, proba)

    def int_P(self):
        P = {}
        for state in self.state_space:
            for action in self.actions:
                reward = self.step_reward #pas en plus
                n_state = state + self.action_space[action] # nouveau état = état actuel + déplacement
                
                if n_state in self.items.get('fire').get('loc'): #si nouveau état = feu
                    reward += self.items.get('fire').get('reward') #alors reward du feu
                elif n_state in self.items.get('water').get('loc'): #si nouveau état = froid
                    reward += self.items.get('water').get('reward') #alors reward du froid
                elif self.check_move(n_state, state):#vérification du nouveau état
                    n_state = state #nouveau état reste l'état

                #cas foireux pas compris
                if state==4 and action=='L':
                    n_state=4
                    reward=-0.02
                if state==7 and action=='L':
                    n_state=7
                    reward=-0.02    
                #on inscrit dans p
                P[(state ,action)] = (n_state, reward)
        return P

    def check_terminal(self, state):
        return state in self.items.get('fire').get('loc') + self.items.get('water').get('loc')

    def check_move(self, n_state, oldState):
        if n_state not in self.state_space:#si état imposssible alors on reste
            return True
        elif oldState % self.m == 0 and n_state % self.m == self.m - 1: #si on sort alors on reste
            return True
        elif oldState % self.m == self.m - 1 and n_state % self.m == 0: #si on sort alors on reste
            return True
        else: #autre cas ok
            return False
        
    def stochasticMove(self,action,p):
        probas = p[action]
        value=[]
        idx=0
        for prob in probas:
            value.append(( idx, np.random.rand() * prob ))
            idx=idx+1
        idxmax, valuemax=max(value)
        
        action= self.actions[idxmax]
        return action
    

def print_v(v, grid):#dessin des values
    v = np.reshape(v, (grid.n, grid.m))

    cmap = plt.cm.get_cmap('Greens', 100) #malin, mapde vert
    norm = plt.Normalize(v.min(), v.max()) #échelle des v 
    rgba = cmap(norm(v)) #on prend les verts du plus clair au foncé

    for w in grid.items.get('water').get('loc'):
        idx = np.unravel_index(w, v.shape)
        rgba[idx] = 0.0, 0.5, 0.8, 1.0

    for f in grid.items.get('fire').get('loc'):
        idx = np.unravel_index(f, v.shape)
        rgba[idx] = 1.0, 0.5, 0.1, 1.0
        
    for f in grid.items.get('block').get('loc'):
        idx = np.unravel_index(f, v.shape)
        rgba[idx] = 0,0,0,0

    fig, ax = plt.subplots()
    im = ax.imshow(rgba, interpolation='nearest')

    for i in range(v.shape[0]):
        for j in range(v.shape[1]):
            c = 'w'
            if v[i, j] < 0.4: c = 'k'#si  la proba est faible on écrit en noir
            if v[i, j] != 0:
                text = ax.text(j, i, np.round(v[i, j], 2), ha="center", va="center", color=c)

    plt.axis('off')
    # plt.savefig('stochastic_v.jpg', bbox_inches='tight', dpi=200)
    plt.show()


def print_policy(v, policy, grid):
    #shape of v et policy of v
    v = np.reshape(v, (grid.n, grid.m))
    policy = np.reshape(policy, (grid.n, grid.m))
    cmap = plt.cm.get_cmap('Greens', 10)
    norm = plt.Normalize(v.min(), v.max())
    rgba = cmap(norm(v))

    for w in grid.items.get('water').get('loc'):
        idx = np.unravel_index(w, v.shape)
        rgba[idx] = 0.0, 0.5, 0.8, 1.0

    for f in grid.items.get('fire').get('loc'):
        idx = np.unravel_index(f, v.shape)
        rgba[idx] = 1.0, 0.5, 0.1, 1.0
        
    for f in grid.items.get('block').get('loc'):
        idx = np.unravel_index(f, v.shape)
        rgba[idx] = 0,0,0,0

    fig, ax = plt.subplots()
    im = ax.imshow(rgba, interpolation='nearest')

    for i in range(v.shape[0]):
        for j in range(v.shape[1]):
            c = 'w'
            if v[i, j] < 0.4: c = 'k'
            if v[i, j] != 0:
                text = ax.text(j, i, policy[i, j], ha="center", va="center", color=c)
    plt.axis('off')
    # plt.savefig('stochastic_policy.jpg', bbox_inches='tight', dpi=200)
    plt.show()
    
def print_q_policy(q,qPolicy, grid):

    #shape of q policy
    qValue=[]
    for state in range(q.shape[0]):
        qValue.append(max(q[state]))
    qValue = np.reshape(qValue, (grid.n, grid.m))
    qPolicy = np.reshape(qPolicy, (grid.n, grid.m))

    cmap = plt.cm.get_cmap('Greens', 10)
    norm = plt.Normalize(qValue.min(), qValue.max())
    rgba = cmap(norm(qValue))

    for w in grid.items.get('water').get('loc'):
        idx = np.unravel_index(w, qValue.shape)
        rgba[idx] = 0.0, 0.5, 0.8, 1.0

    for f in grid.items.get('fire').get('loc'):
        idx = np.unravel_index(f, qValue.shape)
        rgba[idx] = 1.0, 0.5, 0.1, 1.0
        
    for f in grid.items.get('block').get('loc'):
        idx = np.unravel_index(f, qValue.shape)
        rgba[idx] = 0,0,0,0

    fig, ax = plt.subplots()
    im = ax.imshow(rgba, interpolation='nearest')

    for i in range(qValue.shape[0]):
        for j in range(qValue.shape[1]):
            c = 'w'
            if qValue[i, j] < 0.6: c = 'k'
            if qValue[i, j] != 0 :
                text = ax.text(j, i, qPolicy[i, j], ha="center", va="center", color=c)
                
    plt.axis('off')
    # plt.savefig('stochastic_policy.jpg', bbox_inches='tight', dpi=200)
    plt.show()
        


def interate_values(grid, v , policy, gamma, theta, p_stoch):
    converged = False # convergence
    i = 0 #nombre d'itérations
    sp = p_stoch #proba qu'il obéisse à la direction

    #stochastic : il écoutera pas forcément
    p = {'U': [sp,0,(1-sp)/2,(1-sp)/2],
         'D': [0,sp,(1-sp)/2,(1-sp)/2],
         'L': [(1-sp)/2,(1-sp)/2,sp,0],
         'R': [(1-sp)/2,(1-sp)/2,0,sp]}
    
    ########## v : OPTIMAL VALUE ############### 
    while not converged:
        DELTA = 0
        for state in grid.state_space: #pour chaque état
            i += 1
            if  grid.check_terminal(state): # si l'état est final
                v[state] = 0 #value nul car pas de déplacement

            else:
                old_v = v[state] #on récupère V 
                new_v = [] #nouveau déplacements
                for action in grid.actions: #pour chaque action (on dit U)
                    new_v_p = []

                    for idx, action_p in enumerate(grid.actions):# pour chaque action (on dit U il peut faire ULR)
                        (n_state, reward) = grid.P.get((state, action_p)) # nouveau état en fonction de l'action
                        new_v_p.append(p.get(action)[idx] * (reward + (gamma * v[n_state]))) #on note la value en fonction de la reward du nouvel état
                    new_v.append(sum(new_v_p)) #on somme la value pour chaque action 

                v[state] = max(new_v) #on retient la value la plus haute
               
                DELTA = max(DELTA, np.abs(old_v - v[state])) #convergence
                converged = True if DELTA < theta else False
                
    ########## V : OPTIMAL POLICY ###############
    for state in grid.state_space: #pour chaque action donnée
        i += 1
        new_vs = []

        for action in grid.actions: #pour chaque action possible
            (n_state, reward) = grid.P.get((state, action)) #nouveau état et reward
            new_vs.append(reward + gamma * v[n_state]) #on retient la value

        new_vs = np.array(new_vs) 
        best_action_idx = np.where(new_vs == new_vs.max())[0] #on garde la meilleure policy
        policy[state] = grid.actions[best_action_idx[0]]#on note la meilleure policy
     
    print(i, 'iterations of state space')
    return v, policy

def interate_qValues(grid,q,qPolicy,gamma,theta,p_stoch,epsilon,alpha):
    converged = False # convergence
    stopCondition = False # condition d'arret
    action=0
    i = 0 #nombre d'itérations
    sp = p_stoch #proba qu'il obéisse à la direction

    #stochastic : il écoutera pas forcément
    p = {'U': [sp,0,(1-sp)/2,(1-sp)/2],
         'D': [0,sp,(1-sp)/2,(1-sp)/2],
         'L': [(1-sp)/2,(1-sp)/2,sp,0],
         'R': [(1-sp)/2,(1-sp)/2,0,sp]}
    a=0
    ########## Q : OPTIMAL ACTION VALUE ############### 
    for b in range(1000):
        DELTA = 0
        stopCondition = False 
        state = random.choice(grid.state_space)
        while not stopCondition:
            a=a+1    
            if grid.check_terminal(state): # si l'état est final
                stopCondition=True
                q[state]=0
                break
             
            m=random.uniform(0, 1)
            if(m<epsilon):
                idx = np.random.choice(len(grid.actions))
                action = grid.actions[idx]
            else:
                idx = np.argmax(q[state])
                action = grid.actions[idx]

            print("BEFORE : ",action)
            action = grid.stochasticMove(action,p)#ecoute pas forcement
            print("AFTER : ",action)
            
            (n_state, reward) = grid.P.get((state, action)) # nouveau état en fonction de l'action
            old_q = q[state][idx] #on récupère Q
            next_max = np.max(q[n_state])
            
            new_q = (1-alpha) * old_q + alpha *(reward + gamma * next_max)
            
            q[state][idx] = new_q
            state = n_state
            
            DELTA = max(DELTA, np.abs(old_q - q[state][idx])) #convergence
            converged = True if DELTA < theta else False     
            
        for state in grid.state_space: #pour chaque action donnée
            best_action_idx = np.argmax(q[state])
            qPolicy[state] = grid.actions[best_action_idx]#on note la meilleure policy
    
    print("Iteration Q : ",a)
    return q,qPolicy

if __name__ == '__main__':

    grid_size = (4,3)#ligne et colonne de la grid
    items = {'fire': {'reward': -1, 'loc': [7]},#on place le le feu froid et block
             'water': {'reward': 1, 'loc': [3]},
             'block': {'reward': 0, 'loc': [5]}}

    gamma = 0.99
    theta = 1e-10
    p_stoch = 0.8 #proba d'avancé en avant les autre seront la moitié du reste
    epsilon=0.1
    alpha=0.1
    
    v = np.zeros(np.prod(grid_size))
    policy = np.full(np.prod(grid_size), 'n')
    
    env = GridWorld(grid_size, items)
    
    q = np.zeros((np.prod(grid_size),len(env.actions)))
    qPolicy = np.full(np.prod(grid_size), 'n')
    

    v, policy = interate_values(env, v, policy, gamma, theta, p_stoch)
    q, qPolicy = interate_qValues(env,q,qPolicy, gamma,theta, p_stoch,epsilon,alpha)
    
    print("V")
    print_v(v, env)
    print_policy(v, policy, env)
    
    print("Q")
    print_q_policy(q,qPolicy, env)
    # print_v(q, env)
    # print_policy(q, qPolicy, env)

275 iterations of state space
BEFORE :  U
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  L
AFTER :  R
BEFORE :  U
AFTER :  R
BEFORE :  D
AFTER :  R
BEFORE :  R
AFTER :  R
BEFORE :  L
AFTER :  R
BEFO

KeyboardInterrupt: 