# Apprentissage par Renforcement - Programmation Dynamique

**Auteurs** :  
BIZZOZZERO Nicolas  
ADOUM Robert 

In [1]:
import sys
import argparse
import copy
import json
import time

import matplotlib.pyplot as plt
import numpy as np

import matplotlib
# matplotlib.use("TkAgg")
import gym
from gym import wrappers, logger

import envs
from randomAgent import RandomAgent

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/usr/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File

In [3]:
def test_function(function=RandomAgent, env_id='gridworld-v0', outdir_path='gridworld-v0/random-agent-results',
                  plan_path="gridworldPlans/plan0.txt", dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1},
                  episode_count=10000, gamma=0.1, verbose=False, show_agent=True, seed=None):
    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.INFO)

    envx = gym.make(env_id)
    outdir = outdir_path 
    env = wrappers.Monitor(envx, directory=outdir, force=True, video_callable=False)
    
    if seed is not None:
        env.seed(seed)

    reward = 0
    done = False
    envx.verbose = True
    
    moyenne_score = 0
    moyenne_actions = 0

    envx.setPlan(plan_path, dico_rewards)
    if function == RandomAgent:
        agent = function(envx.action_space)
    else:
        agent = function(envx.action_space , envx.getMDP(), gamma=gamma)

    rsum=0
    for i in range(episode_count):
        ob = env.reset()

        if i % 100 == 0 and i > 0 and show_agent:
            envx.verbose = True
        else:
            envx.verbose = False

        if envx.verbose:
            envx.render(1)
        j = 0

        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            
            rsum += reward
            j += 1
            if envx.verbose:
                envx.render()
            if done:
                if verbose:
                    print(str(i), "rsum=" + str(rsum) + ",", str(j), "actions")
                moyenne_actions += j
                moyenne_score += rsum
                
                rsum=0
                break
    env.close()
    return moyenne_actions / episode_count, moyenne_score / episode_count

## Policy iteration

In [12]:
class PolicyIterationAgent(object):
    def __init__(self, action_space, MDP, eps= 0.000001, gamma=0.9):
        self.eps = eps
        self.action_space = action_space
        self.all_states = [s for s in MDP[0]] # Ici sont compris les etats terminaux
        self.state = [s for s in MDP[1]] # Ici non
        self.P = MDP[1]
        self.gamma = gamma
        
        self.pi = {s: None for s in self.state} 
        pi_kplus = {s: self.action_space.sample() for s in self.state}

        while self.pi != pi_kplus:
            self.pi = dict(pi_kplus) # Copie
            Vpi_t = np.array([np.random.random(1)[0] for _ in range(len(self.all_states))])
            Vpi_tplus = np.array([0. for i in range(len(self.all_states))])

            for s in self.state:
                Vpi_tplus[self.state.index(s)] = np.sum([proba*(reward + self.gamma*Vpi_t[self.all_states.index(s_prime)]) 
                    for proba, s_prime, reward, boolean in self.P[s][self.pi[s]]])

            while np.linalg.norm((Vpi_t - Vpi_tplus), ord=np.inf) > self.eps:
                Vpi_t = Vpi_tplus.copy()
                for s in self.state:
                    Vpi_tplus[self.state.index(s)] = np.sum([proba*(reward + self.gamma*Vpi_t[self.all_states.index(s_prime)]) 
                        for proba, s_prime, reward, boolean in self.P[s][self.pi[s]]])
        
            for s in self.state:
                pi_kplus[s] = np.argmax(
                    [np.sum([proba * (reward + self.gamma * Vpi_t[self.all_states.index(s_prime)]) 
                        for proba, s_prime, reward, boolean in self.P[s][a]]) for a in range(self.action_space.n)])
        
        self.pi = dict(pi_kplus)
    
    def act(self, observation, reward, done):
        return self.pi[observation.dumps()]

## Value iteration

In [13]:
class ValueIterationAgent(object):
    def __init__(self, action_space, MDP, eps= 0.000001, gamma=0.09):
        self.eps = eps
        self.action_space = action_space
        self.all_states = [s for s in MDP[0]] # Ici sont compris les etats terminaux
        self.state = [s for s in MDP[1]] # Ici non
        self.P = MDP[1]
        self.gamma = gamma
        
        self.pi = {s: None for s in self.state} 
        
        Vpi_t = np.array([np.random.random(1)[0] for _ in range(len(self.all_states))])
        Vpi_tplus = np.array([0. for i in range(len(self.all_states))])

        for s in self.state:
            Vpi_tplus[self.state.index(s)] = np.max([np.sum([proba*(reward + self.gamma*Vpi_t[self.all_states.index(s_prime)]) 
                for proba, s_prime, reward, boolean in self.P[s][a]]) for a in range(self.action_space.n)])

        while np.linalg.norm((Vpi_t - Vpi_tplus), ord=np.inf) > self.eps:
            Vpi_t = Vpi_tplus.copy()
            for s in self.state:
                Vpi_tplus[self.state.index(s)] = np.max([np.sum([proba*(reward + self.gamma*Vpi_t[self.all_states.index(s_prime)]) 
                for proba, s_prime, reward, boolean in self.P[s][a]]) for a in range(self.action_space.n)])
        
        for s in self.state:
            self.pi[s] = np.argmax(
                [np.sum([proba * (reward + self.gamma * Vpi_t[self.all_states.index(s_prime)]) 
                    for proba, s_prime, reward, boolean in self.P[s][a]]) for a in range(self.action_space.n)])
    
    def act(self, observation, reward, done):
        return self.pi[observation.dumps()]

## Recherche du meilleur $\gamma$

In [6]:
def gamma_research(function, plan_path, min_gamma=0, max_gamma=1, step=0.1, verbose=True):
    list_actions = []
    list_score = []
    gamma = min_gamma
    while gamma <= 1:
        if verbose:
            print(function, "gamma", gamma)
        a, c = test_function(function=function, show_agent=False, gamma=gamma)
        list_actions.append(a)
        list_score.append(c)
        gamma += step
    return list_actions, list_score

In [7]:
def graph(min_gamma=0, max_gamma=1, step=0.1, plan_path="gridworldPlans/plan0.txt", verbose=True):
    x = np.arange(min_gamma, max_gamma + step, step)
    list_action, list_score = gamma_research(PolicyIterationAgent, plan_path=plan_path,
                                    min_gamma=min_gamma, max_gamma=max_gamma, step=step, verbose=verbose)
    list_action2, list_score2 = gamma_research(ValueIterationAgent, plan_path=plan_path,
                                    min_gamma=min_gamma, max_gamma=max_gamma, step=step, verbose=verbose)
    plt.plot(x, list_action, 'r-', label='Policy Iteration Nombre d\'action moyen')
    plt.plot(x, list_score,  'r--', label='Policy Iteration Score moyen')
    
    plt.plot(x, list_action2, 'b-', label='Value Iteration Nombre d\'action moyen')
    plt.plot(x, list_score2,  'b--', label='Value Iteration Score moyen')
    plt.xlabel("gamma")
    plt.legend()
    plt.show()  

In [None]:
%matplotlib inline
graph(verbose=False)

## Test de plusieurs politiques

In [19]:
t1 = time.time()
res = test_function(function=RandomAgent, show_agent=False, seed=0,
                    outdir_path='gridworld-v0/random-agent-results',
                    plan_path="gridworldPlans/plan0.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Politique : Random\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    outdir_path='gridworld-v0/random-agent-results',
                    plan_path="gridworldPlans/plan0.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Politique : Policy Iteration\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=ValueIterationAgent, show_agent=False, seed=0,
                    outdir_path='gridworld-v0/random-agent-results',
                    plan_path="gridworldPlans/plan0.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Politique : Value Iteration\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).")

INFO: Making new env: gridworld-v0
['1 1 1 1 1 1\n', '1 0 0 0 3 1\n', '1 0 1 0 5 1\n', '1 0 0 0 2 1\n', '1 1 1 1 1 1\n', '1 1 1 1 1 1']
['1 1 1 1 1 1\n', '1 0 0 0 3 1\n', '1 0 1 0 5 1\n', '1 0 0 0 2 1\n', '1 1 1 1 1 1\n', '1 1 1 1 1 1']
INFO: Finished writing results. You can upload them to the scoreboard via gym.upload('C:\\Users\\Nicolas\\Documents\\courses\\[5I853] FDMS - Fouille de Données et Médias Sociaux\\TD-TME\\TME8\\gridworld-v0\\random-agent-results')
Politique : Random
Moyenne actions : 11.0157 
Moyenne scores : -0.7824157000000278 
Temps : 9.709 seconde(s).

INFO: Making new env: gridworld-v0
['1 1 1 1 1 1\n', '1 0 0 0 3 1\n', '1 0 1 0 5 1\n', '1 0 0 0 2 1\n', '1 1 1 1 1 1\n', '1 1 1 1 1 1']
INFO: Clearing 2 monitor files from previous run (because force=True was provided)
['1 1 1 1 1 1\n', '1 0 0 0 3 1\n', '1 0 1 0 5 1\n', '1 0 0 0 2 1\n', '1 1 1 1 1 1\n', '1 1 1 1 1 1']
INFO: Finished writing results. You can upload them to the scoreboard via gym.upload('C:\\Users\\Nicol

Les deux politiques semblent prendre approximativement le même temps pour converger ainsi que les mêmes scores.

## Test de plusieurs plans

In [22]:
t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan1.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 1\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan2.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 2\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan3.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 3\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan4.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 4\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan5.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 5\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan6.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 6\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan7.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 7\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan8.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 8\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

t1 = time.time()
res = test_function(function=PolicyIterationAgent, show_agent=False, seed=0,
                    plan_path="gridworldPlans/plan9.txt",
                    dico_rewards={0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
t2 = time.time()
print("Plan : 9\nMoyenne actions :", res[0], "\nMoyenne scores :", res[1], "\nTemps :", round(t2 - t1, 3), "seconde(s).\n")

INFO: Making new env: gridworld-v0
['1 1 1 1 1 1\n', '1 0 0 0 3 1\n', '1 0 1 0 5 1\n', '1 0 0 0 2 1\n', '1 1 1 1 1 1\n', '1 1 1 1 1 1']
INFO: Clearing 2 monitor files from previous run (because force=True was provided)
['1 1 1 1 1 1\n', '1 0 0 0 3 1\n', '1 0 1 4 5 1\n', '1 0 0 0 2 1\n', '1 1 1 1 1 1\n']
INFO: Finished writing results. You can upload them to the scoreboard via gym.upload('C:\\Users\\Nicolas\\Documents\\courses\\[5I853] FDMS - Fouille de Données et Médias Sociaux\\TD-TME\\TME8\\gridworld-v0\\random-agent-results')
Plan : 1
Moyenne actions : 510.1851 
Moyenne scores : 0.4950143 
Temps : 840.842 seconde(s).

