# FDMS, TME3 - Q-Learning
### Auteur
* BIZZOZZERO Nicolas
* ADOUM Robert

In [1]:
import sys
import argparse
import copy
import json
import time

import numpy as np
import matplotlib
matplotlib.use("TkAgg")
import gym
from gym import wrappers, logger

import envs

# Q-Learning

In [2]:
class QLearning(object):
    def __init__(self, env, eps=0.1, alpha=0.1, gamma=0.7, show=1000, video=False):
        self.gamma = gamma
        self.alpha = alpha
        self.action_space = env.action_space
        self.q_table = dict()
        self.state = dict()
        
        # Application de l'algorithme du Q-Learning
        i = 1
        j = 0
        k = 1
        while True:
            # Initialisation de s0
            state = env.reset().dumps()
            if state not in self.state:
                self.state[state] = j
                j += 1
            
            try:
                self.q_table[self.state[state], 0]
            except KeyError:    
                for action in range(self.action_space.n):
                    self.q_table[self.state[state], action] = 0
                    
            done = False
            
            #ici on teste ce qu'il apprend
            if i % show == 0:
                r = 0
                a = 0
                if video:
                    envx.render(1)
                while not done:
                    action = self.act(state)
                    next_state, reward, done, _ = env.step(action)
                    r += reward
                    a += 1
                    if video:
                        envx.render()
                    state = next_state.dumps()
                print("reward = ", r, "action = ", a)
                    
            #ici il apprend
            else:
                while not done:  
                    if np.random.uniform() < eps:
                        action = self.action_space.sample()
                    else:
                        action = self.act(state)

                    next_state, reward, done, _ = env.step(action)
                    next_state = next_state.dumps()
                    if next_state not in self.state:
                        self.state[next_state] = j
                        j += 1

                    # On regarde si le prochain etat est nouveau
                    try:
                        self.q_table[self.state[next_state], 0]
                    except KeyError:
                        # Le prochain etat est nouveau
                        for action in range(self.action_space.n):
                            self.q_table[self.state[next_state], action] = 0

                    self.q_table[self.state[state], action] += alpha * (reward + gamma * self.get_max(next_state) - self.q_table[self.state[state], action])
                    state = next_state
                    k =+ 1 
            eps /= k
            i += 1

    def act(self, observation):
        return np.argmax([self.q_table[self.state[observation], action] for action in range(self.action_space.n)])
    
    def get_max(self, next_state):
        max_value = 0
        for action in range(self.action_space.n):
            if self.q_table[self.state[next_state], action] > max_value:
                max_value = self.q_table[self.state[next_state], action]
        return max_value

## GridWorld

In [None]:
env_id = 'gridworld-v0'
outdir='output'
logger.set_level(logger.INFO)
envx = gym.make(env_id)
envx.verbose = True
env = wrappers.Monitor(envx, directory=outdir, force=True, video_callable=False)
env.seed(0)
envx.setPlan("gridworldPlans/plan1.txt", {0:-0.001, 3:1, 4:1, 5:-1, 6:-1})
QLearning(env, eps=0.9, alpha=0.5, gamma=0.9, show=1000, video=True)

# Deep Q-Network

In [None]:
import sys
import argparse
import copy
import json
import random
import time
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use("TkAgg")
import gym
from gym import wrappers, logger
# import envs

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torch.nn.functional as F

from collections import deque
import random

from tqdm import tqdm
from DQL import DQL_ER, set_all_seeds

In [2]:
class Q_module(nn.Module):
    def __init__(self, inSize, outSize, layers=[]):
        super(Q_module, self).__init__()
        self.layers = nn.ModuleList([])
        for x in layers:
            self.layers.append(nn.Linear(inSize, x))
            inSize = x
        self.layers.append(nn.Linear(inSize, outSize))

    def forward(self, x):
        x = self.layers[0](x)
        for i in range(1, len(self.layers)):
            x = torch.nn.functional.leaky_relu(x)
            x = self.layers[i](x)
        return x

In [3]:
class DQL_ER(object):
    def __init__(self, inSize, outSize, env, envx, layers=[200],
                 minibatch_size=200, eps=0.5, gamma=0.99, lr=0.001, C=4):
        self.D = deque(maxlen=300)
        self.action_space = env.action_space
        self.Q = Q_module(inSize, outSize, layers)
        self.Q_hat = Q_module(inSize, outSize, layers)
        self.Q_hat.load_state_dict(self.Q.state_dict())
        self.optimizer = optim.Adam(self.Q.parameters(), lr=lr)
        self.loss = nn.MSELoss()  # nn.SmoothL1Loss()

        self.eps = eps
        self.C = C
        self.envx = envx
        self.gamma = gamma
        self.minibatch_size = minibatch_size

    def fit(self, M, graph=True, verbose=False):
        reward_list = []
        loss_list = []
        v = 0
        c = 1
        for m in range(M):
            state = env.reset()
            done = False
            r = 0
            while not done:
                # With probability eps select a random action
                action = self.select_action(state)

                # Execute action and observe reward r and next state
                next_state, reward, done, _ = env.step(action)
                r += reward

                # Si done = True alors 1 - done = 0 et on retrouve bien que
                # target = reward
                target = reward + (1 - done) * self.gamma * \
                    self.get_max(next_state)
                self.D.append([state, action, reward, next_state, target])

                # Gradient descent
                if c >= self.minibatch_size:
                    minibatch = random.sample(self.D, self.minibatch_size)
                    y = torch.tensor([mb[4] for mb in minibatch],
                                     dtype=torch.float32)
                    x = torch.tensor([mb[0] for mb in minibatch],
                                     dtype=torch.float32, requires_grad=True)
                    self.optimizer.zero_grad()
                    y_pred = self.Q(x)
                    loss = self.loss(torch.max(y_pred, 1)[0], y)
                    loss.backward()
                    self.optimizer.step()

                    reward_list.append(r)
                    loss_list.append(loss)

                    if c % self.C == 0:
                        self.Q_hat.load_state_dict(self.Q.state_dict())
                state = next_state
                c += 1

            if verbose:
                print("itération:", v, "reward:", r)

            v += 1
            self.update_eps()

            if c % 100 == 0:
                self.test(self.envx, 1, verbose=True, graph=False)

        if graph:
            x = [i for i in range(1, M + 1)]
            plt.plot(x, reward_list)
            plt.xlabel("Nombre de parties")
            plt.ylabel("Score")
            plt.show()
            plt.plot(x, loss_list)
            plt.xlabel("Nombre de parties")
            plt.ylabel("Loss en fin de partie")
            plt.show()

    def select_action(self, state):
        if np.random.uniform() < self.eps:
            return self.action_space.sample()

        with torch.no_grad():
            pred = self.Q(torch.tensor(state, dtype=torch.float32))
        return torch.argmax(pred).numpy()

    def act(self, state):
        with torch.no_grad():
            pred = self.Q(torch.tensor(state, dtype=torch.float32))
        return torch.argmax(pred).numpy()

    def get_max(self, next_state):
        with torch.no_grad():
            pred = self.Q_hat(torch.tensor(next_state, dtype=torch.float32))
        maxi = torch.max(pred.detach())
        return maxi

    def update_eps(self):
        self.eps -= 0.005

    def test(self, envx, T, graph=True, verbose=False, demo_jeu=False):
        reward_list = []
        for i in range(T):
            if demo_jeu:
                envx.render(1)
            state = env.reset()
            done = False
            r = 0
            while not done:
                action = self.act(state)
                if demo_jeu:
                    envx.render()
                next_state, reward, done, _ = env.step(action)
                r += reward
                state = next_state

            if verbose:
                print(r)
            reward_list.append(r)
        if graph:
            plt.plot([i for i in range(1, T + 1)], reward_list)
            plt.xlabel("Nombre de parties")
            plt.ylabel("Score")
            plt.show()

## CartPole

In [5]:
inSize = 4
outSize = 2
M = 800

env_id = 'CartPole-v1'
outdir = 'cartpole-v0/random-agent-results'
envx = gym.make(env_id)
envx.verbose = True
env = envx
env = wrappers.Monitor(envx, directory=outdir, force=True,
                       video_callable=False)
set_all_seeds(env, seed=1)

dql = DQL_ER(inSize, outSize, env, envx)
dql.fit(M, verbose=True, graph=False)

itération: 0 reward: 12.0
itération: 1 reward: 10.0
itération: 2 reward: 23.0
itération: 3 reward: 14.0
itération: 4 reward: 14.0
itération: 5 reward: 15.0
itération: 6 reward: 9.0
itération: 7 reward: 12.0
itération: 8 reward: 11.0
itération: 9 reward: 17.0
itération: 10 reward: 22.0
itération: 11 reward: 12.0
itération: 12 reward: 10.0
itération: 13 reward: 9.0
itération: 14 reward: 11.0
itération: 15 reward: 10.0
itération: 16 reward: 14.0
itération: 17 reward: 9.0
itération: 18 reward: 10.0
itération: 19 reward: 11.0
itération: 20 reward: 9.0
itération: 21 reward: 13.0
itération: 22 reward: 11.0
itération: 23 reward: 11.0
10.0
itération: 24 reward: 10.0
itération: 25 reward: 9.0
itération: 26 reward: 13.0
itération: 27 reward: 13.0
itération: 28 reward: 10.0
itération: 29 reward: 11.0
itération: 30 reward: 10.0
itération: 31 reward: 10.0
itération: 32 reward: 8.0
itération: 33 reward: 8.0
itération: 34 reward: 12.0
itération: 35 reward: 9.0
itération: 36 reward: 14.0
itération: 37 

## Lunar

In [2]:
eps = 0.1
lr = 0.001
M = 11000
minibatch_size = 32
size_deque = 1000000
layers = [20, 20]
gamma = 0.9


env_id = 'LunarLander-v2'
outdir = 'LunarLander-v2/random-agent-results'
envx = gym.make(env_id)
envx.verbose = True
env = envx
env = wrappers.Monitor(envx, directory=outdir, force=True,
                       video_callable=False)
set_all_seeds(env, seed=0)

inSize = env.observation_space.shape[0]
outSize = env.action_space.n


dql = DQL_ER(inSize, outSize, env, envx, eps=eps, lr=lr, size_deque=size_deque, layers=layers,
             minibatch_size=minibatch_size, gamma=gamma)
dql.fit(M, env, verbose=True, graph=False)

  result = entry_point.load(False)


itération: 0 reward: -177.94488369914325
itération: 1 reward: -217.68622166951582
itération: 2 reward: -118.03362795336396
itération: 3 reward: -119.99196933536234
itération: 4 reward: -76.12515202552902
itération: 5 reward: -141.035038913556
itération: 6 reward: -30.69748257567842
itération: 7 reward: -151.43824016730218
itération: 8 reward: -237.24673526341215
itération: 9 reward: -134.30766633739648
itération: 10 reward: -138.07410363554533
itération: 11 reward: -50.25470187100598
itération: 12 reward: -136.74168139696184
itération: 13 reward: -125.68013398512792
itération: 14 reward: -117.76699475487698
itération: 15 reward: -136.28747306689644
itération: 16 reward: -91.37202432513874
itération: 17 reward: -169.55233106869562
itération: 18 reward: -129.39381372068263
itération: 19 reward: -154.56493085599973
itération: 20 reward: -153.7930716346706
itération: 21 reward: -116.60692787811875
itération: 22 reward: -123.643707219299
itération: 23 reward: -137.29533056529118
itération: 