# Script d'entraînement de l'algorithme Q-learning sur le jeu Catcher
Inspiré de https://www.datacamp.com/tutorial/introduction-q-learning-beginner-tutorial


### Imports et rechargement des librairies

In [306]:
import numpy as np
import pandas as pd
import os
import sys
import time
# Add the parent of the current working directory to the import path so that
# the `src` package imported below can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src import ql, env
from datetime import datetime

import importlib
# Reload the project modules so that edits made to their source files are
# picked up without restarting the interpreter.
importlib.reload(ql)
importlib.reload(env)

<module 'src.env' from 'c:\\Users\\basil\\Documents\\Work\\UQAC\\S2\\Projet\\IV - Projet\\Article 1 - QLearning\\src\\env.py'>

### Paramètres du programme

In [307]:
# Training params - Bellman equation
# GAMMA is handed to the agent as `gamma`, LEARNING_RATE as `learning_rate`.
GAMMA = 0.85
LEARNING_RATE = 0.01

# Eval params
# Number of episodes played by the evaluation loop.
N_EVAL_EPISODES = 100
# Optional per-episode seeds: when non-empty, evaluation episode i is reset
# with seed EVAL_SEED[i]; when empty, the environment is reset without a seed.
EVAL_SEED = []

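Set `game_name` in the cell below to choose the environment to train on: either `Catcher` or the id of a gym game such as `FrozenLake-v1`. The episode budget and the per-episode step limit are picked automatically from that name.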

In [308]:
# Select the game and the matching training budget.
#
# The environment module is imported here under its own alias because the
# name `env` is rebound below to the environment *instance*. Going through
# the alias keeps this cell re-runnable: calling `env.env(...)` a second time
# would otherwise be attempted on the instance instead of the module.
from src import env as env_module

# Other games used with this script: "CliffWalking-v0", "Catcher", "Taxi-v3".
game_name = "FrozenLake-v1"

env = env_module.env(game_name)
# NOTE(review): a commented-out variant passed map_name="4x4",
# is_slippery=False to the constructor - confirm the wrapper accepts these
# keyword arguments before re-enabling it.

if game_name == "Catcher":
    # NOTE(review): STATE_VERS and base_score are not read anywhere in the
    # visible part of the file - confirm whether they are still needed.
    STATE_VERS = 3
    base_score = 5
    # None disables the per-episode step cap checked in the training loop.
    max_steps = None
    EPISODES = 500
elif game_name == "Taxi-v3":
    max_steps = 300
    EPISODES = 50000
else:
    max_steps = 99
    EPISODES = 10000

In [309]:
# Human-readable summary of the run (episode count, discount factor, game),
# written later to the info file of the simulation folder.
content = (
    f"Simulation info : \nNb eps: {EPISODES}\n"
    f"Gamma: {GAMMA}\n"
    f"Game name: {game_name}\n"
)

### Préparation d'un dossier pour enregistrer les données

In [310]:
# Build the output locations for this run. The run folder is named after the
# current local time with minute resolution.
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H%M")
dest_dir = os.path.join("..", "simulations", formatted_datetime)
models_dir = os.path.join(dest_dir, "models")
info_file_path = os.path.join(dest_dir, "info.txt")

# makedirs creates every missing parent (including "../simulations" and the
# run folder itself), and exist_ok=True lets the cell be re-run within the
# same minute without raising FileExistsError.
os.makedirs(models_dir, exist_ok=True)

# Explicit encoding so the file content does not depend on the platform's
# default locale encoding.
with open(info_file_path, "w", encoding="utf-8") as file:
    file.write(content)

### Création et initialisation de l'environnement de jeu et de l'agent

In [311]:
# Gather the constructor arguments first: the table dimensions and the
# exploration schedule are read from the environment object, the learning
# rate and discount factor from the notebook constants.
agent_settings = {
    "state_size": env.state_space,
    "action_size": env.action_space,
    "learning_rate": LEARNING_RATE,
    "gamma": GAMMA,
    "exploration_min": env.exploration_min,
    "exploration_max": env.exploration_max,
    "exploration_decay": env.exploration_decay,
}
agent = ql.QLearning(**agent_settings)

### Training phase

In [312]:

# Train the agent for EPISODES episodes, logging one row per episode and
# periodically checkpointing both the Q-table and the log.

# Columns of the per-episode log written to output.csv.
log_columns = ["epoch", "step_num", "score", "agent_er", "time"]
# Rows are accumulated in a plain list and converted to a DataFrame only when
# the log is written: appending to a list stays cheap, whereas enlarging a
# DataFrame one row at a time gets slower as the log grows.
log_rows = []
output_csv_path = os.path.join(dest_dir, "output.csv")

for e in range(EPISODES):
    start_time = time.time()
    state = env.reset()
    step = 0
    done = False
    agent.updateExplorationRate(e)
    while not done:
        action = agent.act(state)

        next_state, reward, done, score = env.step(action)
        agent.update(state=state, new_state=next_state, action=action,
                     reward=reward)

        state = next_state
        step += 1
        # Cut the episode short once the optional step budget is used up
        # (max_steps is None when no budget applies).
        if max_steps is not None and step >= max_steps:
            done = True

    print(f"Episode {e}/{EPISODES-1} completed with {step} steps. Score: {score:.0f}. EP: {agent.exploration_rate:.2f}.")
    # Wall-clock duration of the episode, in seconds.
    episode_duration = time.time() - start_time
    print(f"Episode n°{e} done in {step} steps.")
    log_rows.append([e, step, score, agent.exploration_rate, episode_duration])

    # Periodic checkpoints (both conditions also hold for episode 0).
    if e % 5000 == 0:
        agent.save(os.path.join(models_dir, f"q_table_{e}.npy"))
    if e % 10000 == 0:
        pd.DataFrame(log_rows, columns=log_columns).to_csv(output_csv_path, index=False)

# Final log and final Q-table.
df = pd.DataFrame(log_rows, columns=log_columns)
df.to_csv(output_csv_path, index=False)
agent.save(os.path.join(models_dir, "q_table_final.npy"))

Episode 0/9999 completed with 8 steps. Score: 0. EP: 1.00.
Episode n°0 done in 8 steps.
Episode 1/9999 completed with 4 steps. Score: 0. EP: 0.95.
Episode n°1 done in 4 steps.
Episode 2/9999 completed with 7 steps. Score: 0. EP: 0.90.
Episode n°2 done in 7 steps.
Episode 3/9999 completed with 2 steps. Score: 0. EP: 0.86.
Episode n°3 done in 2 steps.
Episode 4/9999 completed with 3 steps. Score: 0. EP: 0.81.
Episode n°4 done in 3 steps.
Episode 5/9999 completed with 9 steps. Score: 0. EP: 0.77.
Episode n°5 done in 9 steps.
Episode 6/9999 completed with 7 steps. Score: 0. EP: 0.74.
Episode n°6 done in 7 steps.
Episode 7/9999 completed with 15 steps. Score: 0. EP: 0.70.
Episode n°7 done in 15 steps.
Episode 8/9999 completed with 2 steps. Score: 0. EP: 0.66.
Episode n°8 done in 2 steps.
Episode 9/9999 completed with 10 steps. Score: 0. EP: 0.63.
Episode n°9 done in 10 steps.
Episode 10/9999 completed with 3 steps. Score: 0. EP: 0.60.
Episode n°10 done in 3 steps.
Episode 11/9999 completed 

Episode 70/9999 completed with 47 steps. Score: 0. EP: 0.03.
Episode n°70 done in 47 steps.
Episode 71/9999 completed with 16 steps. Score: 0. EP: 0.03.
Episode n°71 done in 16 steps.
Episode 72/9999 completed with 5 steps. Score: 0. EP: 0.02.
Episode n°72 done in 5 steps.
Episode 73/9999 completed with 19 steps. Score: 0. EP: 0.02.
Episode n°73 done in 19 steps.
Episode 74/9999 completed with 9 steps. Score: 0. EP: 0.02.
Episode n°74 done in 9 steps.
Episode 75/9999 completed with 6 steps. Score: 0. EP: 0.02.
Episode n°75 done in 6 steps.
Episode 76/9999 completed with 11 steps. Score: 0. EP: 0.02.
Episode n°76 done in 11 steps.
Episode 77/9999 completed with 5 steps. Score: 0. EP: 0.02.
Episode n°77 done in 5 steps.
Episode 78/9999 completed with 13 steps. Score: 0. EP: 0.02.
Episode n°78 done in 13 steps.
Episode 79/9999 completed with 19 steps. Score: 0. EP: 0.02.
Episode n°79 done in 19 steps.
Episode 80/9999 completed with 13 steps. Score: 0. EP: 0.02.
Episode n°80 done in 13 ste

### Evaluation

In [313]:
# Evaluate the trained agent: play N_EVAL_EPISODES episodes, always taking
# the action with the highest Q-value, and report the mean and standard
# deviation of the total reward per episode.

# An evaluation episode is cut off once its step count exceeds this value.
MAX_EVAL_STEPS = 400

# Fail early with a clear message instead of an IndexError in the middle of
# the evaluation when seeds are provided but there are too few of them.
if EVAL_SEED and len(EVAL_SEED) < N_EVAL_EPISODES:
    raise ValueError(
        f"EVAL_SEED holds {len(EVAL_SEED)} seed(s) but "
        f"{N_EVAL_EPISODES} evaluation episodes were requested"
    )

episode_rewards = []
for episode in range(N_EVAL_EPISODES):
    if EVAL_SEED:
        state = env.reset(seed=EVAL_SEED[episode])
    else:
        state = env.reset()
    step = 0
    done = False
    total_rewards_ep = 0
    while not done:
        # A state that is not a plain integer index is unwrapped by taking
        # its first element. NumPy integer scalars are accepted as indices
        # too: they are not `int` instances and cannot be subscripted.
        if not isinstance(state, (int, np.integer)):
            state = state[0]
        # Take the action (index) that has the maximum expected reward.
        action = np.argmax(agent.q_table[state][:])
        new_state, reward, done, _ = env.step(action)
        total_rewards_ep += reward
        step += 1
        if step > MAX_EVAL_STEPS:
            done = True
        state = new_state
    episode_rewards.append(total_rewards_ep)


mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Mean_reward=0.00 +/- 0.00
