In [1]:
import os
print(os.getcwd())
def update_working_directory():
    from pathlib import Path
    p = Path(os.getcwd()).parents[0]
    os.chdir(p)
    print(p)
update_working_directory()

/Users/admin/Projects/doggo/notebooks
/Users/admin/Projects/doggo


In [2]:
import numpy as np
from tqdm import tqdm
import json
import sys

from src.data.env_dog import get_env_actions, get_env_space, env_reset, get_happiness, env_step

from src.features.rl_tools import init_Q, get_epsilon, get_alpha, epsilon_greedy
from src.features.rl_tools import init_steps_per_state, update_steps_per_state
from src.features.rl_tools_during_training import get_default_checking, launch_checking, save_models, display_analyse_Q

from src.models.value_based.sarsa.rl_tools_sarsa import update_Q_SARSA

from src.utils.plots import get_info_params, define_info_params_dict
from src.visualization.rl_plots_evolution import plot_evolution_reward, plot_evolution_steps
from src.visualization.plot_happiness import plot_evolution_happiness

In [3]:
method_id = 'sarsa_1'

In [4]:
# Parametrisation
params = json.loads(open('src/models/value_based/sarsa/{}.json'.format(method_id)).read())

info_params_dict = define_info_params_dict(params, method_id)
info_params = get_info_params(info_params_dict)

In [5]:
info_params

'method: SARSA - method_id: sarsa_1 - n_episodes: 0/1000 -  \n gamma: 0.95 - nmax_steps: 1440 - init_Q_type: optimum -  \n start_at_random: False - init_epsilon: 0.1 - decay_epsilon: fixed -  \n min_epsilon: 0.1 - init_alpha: 0.5 - decay_alpha: fixed -  \n min_alpha: 0.5 - KLdiv_convergence: 0.0001'

In [6]:
# Initializing environment
action_dict, n_actions = get_env_actions()
_, n_states = get_env_space()

# Initializing the Q-matrix
Q = init_Q(n_actions, params)
Q_saved = Q.copy()

steps_per_state = init_steps_per_state()

# Visualisation
n_episodes_save = int(np.ceil(params['n_episodes'] / params['update_episode_division']))

# Initializing the reward
evo_training = {
    'evo_avg_reward_per_step': []
    , 'evo_min_reward_per_step': []
    , 'evo_max_reward_per_step': []
    , 'evo_n_steps': []
    , 'evo_avg_happiness': []
    , 'evo_min_happiness': []
    , 'evo_max_happiness': []
    , 'checking': get_default_checking()
}

In [7]:
# Training

# Starting the learning
pbar = tqdm(total = params['n_episodes'])
convergence, episode = False, 0

while (not convergence) & (episode < params['n_episodes']):

    info_params_dict['n_episodes'] = '{}/{}'.format(episode+1, params['n_episodes'])
    info_params = get_info_params(info_params_dict)

    evo_episode = {
        'n_episode_steps': 0
        , 'done': False
        , 'episode_sum_reward': 0
        , 'episode_sum_happiness': 0
        , 'episode_step_reward': []
        , 'episode_step_happiness': []
    }

    # Get episode

    state1 = env_reset(params['start_at_random'])
    evo_episode['episode_sum_happiness'] += get_happiness(state1)

    # Update parameters
    epsilon = get_epsilon(
        params_epsilon=params['epsilon'], episode=episode, steps_state=steps_per_state[state1['state_id']])
    evo_training['checking']['evo_epsilon'].append(epsilon)

    alpha = get_alpha(
        params_alpha=params['alpha'], episode=episode, steps_state=steps_per_state[state1['state_id']])
    evo_training['checking']['evo_alpha'].append(alpha)

    action1 = epsilon_greedy(Q, state1['state_id'], n_actions, epsilon)
    steps_per_state = update_steps_per_state(steps_per_state, state1['state_id'])

    while (not evo_episode['done']) and (evo_episode['n_episode_steps'] < params['nmax_steps']):

        # Getting the next state
        state2, reward1, evo_episode['done'], info = env_step(state1, action1)
        evo_episode['episode_sum_reward'] += reward1
        evo_episode['episode_step_reward'].append(reward1)
        evo_episode['episode_sum_happiness'] += get_happiness(state2)
        evo_episode['episode_step_happiness'].append(get_happiness(state2))

        # Update parameters
        epsilon = get_epsilon(
            params_epsilon=params['epsilon'], episode=episode, steps_state=steps_per_state[state2['state_id']])
        evo_training['checking']['evo_epsilon'].append(epsilon)

        # Choosing the next action
        action2 = epsilon_greedy(Q, state2['state_id'], n_actions, epsilon)
        steps_per_state = update_steps_per_state(steps_per_state, state2['state_id'])

        # Learning the Q-value
        alpha = get_alpha(
            params_alpha=params['alpha'], episode=episode, steps_state=steps_per_state[state1['state_id']])
        method_params = {'alpha': alpha, 'gamma': params['gamma']}
        Q = update_Q_SARSA(Q, state1['state_id'], action1, reward1, state2['state_id'], action2, method_params)

        # Updating the respective values
        state1 = state2
        action1 = action2
        evo_episode['n_episode_steps'] += 1

    # At the end of learning process
    if params['render_episode']:
        print('Episode {}, Avg Reward : {}, Timesteps: {}, Epsilon: {}, Alpha: {}'.format(
            episode + 1,
            evo_episode['episode_sum_reward'] / evo_episode['n_episode_steps'],
            evo_episode['n_episode_steps'], epsilon, alpha))

    evo_training['evo_avg_reward_per_step'].append(evo_episode['episode_sum_reward'] / evo_episode['n_episode_steps'])
    evo_training['evo_min_reward_per_step'].append(min(evo_episode['episode_step_reward']))
    evo_training['evo_max_reward_per_step'].append(max(evo_episode['episode_step_reward']))
    evo_training['evo_n_steps'].append(evo_episode['n_episode_steps'])
    evo_training['evo_avg_happiness'].append(evo_episode['episode_sum_happiness'] / evo_episode['n_episode_steps'])
    evo_training['evo_min_happiness'].append(min(evo_episode['episode_step_happiness']))
    evo_training['evo_max_happiness'].append(max(evo_episode['episode_step_happiness']))

    if (episode + 1) % n_episodes_save == 0:

        stop
        evo_training['checking'], Q_saved = launch_checking(
            evo_training['checking'], Q_saved, Q, method_id, info_params, final=False)
        save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=False)

        plot_evolution_reward(
            evo_training['evo_avg_reward_per_step'],
            evo_training['evo_min_reward_per_step'], evo_training['evo_max_reward_per_step'],
            method_id, info_params, final=False)
        plot_evolution_steps(evo_training['evo_n_steps'], method_id, params['nmax_steps'], info_params, final=False)
        plot_evolution_happiness(
            evo_training['evo_avg_happiness'],
            evo_training['evo_min_happiness'], evo_training['evo_max_happiness'],
            method_id, info_params, final=False)
        
        convergence = define_training_convergence(evo_training['checking']['evo_KLdiv'][-1], params)
    
    episode += 1
    pbar.update(1)

pbar.close()

 10%|▉         | 99/1000 [00:01<00:09, 92.34it/s]

NameError: name 'stop' is not defined

In [None]:
def define_training_convergence(last_KLdiv, params):
    return last_KLdiv < params['KLdiv_convergence']

In [8]:
evo_training['checking']

{'evo_epsilon': [0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1,
  0.1

 10%|▉         | 99/1000 [00:20<00:09, 92.34it/s]

In [None]:
save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=True)
plot_evolution_reward(evo_training['evo_avg_reward_per_step'], method_id, info_params)
plot_evolution_steps(evo_training['evo_n_steps'], method_id, params['nmax_steps'], info_params)
plot_evolution_happiness(evo_training['evo_avg_happiness'], method_id, info_params)
display_analyse_Q(Q, method_id, info_params)

# Analysis

In [None]:
from tqdm import tqdm

pbar = tqdm(total=100)
for i in range(10):
    pbar.update(10)
pbar.close()