In [1]:
import os
print(os.getcwd())
def update_working_directory():
    from pathlib import Path
    p = Path(os.getcwd()).parents[0]
    os.chdir(p)
    print(p)
update_working_directory()

/Users/admin/Projects/doggo/notebooks
/Users/admin/Projects/doggo


In [2]:
import numpy as np
from tqdm import tqdm
import json
import sys

from src.data.env_dog import get_env_actions, get_env_space, env_reset, get_happiness, env_step

from src.features.rl_tools import init_Q, get_epsilon, get_alpha, epsilon_greedy
from src.features.rl_tools import init_steps_per_state, update_steps_per_state
from src.features.rl_tools_during_training import get_default_checking, launch_checking, save_models, display_analyse_Q

from src.models.value_based.sarsa.rl_tools_sarsa import update_Q_SARSA

from src.utils.plots import get_info_params
from src.visualization.rl_plots_evolution import plot_evolution_reward, plot_evolution_steps
from src.visualization.plot_happiness import plot_evolution_happiness

In [3]:
method_id = 'sarsa_1'

In [4]:
# Parametrisation
params = json.loads(open('src/models/value_based/sarsa/{}.json'.format(method_id)).read())

info_params_dict = {
    'method': params['method']
    , 'method_id': method_id
    , 'n_episodes': 0
    , 'gamma': params['gamma']
    , 'nmax_steps': params['nmax_steps']
    , 'init_Q_type': params['init_Q_type']
    , 'start_at_random': params['start_at_random']
    , 'init_epsilon': params['epsilon']['init_epsilon']
    , 'decay_epsilon': params['epsilon']['decay_epsilon']
    , 'min_epsilon': params['epsilon']['min_epsilon']
    , 'init_alpha': params['alpha']['init_alpha']
    , 'decay_alpha': params['alpha']['decay_alpha']
    , 'min_alpha': params['alpha']['min_alpha']
}
info_params = get_info_params(info_params_dict)


In [5]:
info_params

'method: SARSA - method_id: sarsa_1 - n_episodes: 0 -  \n gamma: 0.95 - nmax_steps: 1440 - init_Q_type: optimum -  \n start_at_random: False - init_epsilon: 0.5 - decay_epsilon: fixed -  \n min_epsilon: 0.1 - init_alpha: 1 - decay_alpha: per_episode -  \n min_alpha: 0.5'

In [6]:
# Initializing environment
action_dict, n_actions = get_env_actions()
_, n_states = get_env_space()

# Initializing the Q-matrix
Q = init_Q(n_actions, params)
Q_saved = Q.copy()

steps_per_state = init_steps_per_state()

# Visualisation
n_episodes_save = int(np.ceil(params['n_episodes'] / params['update_episode_division']))

# Initializing the reward
evo_training = {
    'evo_avg_reward_per_step': []
    , 'evo_n_steps': []
    , 'evo_avg_happiness': []
    , 'checking': get_default_checking()
}


In [7]:
# Training

# Starting the learning
for episode in tqdm(range(params['n_episodes'])):

    info_params_dict['n_episodes'] += 1
    info_params = get_info_params(info_params_dict)

    evo_episode = {
        'n_episode_steps': 0
        , 'done': False
        , 'episode_sum_reward': 0
        , 'evolution_sum_happiness': 0
    }

    # Get episode

    state1 = env_reset(params['start_at_random'])
    evo_episode['evolution_sum_happiness'] += get_happiness(state1)

    # Update parameters
    epsilon = get_epsilon(
        params_epsilon=params['epsilon'], episode=episode, steps_state=steps_per_state[state1['state_id']])
    evo_training['checking']['evo_epsilon'].append(epsilon)

    alpha = get_alpha(
        params_alpha=params['alpha'], episode=episode, steps_state=steps_per_state[state1['state_id']])
    evo_training['checking']['evo_alpha'].append(alpha)

    action1 = epsilon_greedy(Q, state1['state_id'], n_actions, epsilon)
    steps_per_state = update_steps_per_state(steps_per_state, state1['state_id'])

    while (not evo_episode['done']) and (evo_episode['n_episode_steps'] < params['nmax_steps']):

        # Getting the next state
        state2, reward1, evo_episode['done'], info = env_step(state1, action1)
        evo_episode['episode_sum_reward'] += reward1
        evo_episode['evolution_sum_happiness'] += get_happiness(state2)
        evo_training['checking']['empirical_return'].append(reward1)
        evo_training['checking']['predicted_value'].append(Q[state1['state_id']][action1])

        # Update parameters
        epsilon = get_epsilon(
            params_epsilon=params['epsilon'], episode=episode, steps_state=steps_per_state[state2['state_id']])
        evo_training['checking']['evo_epsilon'].append(epsilon)

        # Choosing the next action
        action2 = epsilon_greedy(Q, state2['state_id'], n_actions, epsilon)
        steps_per_state = update_steps_per_state(steps_per_state, state2['state_id'])

        # Learning the Q-value
        alpha = get_alpha(
            params_alpha=params['alpha'], episode=episode, steps_state=steps_per_state[state1['state_id']])
        method_params = {'alpha': alpha, 'gamma': params['gamma']}
        Q = update_Q_SARSA(Q, state1['state_id'], action1, reward1, state2['state_id'], action2, method_params)

        # Updating the respective values
        state1 = state2
        action1 = action2
        evo_episode['n_episode_steps'] += 1

    # At the end of learning process
    if params['render_episode']:
        print('Episode {}, Avg Reward : {}, Timesteps: {}, Epsilon: {}, Alpha: {}'.format(
            episode + 1,
            evo_episode['episode_sum_reward'] / evo_episode['n_episode_steps'],
            evo_episode['n_episode_steps'], epsilon, alpha))

    evo_training['evo_avg_reward_per_step'].append(evo_episode['episode_sum_reward'] / evo_episode['n_episode_steps'])
    evo_training['evo_n_steps'].append(evo_episode['n_episode_steps'])
    evo_training['evo_avg_happiness'].append(evo_episode['evolution_sum_happiness'] / evo_episode['n_episode_steps'])

    if (episode + 1) % n_episodes_save == 0:

        evo_training['checking'], Q_saved = launch_checking(evo_training['checking'], Q_saved, Q, method_id, info_params)
        save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=False)

        plot_evolution_reward(evo_training['evo_avg_reward_per_step'], method_id, info_params)
        plot_evolution_steps(evo_training['evo_n_steps'], method_id, params['nmax_steps'], info_params)
        plot_evolution_happiness(evo_training['evo_avg_happiness'], method_id, info_params)
        # display_analyse_Q(Q, params['method'], info_params)


100%|██████████| 1000/1000 [02:27<00:00,  6.77it/s]


In [8]:
save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=True)
plot_evolution_reward(evo_training['evo_avg_reward_per_step'], method_id, info_params)
plot_evolution_steps(evo_training['evo_n_steps'], method_id, params['nmax_steps'], info_params)
plot_evolution_happiness(evo_training['evo_avg_happiness'], method_id, info_params)
display_analyse_Q(Q, method_id, info_params)

100%|██████████| 65275/65275 [11:25<00:00, 95.25it/s] 


In [10]:
import pandas as pd

analyse_Q = pd.DataFrame(columns=[
    'state_id', 'food_id', 'fat_id', 'affection_id', 'action_possible',
    'no_action', 'walking', 'feeding', 'playing',
])

for i in tqdm(Q.keys()):
    i_info = i.split('_')
    Q_info = Q[i]
    analyse_Q = analyse_Q.append({
        'state_id': i,
        'food_id': float(i_info[0]),
        'fat_id': float(i_info[1]),
        'affection_id': float(i_info[2]),
        'action_possible': i_info[3],
        'no_action': Q_info[0],
        'walking': Q_info[1],
        'feeding': Q_info[2],
        'playing': Q_info[3]
        }, ignore_index=True)


100%|██████████| 65275/65275 [12:43<00:00, 85.50it/s] 


In [11]:
analyse_Q

Unnamed: 0,state_id,food_id,fat_id,affection_id,action_possible,no_action,walking,feeding,playing
0,0.5000_0.0000_0.5000_True,0.50,0.00,0.50,True,11.620033,12.551467,12.331499,11.728204
1,0.4900_0.0000_0.4900_True,0.49,0.00,0.49,True,11.742386,11.461034,11.570977,10.867105
2,0.4800_0.0000_0.4800_False,0.48,0.00,0.48,False,11.817934,12.435004,11.899803,11.119914
3,0.4700_0.0000_0.4700_False,0.47,0.00,0.47,False,10.802944,10.973384,10.979741,11.431482
4,0.4600_0.0000_0.4600_False,0.46,0.00,0.46,False,11.452813,10.302360,10.603548,9.803892
...,...,...,...,...,...,...,...,...,...
65270,0.9800_0.9200_0.6500_False,0.98,0.92,0.65,False,20.000000,19.290000,20.000000,20.000000
65271,0.9700_0.9200_0.6400_False,0.97,0.92,0.64,False,20.000000,19.290000,20.000000,20.000000
65272,0.9600_0.9200_0.6300_False,0.96,0.92,0.63,False,20.000000,19.340000,20.000000,20.000000
65273,0.9500_0.8200_0.7000_True,0.95,0.82,0.70,True,20.000000,20.000000,19.540000,20.000000


In [12]:
import matplotlib.pyplot as plt
import statistics
from scipy import stats
import operator
import dill
import pandas as pd
from tqdm import tqdm
from src.utils.plots import COLORS
import numpy as np
from src.utils.maths import running_mean, get_avg_n_points


In [16]:
def display_state_characteristic(analyse_Q, state_characteristic, action_possible, method_id, info_params):

    dfgb = analyse_Q[analyse_Q['action_possible'] == action_possible].groupby(state_characteristic).agg({
        'no_action': 'mean',
        'walking': 'mean',
        'feeding': 'mean',
        'playing': 'mean'
    })
    dfgb.reset_index(inplace=True)

    fig = plt.figure()

    cnt = 0
    for action in ['no_action', 'walking', 'feeding', 'playing']:
        plt.plot(
            dfgb[state_characteristic], dfgb[action], label=action
            , marker='', color=COLORS[cnt], linewidth=1, alpha=0.75
        )
        cnt += 1

    plt.title('Avg Q-Value for state characteristic / action')
    plt.xlabel(state_characteristic + ' \n ' + info_params)
    plt.ylabel('Avg Q-Value')
    # plt.yscale('log')
    plt.xlim([-0.05, 1.05])
    plt.legend(bbox_to_anchor=(0.5, -0.10), loc="lower center",
               bbox_transform=fig.transFigure, ncol=4, fancybox=True, shadow=True, borderpad=1)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('data/figures/{}__Q_value_{}_{}.png'.format(method_id, state_characteristic, action_possible),
                format='png', dpi=1000, bbox_inches='tight')
    # plt.show()
    plt.close(fig)


In [17]:
for state_characteristic in ['food_id', 'fat_id', 'affection_id']:
    print(state_characteristic)
    display_state_characteristic(analyse_Q, state_characteristic, 'True', method_id, info_params)
    display_state_characteristic(analyse_Q, state_characteristic, 'False', method_id, info_params)

food_id
fat_id
affection_id


In [18]:
Q

defaultdict(<function src.features.rl_tools.init_Q.<locals>.get_default_Q_values()>,
            {'0.5000_0.0000_0.5000_True': array([11.62003297, 12.55146659, 12.33149939, 11.72820419]),
             '0.4900_0.0000_0.4900_True': array([11.7423858 , 11.46103396, 11.57097708, 10.86710543]),
             '0.4800_0.0000_0.4800_False': array([11.81793417, 12.43500439, 11.89980272, 11.11991389]),
             '0.4700_0.0000_0.4700_False': array([10.80294422, 10.97338393, 10.97974106, 11.43148152]),
             '0.4600_0.0000_0.4600_False': array([11.45281313, 10.30236011, 10.60354772,  9.8038924 ]),
             '0.4500_0.0000_0.4500_False': array([10.1937416 ,  9.93748608, 10.2958664 , 10.26639306]),
             '0.4400_0.0000_0.4400_False': array([10.47533707, 10.09362521, 10.31354967,  9.84374639]),
             '0.4300_0.0000_0.4300_False': array([10.75975259, 10.06924735, 10.24263353, 10.09446681]),
             '0.4200_0.0000_0.4200_False': array([11.02614613, 10.4960223 , 10.517710

In [19]:
def flatten(l):
    return [item for sublist in l for item in sublist]


def get_avg_Q(Q):

    q_Q = flatten(list(Q.values()))

    if len(q_Q) == 0:
        return 0
    else:
        return np.mean(q_Q)

In [20]:
get_avg_Q(Q)

19.7401728915422