In [3]:
import optuna

import numpy as np

from algorithms import DoubleDQN
from environment import CreditPayerEnv
from pipeline import MetricsStudy

In [4]:
def eval_trajectory(env: CreditPayerEnv, agent: DoubleDQN, trajectory_max_len: int):
    """Roll out a single episode and return the sum of rewards collected.

    The environment is reset, then the agent is asked for an action at every
    step until either ``trajectory_max_len`` steps have been taken or the
    environment reports that the episode is done.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``; ``step``
            must return a 4-tuple ``(state, reward, done, info)``.
        agent: Agent exposing ``get_action(state)``.
        trajectory_max_len: Upper bound on the number of steps in the episode.

    Returns:
        The accumulated reward (``0`` if no step was taken).
    """
    observation = env.reset()
    episode_return = 0

    for _ in range(trajectory_max_len):
        chosen_action = agent.get_action(observation)
        # The fourth element of the step result is not needed here.
        observation, step_reward, finished, _ = env.step(chosen_action)
        episode_return += step_reward

        if finished:
            break

    return episode_return

In [5]:
def eval_model(
    env: CreditPayerEnv, agent: DoubleDQN, trajectory_max_len: int, repeat_count: int
):
    """Average the total reward of several independent evaluation episodes.

    Args:
        env: Environment passed through to ``eval_trajectory``.
        agent: Agent passed through to ``eval_trajectory``.
        trajectory_max_len: Step limit for each evaluation episode.
        repeat_count: Number of episodes to run and average over.

    Returns:
        ``np.mean`` of the per-episode total rewards.
    """
    episode_returns = [
        eval_trajectory(env, agent, trajectory_max_len) for _ in range(repeat_count)
    ]
    return np.mean(episode_returns)

In [6]:
def double_objective(trial: optuna.Trial):
    """Optuna objective: train a DoubleDQN with sampled hyperparameters and score it.

    A fresh ``CreditPayerEnv`` and ``DoubleDQN`` are created for every trial.
    ``gamma``, ``tau`` and ``batch_size`` are sampled from ``trial``; the agent
    is trained through ``MetricsStudy.study_agent`` and then evaluated with
    ``eval_model`` over 10 trajectories.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean evaluation reward returned by ``eval_model``.
    """
    env = CreditPayerEnv()
    state_dim = env.state_dim
    action_n = env.action_n

    # Training budget. Both the epsilon schedule and the training call below
    # are derived from these two values, so they must be changed only here.
    episode_n = 5
    trajectory_max_len = 1

    # Per-step decrement equal to 1 / (total training steps).
    # NOTE(review): presumably epsilon starts at 1 and reaches 0 at the end of
    # training -- confirm against DoubleDQN.
    epsilon_decrease = 1 / episode_n / trajectory_max_len

    # Search space.
    gamma = trial.suggest_float("gamma", 0.9, 1)
    tau = trial.suggest_float("tau", 0.001, 0.999)
    batch_size = trial.suggest_int("batch_size", 60, 70)

    agent = DoubleDQN(
        state_dim,
        action_n,
        gamma=gamma,
        lr=0.001,
        tau=tau,
        batch_size=batch_size,
        epsilon_decrease=epsilon_decrease,
        epsilon_min=0,
    )
    study = MetricsStudy(env, agent, trajectory_max_len)

    # Use the same episode count that epsilon_decrease was computed from
    # (previously a separate hard-coded literal that could drift out of sync).
    study.study_agent(episode_n=episode_n)

    # Zero out epsilon before evaluation.
    # NOTE(review): presumably this makes the agent act greedily -- confirm
    # against DoubleDQN.get_action.
    agent.epsilon_min = 0
    agent.epsilon = 0

    return eval_model(env, agent, trajectory_max_len, repeat_count=10)

In [7]:
# Search for the hyperparameters that maximise the objective's return value.
double_study = optuna.create_study(direction="maximize")
double_study.optimize(double_objective, n_trials=5)

# Keep the best trial and report its score and the parameters it sampled.
trial = double_study.best_trial

print(
    "Reward: {}".format(trial.value),
    "Best hyperparameters: {}".format(trial.params),
    sep="\n",
)

[I 2024-01-02 22:12:21,005] A new study created in memory with name: no-name-28e4442f-c895-4d49-8298-615ecd56f320
[I 2024-01-02 22:12:21,491] Trial 0 finished with value: -10.0 and parameters: {'gamma': 0.9441913664249364, 'tau': 0.7332671022238211, 'batch_size': 68}. Best is trial 0 with value: -10.0.
[I 2024-01-02 22:12:21,511] Trial 1 finished with value: -10.0 and parameters: {'gamma': 0.9942015089129072, 'tau': 0.7978250947605702, 'batch_size': 64}. Best is trial 0 with value: -10.0.
[I 2024-01-02 22:12:21,527] Trial 2 finished with value: -10.0 and parameters: {'gamma': 0.9399746490699319, 'tau': 0.980756365349372, 'batch_size': 67}. Best is trial 0 with value: -10.0.
[I 2024-01-02 22:12:21,545] Trial 3 finished with value: -10.0 and parameters: {'gamma': 0.99677567237019, 'tau': 0.6321382125845849, 'batch_size': 68}. Best is trial 0 with value: -10.0.
[I 2024-01-02 22:12:21,559] Trial 4 finished with value: 1.0 and parameters: {'gamma': 0.9025580070286394, 'tau': 0.3229384824470

Reward: 1.0
Best hyperparameters: {'gamma': 0.9025580070286394, 'tau': 0.32293848244704854, 'batch_size': 65}


In [8]:
# Render Optuna's optimization-history plot for the trials of double_study.
# Left as the cell's last bare expression so the notebook displays the result.
optuna.visualization.plot_optimization_history(double_study)

In [9]:
# Render Optuna's slice plot for the hyperparameters sampled in double_study.
# Left as the cell's last bare expression so the notebook displays the result.
optuna.visualization.plot_slice(double_study)