# Training for the Simple pendulum inverse equilibrium

In theory it could be solved by PID, but let's use big weapon to solve these problem

## Environment configuration 

In [1]:
from scipy import interpolate
from function.Dynamics_modeling import *
from function.Euler_lagrange import *
from function.Render import *
from function.Catalog_gen import *

from function.ray_env_creator import *

from ray.rllib.algorithms.ppo import PPOConfig
import ray
from ray import tune,train
from ray.rllib.algorithms.ppo import PPO

import pprint

# Single pendulum exclusive.....

# Initialisation du modèle théorique

t = sp.symbols("t")

CoordNumb = 1

Symb = Symbol_Matrix_g(CoordNumb,t)

theta = Symb[1,0]
theta_d = Symb[2,0]
theta_dd = Symb[3,0]

m, l, g = sp.symbols("m l g")

L = 0.2
Substitution = {"g": 9.81, "l": L, "m": 0.1}

Time_end = 14

#----------------External Forces--------------------

F_ext_time = np.array([0, 2, 4, 6, 8, Time_end])
F_ext_Value = np.array([[0, 1, -1, 1, 1, -1]]) * 0.0  # De la forme (k,...)

F_ext_func = interpolate.CubicSpline(F_ext_time, F_ext_Value, axis=1)
# ---------------------------

Y0 = np.array([[2, 0]])  # De la forme (k,2)

L_System = m*l**2/2*theta_d**2+sp.cos(theta)*l*m*g

Acc_func,_ = Lagrangian_to_Acc_func(L_System, Symb, t, Substitution, fluid_f=[-0.02])

Dynamics_system = Dynamics_f_extf(Acc_func)

EnvConfig = {
    "coord_numb": CoordNumb,
    "target":np.array([np.pi,0]),
    "dynamics_function_h":Dynamics_system,
    "h":0.02
}

## Hyperparameter tuning

Know we can do the training for our algorithm

In [2]:
config = (
    PPOConfig().environment(
        # Env class to use (here: our gym.Env sub-class from above).
        env=MyFunctionEnv,
        env_config=EnvConfig,
    )
    .framework("torch")
    .env_runner(num_cpus_per_env_runner=1, num_gpus_per_env_runner=1 / 16)
    # Parallelize environment rollouts.
    .env_runners(num_env_runners=10)
    .training(lr=tune.grid_search([0.001, 0.0001, 0.0005]),gamma=0.9,entropy_coeff=tune.grid_search([0.001, 0.005, 0.0005]))
)

tuner = tune.Tuner(
    "PPO",
    run_config=train.RunConfig(
        stop={"training_iteration": 4},
    ),
    param_space=config,
)

results = tuner.fit()

TypeError: AlgorithmConfig.resources() got an unexpected keyword argument 'num_cpus_per_env_runner'

In [7]:
# Get the best result based on a particular metric.
best_result = results.get_best_result(
    metric="env_runners/episode_return_mean", mode="max"
)

# Get the best checkpoint corresponding to the best result.
best_checkpoint = best_result.checkpoint

pprint.pp(best_result)
pprint.pp(best_checkpoint)

Result(
  metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 1.5296250750941616, 'cur_kl_coeff': 0.4500000000000001, 'cur_lr': 0.0005000000000000002, 'total_loss': 9.751829777994464, 'policy_loss': -0.04646840618863221, 'vf_loss': 9.786792869978054, 'vf_explained_var': -0.2283534578097764, 'kl': 0.025567411180893562, 'entropy': 1.2679376895709704, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 4185.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 20000, 'num_env_steps_trained': 20000, 'num_agent_steps_sampled': 20000, 'num_agent_steps_trained': 20000}, 'env_runners': {'episode_reward_max': 565.9971998665721, 'episode_reward_min': -1739.6290388900309, 'episode_reward_mean': -497.16871318345716, 'episode_len_mean': 60.21, 'episode_media': {}, 'episodes_timesteps_total': 6021, 'policy_rew

Now we can do more training for our policy

In [8]:
from ray.rllib.algorithms.algorithm import Algorithm

my_new_ppo = Algorithm.from_checkpoint("/home/eymeric/ray_results/PPO_2024-07-04_12-14-50/PPO_MyFunctionEnv_9555f_00006_6_gamma=0.9000,lr=0.0005_2024-07-04_12-14-54/checkpoint_000000")

# Continue training
for i in range(40):
    results = my_new_ppo.train()
    print(f"Iter: {i}; avg. return={results['env_runners']['episode_return_mean']}")

save_result = my_new_ppo.save()
path_to_checkpoint = save_result.checkpoint.path
print(
    "An Algorithm checkpoint has been created inside directory: "
    f"'{path_to_checkpoint}'."
)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2024-07-04 12:31:32,582	INFO trainable.py:161 -- Trainable.setup took 10.290 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Iter: 0; avg. return=-296.9169161957483
Iter: 1; avg. return=-283.9138503574072
Iter: 2; avg. return=-230.25194964932336
Iter: 3; avg. return=-154.0041364411379
Iter: 4; avg. return=-100.64838800039341
Iter: 5; avg. return=-85.47237617110115
Iter: 6; avg. return=-56.08924529654882
Iter: 7; avg. return=-55.48592151975458
Iter: 8; avg. return=-44.75891582197993
Iter: 9; avg. return=-16.918772040000707
Iter: 10; avg. return=-14.441446887020623
Iter: 11; avg. return=-25.544092289124237
Iter: 12; avg. return=-30.356257978641743
Iter: 13; avg. return=24.392487430393775
Iter: 14; avg. return=30.98715258886906
Iter: 15; avg. return=8.9407188190476
Iter: 16; avg. return=36.29991166643517
Iter: 17; avg. return=-6.2262621219443695
Iter: 18; avg. return=0.22359168473684435
Iter: 19; avg. return=20.612284011646256
Iter: 20; avg. return=59.62594613783131
Iter: 21; avg. return=48.49687209477452
Iter: 22; avg. return=55.60378172614796
Iter: 23; avg. return=55.982820766046395
Iter: 24; avg. return=68.0

In [None]:
%matplotlib widget
from ray.rllib.algorithms.algorithm import Algorithm
import matplotlib.pyplot as plt
import numpy as np

stop = False
Environment = MyFunctionEnv(EnvConfig)

my_new_ppo = Algorithm.from_checkpoint("/tmp/tmpoonj8141")


while not stop:

    action = my_new_ppo.compute_single_action(Environment.state)

    state, reward, stop, truncated,_ = Environment.step(action)

    print(state, reward, action, stop, truncated)

    Environment.render()