# Training for the Simple pendulum inverse equilibrium

In theory it could be solved by PID, but let's use big weapon to solve these problem

## Environment configuration 

In [5]:
import tensorflow as tf

print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:5', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:6', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:7', device_type='GPU')]


In [6]:
from scipy import interpolate
from function.Dynamics_modeling import *
from function.Euler_lagrange import *
from function.Render import *
from function.Catalog_gen import *

from function.ray_env_creator import *

from ray.rllib.algorithms.ppo import PPOConfig
import ray
from ray import tune,train
from ray.rllib.algorithms.ppo import PPO

import pprint

# Single pendulum exclusive.....

# Initialisation du modèle théorique

t = sp.symbols("t")

CoordNumb = 1

Symb = Symbol_Matrix_g(CoordNumb,t)

theta = Symb[1,0]
theta_d = Symb[2,0]
theta_dd = Symb[3,0]

m, l, g = sp.symbols("m l g")

L = 0.2
Substitution = {"g": 9.81, "l": L, "m": 0.1}

Time_end = 14

#----------------External Forces--------------------

F_ext_time = np.array([0, 2, 4, 6, 8, Time_end])
F_ext_Value = np.array([[0, 1, -1, 1, 1, -1]]) * 0.0  # De la forme (k,...)

F_ext_func = interpolate.CubicSpline(F_ext_time, F_ext_Value, axis=1)
# ---------------------------

Y0 = np.array([[2, 0]])  # De la forme (k,2)

L_System = m*l**2/2*theta_d**2+sp.cos(theta)*l*m*g

Acc_func,_ = Lagrangian_to_Acc_func(L_System, Symb, t, Substitution, fluid_f=[-0.008])

Dynamics_system = Dynamics_f_extf(Acc_func)

EnvConfig = {
    "coord_numb": CoordNumb,
    "target":np.array([np.pi,0]),
    "dynamics_function_h":Dynamics_system,
    "h":0.02
}

ray.shutdown()
#ray.init(num_gpus=2,num_cpus=1,dashboard_host="0.0.0.0")
ray.init(num_gpus=3,num_cpus=0,dashboard_host="0.0.0.0")

2024-07-08 11:20:10,026	INFO worker.py:1762 -- Started a local Ray instance. View the dashboard at [1m[32m10.240.77.20:8265 [39m[22m


0,1
Python version:,3.10.12
Ray version:,2.31.0
Dashboard:,http://10.240.77.20:8265


## Hyperparameter tuning

Know we can do the training for our algorithm

In [9]:
config = (
    PPOConfig().environment(
        # Env class to use (here: our gym.Env sub-class from above).
        env=MyFunctionEnv,
        env_config=EnvConfig,
    )
    .framework("tensorflow")
    # Parallelize environment rollouts.
    .env_runners(num_env_runners=10,num_cpus_per_env_runner=1, num_gpus_per_env_runner=1/20)
    .training(lr=tune.grid_search([0.001, 0.0001, 0.0005]),gamma=0.9,entropy_coeff=tune.grid_search([0.0001, 0.005, 0.0005]))
)

tuner = tune.Tuner(
    "PPO",
    run_config=train.RunConfig(
        stop={"training_iteration": 8},
    ),
    param_space=config,
)

results = tuner.fit()
# Get the best result based on a particular metric.
best_result = results.get_best_result(
    metric="env_runners/episode_return_mean", mode="max"
)

# Get the best checkpoint corresponding to the best result.
best_checkpoint = best_result.checkpoint

pprint.pp(best_result)
pprint.pp(best_checkpoint)

0,1
Current time:,2024-07-08 11:34:34
Running for:,00:09:39.91
Memory:,15.4/251.7 GiB

Trial name,status,loc,entropy_coeff,lr
PPO_MyFunctionEnv_43268_00000,PENDING,,0.0001,0.001
PPO_MyFunctionEnv_43268_00001,PENDING,,0.005,0.001
PPO_MyFunctionEnv_43268_00002,PENDING,,0.0005,0.001
PPO_MyFunctionEnv_43268_00003,PENDING,,0.0001,0.0001
PPO_MyFunctionEnv_43268_00004,PENDING,,0.005,0.0001
PPO_MyFunctionEnv_43268_00005,PENDING,,0.0005,0.0001
PPO_MyFunctionEnv_43268_00006,PENDING,,0.0001,0.0005
PPO_MyFunctionEnv_43268_00007,PENDING,,0.005,0.0005
PPO_MyFunctionEnv_43268_00008,PENDING,,0.0005,0.0005




Now we can do more training for our policy

In [39]:
from ray.rllib.algorithms.algorithm import Algorithm
ray.shutdown()
ray.init()

my_new_ppo = Algorithm.from_checkpoint("/home/eymeric/ray_checkpoints/checkpoint_exp2_0003")

2024-07-05 15:42:55,960	INFO worker.py:1762 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


In [44]:
config = (
    PPOConfig().environment(
        # Env class to use (here: our gym.Env sub-class from above).
        env=MyFunctionEnv,
        env_config=EnvConfig,
    )
    .framework("torch")
    # Parallelize environment rollouts.
    .env_runners(num_env_runners=10,num_cpus_per_env_runner=1, num_gpus_per_env_runner=1 / 16)
    .training(lr=0.0001,gamma=0.99,entropy_coeff=0.0005)
)
my_new_ppo = config.build()

2024-07-05 15:46:37,479	INFO worker.py:1762 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2024-07-05 15:46:48,708	INFO trainable.py:161 -- Trainable.setup took 13.623 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [46]:
# Continue training
expname = "checkpoint_exp2_0004"

for i in range(40):
    results = my_new_ppo.train()
    print(f"Iter: {i}; avg. return={results['env_runners']['episode_return_mean']}")

    if i%5 == 4:
        save_result = my_new_ppo.save("/home/eymeric/ray_checkpoints/"+expname+"_"+str(i))
        path_to_checkpoint = save_result.checkpoint.path
        print(
            "An Algorithm checkpoint has been created inside directory: "
            f"'{path_to_checkpoint}'."
        )
    

save_result = my_new_ppo.save("/home/eymeric/ray_checkpoints/"+expname)
path_to_checkpoint = save_result.checkpoint.path
print(
    "An Algorithm checkpoint has been created inside directory: "
    f"'{path_to_checkpoint}'."
)

Iter: 0; avg. return=-85491.3896330545
Iter: 1; avg. return=-84836.84465217346
Iter: 2; avg. return=-83923.36304463653
Iter: 3; avg. return=-83722.67444430316
Iter: 4; avg. return=-83430.8594234107
An Algorithm checkpoint has been created inside directory: '/home/eymeric/ray_checkpoints/checkpoint_exp2_0003_4'.
Iter: 5; avg. return=-83059.80211288128
Iter: 6; avg. return=-83018.01161476994
Iter: 7; avg. return=-82959.55318097994
Iter: 8; avg. return=-82657.10099445778
Iter: 9; avg. return=-82578.95456013262
An Algorithm checkpoint has been created inside directory: '/home/eymeric/ray_checkpoints/checkpoint_exp2_0003_9'.
Iter: 10; avg. return=-82606.43244305672
Iter: 11; avg. return=-82740.58396031294
Iter: 12; avg. return=-82908.7623185928
Iter: 13; avg. return=-82904.55734049258
Iter: 14; avg. return=-82500.39605127934
An Algorithm checkpoint has been created inside directory: '/home/eymeric/ray_checkpoints/checkpoint_exp2_0003_14'.
Iter: 15; avg. return=-82290.23416089211
Iter: 16; a

In [36]:
%matplotlib widget
from ray.rllib.algorithms.algorithm import Algorithm
import matplotlib.pyplot as plt
import numpy as np

stop = False
Environment = MyFunctionEnv(EnvConfig)

my_new_ppo = Algorithm.from_checkpoint("/tmp/tmpoonj8141")


while not stop:

    action = my_new_ppo.compute_single_action(Environment.state)

    state, reward, stop, truncated,_ = Environment.step(action)

    print(state, reward, action, stop, truncated)

    Environment.render()

ValueError: Given checkpoint (/tmp/tmpoonj8141) not found! Must be a checkpoint directory (or a file for older checkpoint versions).

In [47]:
ray.shutdown()