# Training for the simple pendulum inverted equilibrium

In theory this could be solved with a PID controller, but let's bring out the big guns to solve this problem.

## Environment configuration 

In [None]:
import tensorflow as tf

# Show the GPU devices that TensorFlow can see.
# NOTE(review): the PPO configuration later in this notebook selects the "torch"
# framework, so this TensorFlow-side check may not reflect what training uses — confirm.
print(tf.config.list_physical_devices('GPU'))

In [None]:
from scipy import interpolate
from function.Dynamics_modeling import *
from function.Euler_lagrange import *
from function.Render import *
from function.Catalog_gen import *

from function.ray_env_creator import *

from ray.rllib.algorithms.ppo import PPOConfig
import ray
from ray import tune,train
from ray.rllib.algorithms.ppo import PPO

import pprint

# Single-pendulum-specific setup.

# Initialization of the theoretical model

# Symbolic time variable.
t = sp.symbols("t")

# Number of coordinates handed to the symbol-matrix helper and to the environment config.
CoordNumb = 1

# Symbol matrix produced by the project helper; only column 0, rows 1-3 are read below.
Symb = Symbol_Matrix_g(CoordNumb,t)

# Going by the names, these are the angle and its first and second time derivatives
# — TODO confirm the row layout against Symbol_Matrix_g.
theta = Symb[1,0]
theta_d = Symb[2,0]
theta_dd = Symb[3,0]

# Symbolic physical parameters used in the Lagrangian below.
m, l, g = sp.symbols("m l g")

# Numeric values substituted for the symbolic parameters (presumably SI units — confirm).
L = 0.2
Substitution = {"g": 9.81, "l": L, "m": 0.1}

# Last time sample of the external-force table below.
Time_end = 14

#----------------External Forces--------------------

# Sample times and sample values of the external force. The values are multiplied
# by 0.0, so every sample (and therefore the spline) is zero.
F_ext_time = np.array([0, 2, 4, 6, 8, Time_end])
F_ext_Value = np.array([[0, 1, -1, 1, 1, -1]]) * 0.0  # Of shape (k, ...)

# Cubic-spline interpolation of the samples along axis 1.
# Not referenced again in the cells visible in this notebook.
F_ext_func = interpolate.CubicSpline(F_ext_time, F_ext_Value, axis=1)
# ---------------------------

# Not referenced again in the visible cells; presumably the initial state — TODO confirm.
Y0 = np.array([[2, 0]])  # Of shape (k, 2)

# Lagrangian of the system: a term quadratic in theta_d plus a cos(theta) gravity term.
L_System = m*l**2/2*theta_d**2+sp.cos(theta)*l*m*g

# Build the acceleration function from the Lagrangian; the second return value is discarded.
# NOTE(review): fluid_f=[-0.008] looks like a fluid-friction coefficient — confirm
# its meaning and sign convention in Lagrangian_to_Acc_func.
Acc_func,_ = Lagrangian_to_Acc_func(L_System, Symb, t, Substitution, fluid_f=[-0.008])

# Wrap the acceleration function with the project helper; the result is passed to
# the environment under the "dynamics_function_h" key.
Dynamics_system = Dynamics_f_extf(Acc_func)

# Configuration dictionary passed to MyFunctionEnv (as env_config for RLlib and
# directly when the environment is instantiated for the rollout).
# NOTE(review): "h" is presumably the integration time step and "target" the
# (angle, angular velocity) set-point — confirm against MyFunctionEnv.
EnvConfig = {
    "coord_numb": CoordNumb,
    "target":np.array([np.pi,0]),
    "dynamics_function_h":Dynamics_system,
    "h":0.02
}

# Alternative Ray start-up calls with explicit GPU/CPU counts, left disabled.
#ray.shutdown()
#ray.init(num_gpus=2,num_cpus=1,dashboard_host="0.0.0.0")
#ray.init(num_gpus=3,num_cpus=0,dashboard_host="0.0.0.0")

## Hyperparameter tuning

Now we can train our algorithm.

Now we can do more training for our policy

In [None]:
from ray.rllib.algorithms.algorithm import Algorithm
# Start from a clean Ray runtime: stop whatever is running, then launch a new one.
ray.shutdown()
ray.init()

# Restore a previously saved algorithm from disk.
# NOTE(review): a later cell assigns `my_new_ppo = config.build()`, which replaces
# the algorithm restored here if both cells are run in order.
my_new_ppo = Algorithm.from_checkpoint(
    "/home/eymeric/ray_checkpoints/checkpoint_exp2_0003"
)

In [None]:
# Assemble the PPO configuration one stage at a time; every setter returns the
# configuration object, so rebinding `config` mirrors a fluent chain.
config = PPOConfig()

# Environment class and the config dictionary it receives.
config = config.environment(env=MyFunctionEnv, env_config=EnvConfig)

# Deep-learning backend.
config = config.framework("torch")

# Parallel environment rollouts and the resources given to each runner.
config = config.env_runners(
    num_env_runners=10,
    num_cpus_per_env_runner=1,
    num_gpus_per_env_runner=1 / 16,
)

# Optimisation hyperparameters.
config = config.training(lr=0.0001, gamma=0.99, entropy_coeff=0.0005)

# Instantiate the algorithm. This rebinds `my_new_ppo`, replacing any algorithm
# previously stored under that name.
my_new_ppo = config.build()

In [None]:
# Continue training
expname = "checkpoint_exp2_0004"
checkpoint_root = "/home/eymeric/ray_checkpoints/"


def _save_and_announce(algorithm, directory):
    """Save `algorithm` into `directory` and print where the checkpoint landed.

    Returns the object returned by `algorithm.save` together with the
    checkpoint path read from it.
    """
    outcome = algorithm.save(directory)
    location = outcome.checkpoint.path
    print(
        "An Algorithm checkpoint has been created inside directory: "
        f"'{location}'."
    )
    return outcome, location


for i in range(40):
    results = my_new_ppo.train()
    print(f"Iter: {i}; avg. return={results['env_runners']['episode_return_mean']}")

    # Intermediate checkpoint after every fifth iteration (i = 4, 9, 14, ...).
    if (i + 1) % 5 == 0:
        save_result, path_to_checkpoint = _save_and_announce(
            my_new_ppo, checkpoint_root + expname + "_" + str(i)
        )

# Final checkpoint under the bare experiment name.
save_result, path_to_checkpoint = _save_and_announce(
    my_new_ppo, checkpoint_root + expname
)

In [None]:
%matplotlib widget
from ray.rllib.algorithms.algorithm import Algorithm
import matplotlib.pyplot as plt
import numpy as np

# Roll out one episode with the restored policy, rendering every step.
Environment = MyFunctionEnv(EnvConfig)

# NOTE(review): this is a hard-coded temporary directory; it must exist on the
# machine running the notebook. The training cell saves its checkpoints
# elsewhere — confirm this is the checkpoint you intend to load.
my_new_ppo = Algorithm.from_checkpoint("/tmp/tmpoonj8141")

# `stop` receives the third value returned by step() and `truncated` the fourth.
# The episode is over as soon as either one is set; looping on `stop` alone
# would never finish an episode that ends by truncation.
stop = False
truncated = False

while not (stop or truncated):

    # NOTE(review): the policy is fed Environment.state (read before the first
    # step without an explicit reset) rather than the value returned by step()
    # — confirm both are the observation the policy was trained on.
    action = my_new_ppo.compute_single_action(Environment.state)

    state, reward, stop, truncated, _ = Environment.step(action)

    print(state, reward, action, stop, truncated)

    Environment.render()

In [None]:
# Shut down the Ray runtime now that training and evaluation are finished.
ray.shutdown()