In [32]:
# Installs the necessary Python and system libraries
try:
    from easypip import easyimport, easyinstall, is_notebook
except ModuleNotFoundError as e:
    get_ipython().run_line_magic("pip", "install easypip")
    from easypip import easyimport, easyinstall, is_notebook

easyinstall("bbrl>=0.2.2")
easyinstall("swig")
easyinstall("bbrl_gymnasium>=0.2.0")
easyinstall("bbrl_gymnasium[box2d]")
easyinstall("bbrl_gymnasium[classic_control]")
easyinstall("tensorboard")
easyinstall("moviepy")
easyinstall("box2d-kengz")

In [33]:
import os
import sys
from pathlib import Path
import math

from moviepy.editor import ipython_display as video_display
import time
from tqdm.auto import tqdm
from typing import Tuple, Optional
from functools import partial

from omegaconf import OmegaConf
import torch
import bbrl_gymnasium
import tensorflow as tf


import copy
from abc import abstractmethod, ABC
import torch.nn as nn
import torch.nn.functional as F
from time import strftime
OmegaConf.register_new_resolver(
    "current_time", lambda: strftime("%Y%m%d-%H%M%S"), replace=True
)
from bbrl.agents.gymnasium import GymAgent, ParallelGymAgent, make_env, record_video
from gymnasium import logger, spaces
from gymnasium.wrappers import TimeLimit
from env import *

In [36]:
from torch.optim import SGD, Adam
# Initialisation
train_env = gymnasium.make('CartpoleEnvCacla')
train_env = TimeLimit(train_env, max_episode_steps=500)
eval_env = gymnasium.make('CartpoleEnvCacla')
eval_env = TimeLimit(eval_env, max_episode_steps=500)

In [None]:
import numpy as np
import time
from cacla_keras import Cacla_Keras
from datetime import datetime
import pickle
from keras import backend as K

tf.debugging.set_log_device_placement(True)



def test(eval_env, model, n_test = 10, noise_std = 0.3):
    cum_reward = 0
    with torch.no_grad():
        for _ in range(n_test):
            done = False
            truncated = False
            obs0,_ = eval_env.reset()
            while not done and not truncated:
                a0 = model.actor.predict(np.array([obs0]),verbose=0)
                obs0, reward, done,truncated,_ = eval_env.step(a0[0]) 
                cum_reward += reward
    return cum_reward/n_test

def addnoise(x, std):
    return x + np.random.normal(0,std)
    
def train(train_env, eval_env, model, step_max=102400, eval_step=1024,noise_std=0.3):
    observation0 ,_ = train_env.reset()
    count = 0
    scores = []
    for it in range(step_max+1):
        # get current value of value function for observation0
        V0 = model.critic.predict(np.array([observation0]),verbose=0)
        # predict default action
        A0 = model.actor.predict(np.array([observation0]),verbose=0)
        # sample new explored action
        a0 = model.sample(A0[0], model.exploration_factor)

        observation1, reward, done, truncated, info = train_env.step(a0)

        #get current value of value function for observation1 and compute delta.
        V1 = model.critic.predict(np.array([observation1]),verbose=0)
        delta = reward + model.gamma * V1 - V0

        # fit critic
        model.critic.fit(np.array([observation0]), [reward + model.gamma * V1], batch_size=1, verbose=0)

        if delta > 0:
            # if delta is positive, fit actor
            model.actor.fit(np.array([observation0]), [a0], batch_size=1, verbose=0)
            observation0 = observation1
        if done or truncated: obs,_ = train_env.reset()
        if it % (100*(1<<count)) == 0:
            #if it % step_eval == 0:
            count += 1
            perf = test(eval_env, model, 3 , noise_std)
            scores.append((it, perf))
            print(f'{it = } | reward {perf}')

cacla = Cacla_Keras(
    input_dim = 4,
    output_dim=1,
    alpha=0.01,
    beta=0.01,
    gamma=0.9,
    exploration_factor=0.3,
    
)
train(train_env, eval_env, cacla)

it = 0 | reward 65.33333333333333
it = 200 | reward 33.0
it = 400 | reward 12.666666666666666
it = 800 | reward 10.666666666666666
it = 1600 | reward 42.333333333333336
it = 3200 | reward 27.666666666666668
it = 6400 | reward 20.0
it = 12800 | reward 27.0
