In [1]:
# Make the project checkout the working directory so that relative paths used
# later in the notebook (e.g. the pickled weight files) resolve against it.
# NOTE(review): this is a hard-coded absolute path for one machine — adjust it when running elsewhere.
import os
os.chdir("/teamspace/studios/this_studio/rubikscubesolver")

In [2]:
from tqdm import tqdm
import pickle

import wandb  # for logging
import time
from dataclasses import dataclass

import jax
import jax.numpy as jnp
import flax.nnx as nnx

import rubiktransformer.dataset as dataset
from rubiktransformer.trainer import reshape_sample

from rubiktransformer.trainer_online import init_model_optimizer, init_buffer, train_step_transformer_rf, training_loop
from rubiktransformer.online_training_utils import run_n_steps, reshape_diffusion_setup

cuda_plugin_extension is not found.


In [3]:
@dataclass
class Config:
    """Hyper-parameters and run settings shared by the notebook cells."""

    # Root PRNG key; cells split it and write the carry key back to this attribute.
    jax_key: jnp.ndarray = jax.random.PRNGKey(49)
    # No annotation, so the dataclass does not treat this as a field: it is a plain
    # class attribute, created once at class definition and shared by all instances.
    rngs = nnx.Rngs(48)
    # Number of sequences per batch (leading dimension of the sampled noise).
    batch_size: int = 128
    # Learning rates; presumably one per optimizer built in init_model_optimizer — TODO confirm.
    lr_1: float = 4e-4
    lr_2: float = 4e-4
    # Number of sequences requested from dataset.fast_gathering_data_diffusion.
    nb_games: int = 128 * 100
    # Length of one sequence; the first len_seq // 4 steps are treated as the "past".
    len_seq: int = 32
    # The settings below are not read by the cells visible in this notebook,
    # except log_every_step (training-metric logging period in improve_training_loop).
    nb_step: int = 1000000
    max_length_buffer: int = 1024 * 10
    log_every_step: int = 10
    log_eval_every_step: int = 10
    log_policy_reward_every_step: int = 10
    add_data_every_step: int = 500

    save_model_every_step: int = 2000


# Instantiate the run settings with their default values.
config = Config()

# Weights & Biases run identity: account, project and a timestamped run name.
# NOTE(review): the account name is hard-coded.
user, project = "forbu14", "RubikTransformer"
display_name = f"experiment_{time.strftime('%Y%m%d-%H%M%S')}"

wandb.init(name=display_name, project=project, entity=user)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mforbu14[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:

# Build the two networks together with their optimizers and metric trackers.
(
    optimizer_diffuser,
    optimizer_inverse,
    metrics_train,
    metrics_eval,
    metrics_inverse,
    transformer,
    inverse_rl_model,
) = init_model_optimizer(config)

# Environment, train/eval buffers with their companion buffer_list objects, and the step function
# (roles inferred from the names unpacked here — see rubiktransformer.trainer_online).
env, buffer, buffer_eval, buffer_list, buffer_list_eval, jit_step = init_buffer(
    config
)

# Batched helpers: env.reset is jitted then vmapped over its argument;
# run_n_steps is vmapped over its first two arguments while the third is shared.
vmap_reset = jax.vmap(jax.jit(env.reset))
vmap_step = jax.vmap(run_n_steps, in_axes=(0, 0, None))

##### TRAINING #####
# Split off a sub-key for the next random operation and keep the carry key on the config.
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key


In [5]:

# Fill the training buffer with config.nb_games sequences of length config.len_seq.
# NOTE(review): presumably these are random rollouts — confirm in rubiktransformer.dataset.
buffer, buffer_list = dataset.fast_gathering_data_diffusion(
    env,
    vmap_reset,
    vmap_step,
    config.nb_games * 1,  # previously int(config.nb_games * 10.0)
    config.len_seq,
    buffer,
    buffer_list,
    subkey,
)


In [6]:
# Restore pretrained weights for both networks from pickled nnx states.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# checkpoint files that come from a trusted source.


def _read_pickled_state(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as input_file:
        return pickle.load(input_file)


# (model, checkpoint file) pairs: the transformer (used as the decision diffuser
# in later cells) first, then the inverse RL model.
for _model, filename in (
    (transformer, "state_ddt_model_improved_v2.pickle"),
    (inverse_rl_model, "state_inverse_rl_model_improved_v2.pickle"),
):
    state = _read_pickled_state(filename)
    nnx.update(_model, state)

In [7]:
# Draw one batch from the training buffer and convert it to the diffusion layout.
# NOTE(review): subkey was already passed to the data gathering call, and
# reshape_diffusion_setup is called here without the key that later cells pass — confirm both are intended.
sample = buffer.sample(buffer_list, subkey)
sample = reshape_diffusion_setup(sample)


In [8]:
# List the entries available in the reshaped sample.
sample.keys()

dict_keys(['action', 'reward', 'state_histo', 'time_step', 'context', 'state_past', 'state_future', 'state_future_noise', 'action_inverse', 'state_histo_inverse_t', 'state_histo_inverse_td1'])

In [9]:
# Inspect the "action_inverse" entry of the sample.
sample["action_inverse"]

Array([[0., 0., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]], dtype=float32)

In [10]:
# Inspect the "state_histo_inverse_t" entry (first input of the inverse RL model below).
sample["state_histo_inverse_t"]

Array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
# Inspect the "state_histo_inverse_td1" entry (second input of the inverse RL model below).
sample["state_histo_inverse_td1"]

Array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
# Sanity check of the restored inverse RL model: run it on the two state entries
# of the sample and show the first 6 output columns.
inverse_rl_model(sample["state_histo_inverse_t"], sample["state_histo_inverse_td1"])[:, :6]

Array([[-58.14122  , -12.908836 , 107.829445 , -26.21095  , -87.36493  ,
        -19.743734 ],
       [-17.515263 ,  80.81346  , -28.577995 , -49.040886 , -30.555523 ,
        -17.010876 ],
       [-30.758236 , -30.516747 ,  76.49304  , -14.1797085, -48.68231  ,
        -10.990468 ],
       ...,
       [-23.25964  , -62.139297 , -24.637434 ,  67.35371  , -12.765332 ,
        -13.466367 ],
       [-33.95061  , -17.303339 ,  73.70761  , -25.77107  , -45.953106 ,
        -11.735286 ],
       [-15.208434 ,  78.16751  , -26.444832 , -57.60352  , -29.599823 ,
        -18.02725  ]], dtype=float32)

In [13]:

def sampling_model(key, model, sample_eval, nb_step=100, config=None, target_reward=0.5):
    """
    Sample future states by iteratively refining noise with the model.

    Starts from Dirichlet noise over 6 classes for each of 54 positions and
    runs ``nb_step`` update steps. At step ``t`` the model is called on the
    past states, the current noisy future and a context made of
    (conditioning reward, t / nb_step); the softmax of its future logits is
    the target the noisy future is moved toward.

    Args:
        key: JAX PRNG key used to draw the initial noise.
        model: callable ``model(state_past, noise_future, context)`` returning
            ``(logits_past, logits_future)``; only the future logits are used.
        sample_eval: mapping with at least a ``"state_past"`` entry. It is not
            modified: the ``"reward"`` and ``"context"`` entries are written to
            a shallow copy.
        nb_step: number of refinement steps.
        config: object exposing ``len_seq`` and ``batch_size`` (required).
        target_reward: lowest conditioning reward; the batch is conditioned on
            values evenly spaced in [target_reward, target_reward + 0.1].

    Returns:
        Array of shape (batch_size, len_seq - len_seq // 4, 54, 6).

    Raises:
        ValueError: if ``config`` is None.
    """
    if config is None:
        raise ValueError("sampling_model requires a config exposing len_seq and batch_size")

    # Shallow copy so the entries added below do not leak into the caller's dict.
    sample_eval = dict(sample_eval)

    seq_len_future = config.len_seq - config.len_seq // 4
    noise_future = jax.random.dirichlet(key, jnp.ones(6) * 5., (config.batch_size, seq_len_future, 54))
    sample_eval["reward"] = jnp.linspace(start=target_reward, stop=0.1 + target_reward, num=config.batch_size)[:, None]

    for t_step in range(nb_step):
        # Normalised time in [0, 1), broadcastable against noise_future.
        t_step_array = jnp.ones((config.batch_size, 1, 1, 1)) * float(t_step / nb_step)
        sample_eval["context"] = jnp.concatenate([sample_eval["reward"], t_step_array[:, :, 0, 0]], axis=1)

        _, estimation_logits_future = model(
            sample_eval["state_past"], noise_future, sample_eval["context"]
        )

        estimation_proba_future = jax.nn.softmax(estimation_logits_future, axis=-1)

        # Move toward the predicted distribution; the 1 / (1 - t) factor grows as
        # t approaches 1 and the 0.0001 term keeps the denominator away from zero.
        noise_future = noise_future + float(1. / nb_step) * 1./ (1. - t_step_array + 0.0001) * (estimation_proba_future - noise_future)

    return noise_future



In [14]:
# Advance the PRNG carry key and take a sub-key for this cell.
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

# Add config.batch_size sequences to the evaluation buffer.
buffer_eval, buffer_list_eval = dataset.fast_gathering_data_diffusion(
    env,
    vmap_reset,
    vmap_step,
    int(config.batch_size),
    config.len_seq,
    buffer_eval,
    buffer_list_eval,
    subkey,
)

# NOTE(review): the same subkey is used for gathering, sampling and reshaping;
# JAX keys are meant to be consumed once.
sample = buffer_eval.sample(buffer_list_eval, subkey)
sample = reshape_diffusion_setup(sample, subkey)

In [15]:
# Advance the PRNG carry key and take a sub-key for this cell.
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

# Rebinds `sample` to a batch drawn from the training buffer.
sample = buffer.sample(buffer_list, subkey)
sample = reshape_diffusion_setup(sample, subkey)


# Run the sampler for 100 steps with the default target reward and display the result.
result = sampling_model(key=config.jax_key, model=transformer, sample_eval=sample, config=config, nb_step=100)
result

Array([[[[1.19809993e-05, 3.65849119e-05, 9.99903500e-01,
          1.72968721e-05, 1.96222682e-05, 1.11013651e-05],
         [1.23428181e-05, 1.35994051e-05, 9.99916852e-01,
          1.42016215e-05, 2.73603946e-05, 1.56546012e-05],
         [1.42597128e-05, 1.47442333e-05, 1.83022348e-05,
          9.99920487e-01, 1.46263046e-05, 1.76462345e-05],
         ...,
         [9.99912381e-01, 1.26255909e-05, 1.34599395e-05,
          1.71344727e-05, 1.53474975e-05, 2.90414318e-05],
         [1.77901238e-05, 1.84202800e-05, 1.11357076e-05,
          9.99912977e-01, 2.40276568e-05, 1.56586757e-05],
         [9.99908090e-01, 2.08260026e-05, 3.67206521e-05,
          1.23797217e-05, 9.59716272e-06, 1.24993967e-05]],

        [[9.71351983e-06, 2.58991495e-05, 9.99895155e-01,
          1.53561123e-05, 3.16093210e-05, 2.23021489e-05],
         [8.55122926e-06, 3.54822259e-06, 9.99925613e-01,
          3.59665137e-05, 1.46027887e-05, 1.17224408e-05],
         [1.04872743e-05, 3.10603064e-05, 2.0315

In [16]:
# Batch element inspected in this and the following cells.
index_batch  = 64

# Last past state of that element as class indices.
# 128 and 8 match config.batch_size and config.len_seq // 4 for the default Config.
jnp.argmax(sample["state_past"], axis=-1).reshape((128, 8, 6, 3, 3))[index_batch, -1, :, :, :]

Array([[[4, 3, 1],
        [4, 0, 2],
        [3, 5, 3]],

       [[2, 1, 5],
        [5, 1, 3],
        [0, 4, 5]],

       [[4, 5, 4],
        [2, 2, 0],
        [2, 1, 4]],

       [[5, 5, 1],
        [2, 3, 3],
        [3, 2, 1]],

       [[0, 1, 0],
        [0, 4, 4],
        [2, 3, 1]],

       [[2, 0, 3],
        [4, 5, 0],
        [5, 1, 0]]], dtype=int32)

In [17]:
# First generated future state of the same batch element
# (24 matches config.len_seq - config.len_seq // 4 for the default Config).
jnp.argmax(result, axis=-1).reshape((128, 24, 6, 3, 3))[index_batch, 0, :, :, :]

Array([[[3, 4, 4],
        [5, 0, 3],
        [3, 2, 1]],

       [[4, 5, 4],
        [5, 1, 3],
        [0, 4, 5]],

       [[5, 5, 1],
        [2, 2, 0],
        [2, 1, 4]],

       [[0, 1, 0],
        [2, 3, 3],
        [3, 2, 1]],

       [[2, 1, 5],
        [0, 4, 4],
        [2, 3, 1]],

       [[2, 0, 3],
        [4, 5, 0],
        [5, 1, 0]]], dtype=int32)

In [18]:
# Second generated future state of the same batch element.
jnp.argmax(result, axis=-1).reshape((128, 24, 6, 3, 3))[index_batch, 1, :, :, :]

Array([[[4, 3, 3],
        [4, 0, 2],
        [3, 5, 5]],

       [[2, 1, 1],
        [5, 1, 2],
        [0, 4, 0]],

       [[4, 0, 4],
        [5, 2, 1],
        [4, 2, 2]],

       [[0, 5, 1],
        [0, 3, 3],
        [3, 2, 1]],

       [[0, 1, 0],
        [0, 4, 4],
        [2, 3, 1]],

       [[2, 0, 3],
        [4, 5, 3],
        [5, 1, 5]]], dtype=int32)

In [19]:
def generate_past_state_with_with_random_policy(key, vmap_reset, step_jit_env, config):
    """
    Roll out a random policy to build the "past" part of each sequence.

    Args:
        key: JAX PRNG key; it is split once, one half seeding the environment
            resets and the other the random actions.
        vmap_reset: batched reset, called with ``config.batch_size`` keys and
            returning ``(state, timestep)``.
        step_jit_env: batched step, ``step_jit_env(state, action)`` returning
            ``(state, timestep)``; the state must expose a ``cube`` attribute.
        config: configuration object; ``batch_size`` and ``len_seq`` are read.

    Returns:
        state_past: cubes observed after each random move, stacked on axis 1,
            shape (batch_size, len_seq // 4, 6, 3, 3)
        state: environment state after the last random move
        actions_all: the sampled actions, shape (batch_size, len_seq // 4, 3)

    Note:
        The action bounds are read from the notebook-level ``env`` object.
    """
    # Independent keys for the resets and for the actions (a key is never used twice).
    key_reset, key_actions = jax.random.split(key)

    reset_keys = jax.random.split(key_reset, config.batch_size)
    state, timestep = vmap_reset(reset_keys)

    nb_past_steps = config.len_seq // 4

    # NOTE(review): jax.random.randint excludes maxval; if action_spec.maximum is an
    # inclusive bound, the largest action values are never drawn — confirm.
    actions_all = jax.random.randint(
        key=key_actions,
        minval=env.action_spec.minimum,
        maxval=env.action_spec.maximum,
        shape=(config.batch_size, nb_past_steps, 3),
    )

    past_state = []
    for i in range(nb_past_steps):
        # apply the random action of step i and record the resulting cube
        action = actions_all[:, i, :]
        state, timestep = step_jit_env(state, action)
        past_state.append(state.cube)

    # list of (batch_size, 6, 3, 3) arrays -> (batch_size, len_seq // 4, 6, 3, 3)
    state_past = jnp.stack(past_state, axis=1)

    return state_past, state, actions_all

# Batched version of the environment step returned by init_buffer.
step_jit_env = jax.vmap(jit_step)

# One batch of random "past" rollouts: stacked cubes, final env state, and the actions taken.
state_past, state, actions_past = generate_past_state_with_with_random_policy(key, vmap_reset, step_jit_env, config)

In [20]:
# Expected: (batch_size, len_seq // 4, 6, 3, 3).
state_past.shape

(128, 8, 6, 3, 3)

In [21]:

def apply_decision_diffuser_policy(key, state_past, decision_diffuser, inverse_rl_model, config, target_reward=0.5):
    """
    Predict the actions to play after ``state_past``.

    1. One-hot encode the past states and condition on ``target_reward``
    2. Sample the future states with ``sampling_model`` (100 steps)
    3. Give every pair of consecutive states, starting with the last past
       state, to the inverse RL model and return its output
    """
    nb_past = config.len_seq // 4
    batch, steps = state_past.shape[0], state_past.shape[1]

    # The sampler receives the one-hot encoding of the past states as passed in
    # (before the flattening done below).
    diffuser_inputs = {"state_past": jax.nn.one_hot(state_past, 6)}

    # Flattened, one-hot encoded past used to assemble the full trajectory.
    flat_past = jnp.copy(state_past.reshape((batch, steps, -1)))
    onehot_past = jax.nn.one_hot(flat_past, num_classes=6)

    state_future = sampling_model(key, decision_diffuser, diffuser_inputs, nb_step=100, config=config, target_reward=target_reward)

    # past followed by the sampled future along the time axis
    trajectory = jnp.concatenate([onehot_past, state_future], axis=1)

    def _merge_last_two_axes(states):
        return states.reshape((states.shape[0], states.shape[1], -1))

    # states at t (from the last past state) and at t + 1 (from the first future state)
    states_t = _merge_last_two_axes(trajectory[:, (nb_past - 1):(-1), :, :])
    states_td1 = _merge_last_two_axes(trajectory[:, nb_past:, :, :])

    return inverse_rl_model(states_t, states_td1)

# Predict the future actions for the random past generated above (default target reward).
actions_futur = apply_decision_diffuser_policy(config.jax_key, state_past, transformer, inverse_rl_model, config)

In [22]:

from rubiktransformer.dataset import GOAL_OBSERVATION

def gather_data_with_policy(state, state_past, actions_past, actions_futur, buffer, buffer_list, config):
    """
    Play the predicted actions in the environment and store the rollouts.

    Args:
        state: environment state reached after the past steps.
        state_past: cubes of the past steps, (batch_size, len_seq // 4, 6, 3, 3).
        actions_past: actions of the past steps, (batch_size, len_seq // 4, 3).
        actions_futur: action scores for the future steps; along the last axis
            the first 6 columns select action component 0 and the remaining
            columns select action component 2 (component 1 is always 0).
        buffer: buffer object exposing ``add(buffer_list, item)``.
        buffer_list: current buffer content, replaced by what ``add`` returns.
        config: configuration object; ``batch_size`` and ``len_seq`` are read.

    Returns:
        buffer: the buffer object that was passed in.
        buffer_list: buffer content after adding one item per batch element.
        reward: per-sample difference between the score of the last state and
            the score of the state at index ``len_seq // 4``, shape (batch_size,).
        reward_whole: per-sample best score over the whole sequence,
            shape (batch_size,).

    The score of a state is the mean over its stickers of +1 where it matches
    GOAL_OBSERVATION and -1 elsewhere. The batched step function is the
    notebook-level ``step_jit_env``.
    """
    nb_future_steps = config.len_seq - config.len_seq // 4

    # Decode the action scores once for all steps instead of once per loop iteration.
    actions_0_all_futur = jnp.argmax(actions_futur[:, :, :6], axis=-1)
    actions_1_all_futur = jnp.argmax(actions_futur[:, :, 6:], axis=-1)
    action_all_futur = jnp.stack(
        [actions_0_all_futur, jnp.zeros_like(actions_0_all_futur), actions_1_all_futur],
        axis=-1,
    ).astype(jnp.int32)

    # Roll the environment forward with the decoded actions.
    state_futur_list = []
    for i in range(nb_future_steps):
        state, _ = step_jit_env(state, action_all_futur[:, i, :])
        state_futur_list.append(state.cube)

    action_all = jnp.concatenate([actions_past, action_all_futur], axis=1).astype(jnp.int32)

    state_futur = jnp.stack(state_futur_list, axis=1)
    state_all = jnp.concatenate([state_past, state_futur], axis=1)

    # Per-step score in [-1, 1]; the goal broadcasts over the batch and time axes.
    reward_per_step = jnp.where(
        state_all != GOAL_OBSERVATION[None, None, :, :, :], -1.0, 1.0
    ).mean(axis=(2, 3, 4))

    # Take the max over time BEFORE reducing to the per-sample difference;
    # taking it afterwards would collapse the batch axis into a single scalar.
    reward_whole = reward_per_step.max(axis=-1)
    reward = reward_per_step[:, -1] - reward_per_step[:, config.len_seq // 4]

    for idx_batch in range(config.batch_size):
        buffer_list = buffer.add(
            buffer_list,
            {
                "action": action_all[idx_batch],
                "reward": reward[idx_batch],
                "state_histo": state_all[idx_batch],
            },
        )

    return buffer, buffer_list, reward, reward_whole

# Play the predicted actions once and add the resulting sequences to the training buffer.
buffer, buffer_list, reward_real, reward_whole = gather_data_with_policy(state, state_past, actions_past, actions_futur, buffer, buffer_list, config)


In [23]:
def reward_hacking(reward):
    """
    Exponentially rescale rewards, element by element.

    reward is an array of values expected to lie between -1 and 1; every
    element x is mapped to
    f(x) = 0.1 * jnp.exp(4 * x)
    and the result has the same shape as the input.
    """
    exponent = 4.0 * reward
    growth = jnp.exp(exponent)
    return 0.1 * growth

def improve_training_loop(buffer, buffer_list, nb_iter=10000):
    """
    Online improvement loop: collect data with the current policy, then train on it.

    Each iteration:
    1. Generates a batch of random "past" rollouts in fresh environments
    2. Uses the decision diffuser and the inverse RL model to choose the future actions
    3. Plays those actions and adds the resulting sequences to the buffer
    4. Moves the target reward halfway toward the mean reward just obtained
    5. Runs one training step of the transformer on a batch sampled from the buffer

    Rewards and training metrics are logged to wandb.

    Args:
        buffer: buffer object used to add and sample sequences.
        buffer_list: current buffer content; rebound locally on every iteration.
        nb_iter: number of iterations to run.

    Returns:
        None. The function relies on notebook-level objects (config, vmap_reset,
        step_jit_env, transformer, inverse_rl_model, optimizer_diffuser,
        metrics_train) and writes the PRNG carry key back to ``config.jax_key``.
    """
    target_reward = 0.5

    for idx_step in range(nb_iter):

        print("begin iter")

        # One fresh sub-key per random consumer so that no key is used twice;
        # the carry key is stored back on the config for the next iteration.
        key, key_env, key_policy, key_sample, key_reshape = jax.random.split(config.jax_key, 5)
        config.jax_key = key

        print("generate high reward value")
        # first generate random state
        state_past, state, actions_past = generate_past_state_with_with_random_policy(key_env, vmap_reset, step_jit_env, config)

        print("apply the strategy")
        # apply model to get some generation
        actions_futur = apply_decision_diffuser_policy(key_policy, state_past, transformer, inverse_rl_model, config, target_reward)

        print("improve data buffer")
        # update replay buffer dataset
        buffer, buffer_list, reward_per_sample, reward_whole = gather_data_with_policy(state, state_past, actions_past, actions_futur, buffer, buffer_list, config)

        # Gap to the target that was requested, then move the target halfway
        # toward the mean reward actually obtained.
        diff_target = reward_per_sample - target_reward
        target_reward = 1./2. * (target_reward + reward_per_sample.mean())
        print("new target : ", target_reward)

        wandb.log({"reward_normalized" : reward_hacking(reward_whole).mean(), "target_reward_new" : target_reward, "diff_target_reward": diff_target.mean()}, step=idx_step)

        # now we can do the training loop
        sample = buffer.sample(buffer_list, key_sample)
        sample = reshape_diffusion_setup(sample, key_reshape)

        print("trainign")

        # we update the policy
        train_step_transformer_rf(
            transformer, optimizer_diffuser, metrics_train, sample
        )

        if idx_step % config.log_every_step == 0:
            # Report the metrics accumulated since the last report, then start a new window.
            metrics_train_result = metrics_train.compute()
            print(metrics_train_result)

            wandb.log(metrics_train_result, step=idx_step)
            metrics_train.reset()

# Launch the online improvement loop for 10000 iterations (long-running cell).
improve_training_loop(buffer, buffer_list, nb_iter=10000)


begin iter
generate high reward value
apply the strategy


improve data buffer
new target :  0.27734375
trainign
{'loss': Array(0.22688669, dtype=float32), 'loss_cross_entropy': Array(0.21618629, dtype=float32)}
begin iter
generate high reward value
apply the strategy
improve data buffer
new target :  0.18106192
trainign
begin iter
generate high reward value
apply the strategy
improve data buffer
new target :  0.12525319
trainign
begin iter
generate high reward value
apply the strategy
improve data buffer
new target :  0.09677011
trainign
begin iter
generate high reward value
apply the strategy
improve data buffer
new target :  0.06762695
trainign
begin iter
generate high reward value
apply the strategy
improve data buffer
new target :  0.053778753
trainign
begin iter
generate high reward value
apply the strategy
improve data buffer
new target :  0.04034424
trainign
begin iter
generate high reward value
apply the strategy
improve data buffer
new target :  0.034061007
trainign
begin iter
generate high reward value
apply the strategy
improve dat