In [14]:
# chose the current file directory as the working directory
import os 
os.chdir("/teamspace/studios/this_studio/rubikscubesolver")


In [15]:
from tqdm import tqdm

import wandb  # for logging
import time
from dataclasses import dataclass

import jax
import jax.numpy as jnp
import flax.nnx as nnx

import optax

from rubiktransformer.models import RubikTransformer, PolicyModel
import rubiktransformer.dataset as dataset
from rubiktransformer.trainer import train
from rubiktransformer.trainer import reshape_sample

In [16]:
@dataclass
class Config:
    """Configuration class"""
    jax_key: jnp.ndarray = jax.random.PRNGKey(45)
    rngs = nnx.Rngs(44)
    batch_size: int = 1024
    lr_1: float = 4e-3
    lr_2: float = 4e-3
    nb_games: int = 1024 * 100
    len_seq: int = 12
    nb_step: int = 1000000
    log_every_step: int = 10
    log_eval_every_step: int = 10
    log_policy_reward_every_step: int = 10
    add_data_every_step: int = 500

config = Config()

# init wandb config
user = "forbu14"
project = "RubikTransformer"
display_name = "experiment_" + time.strftime("%Y%m%d-%H%M%S")

wandb.init(entity=user, project=project, name=display_name)


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
sum_reward_policy,█▆▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
sum_reward_policy,-0.23289


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112376177773815, max=1.0…

In [17]:
env, buffer = dataset.init_env_buffer(sample_batch_size=config.batch_size)

policy = PolicyModel(rngs=config.rngs, d_model=1024, temp=5.)
transformer = RubikTransformer(rngs=config.rngs, causal=True)

optimizer_policy = optax.chain(
    optax.clip_by_global_norm(1.0),
    optax.adamw(config.lr_1 / 10.),
)


optimizer_policy = nnx.Optimizer(policy, optimizer_policy)

# metrics
metrics_train = nnx.MultiMetric(
    loss=nnx.metrics.Average("loss"),
    loss_reward=nnx.metrics.Average("loss_reward"),
    loss_cross_entropy=nnx.metrics.Average("loss_cross_entropy"),
)

metrics_eval = nnx.MultiMetric(
    loss_eval=nnx.metrics.Average("loss_eval"),
    loss_reward_eval=nnx.metrics.Average("loss_reward_eval"),
    loss_cross_entropy_eval=nnx.metrics.Average("loss_cross_entropy_eval"),
)


metrics_policy = nnx.MultiMetric(
    sum_reward_policy=nnx.metrics.Average("sum_reward_policy"),
)


In [18]:
# load weight from world model transformer:
import pickle

filename = "statev6.pickle"

with open(filename, "rb") as input_file:
    state = pickle.load(input_file)

nnx.update(transformer, state)

In [19]:
state

State({
  'action_mapping': {
    'bias': VariableState(
      type=Param,
      value=Array([ 2.98494305e-02,  7.37091200e-03,  1.95764122e-03, -7.42887263e-04,
              4.47213883e-03, -1.97569397e-03, -1.05965240e-02,  2.91437353e-03,
             -2.21973169e-04, -9.10157617e-03,  2.28590495e-03, -1.90868392e-03,
              1.00215327e-03, -1.14400042e-02, -3.59990627e-05, -5.88784646e-03,
             -6.00204896e-03, -1.50419520e-02,  6.35023054e-04,  2.19244440e-03,
              9.63746384e-03,  6.40006363e-03,  9.55769233e-03, -1.84854679e-02,
             -1.29062552e-02,  7.26002501e-03,  4.44780141e-02, -6.89176784e-04,
             -1.76944733e-02,  5.26020071e-03,  1.60558335e-02, -1.72243211e-02,
              1.93193310e-03, -4.42172680e-03, -8.73702206e-03, -2.39823805e-03,
              1.03886090e-02,  3.90930893e-03, -1.02841277e-02,  9.13410354e-03,
              1.48430187e-03,  3.35732801e-03, -2.83761718e-03, -9.74433613e-04,
             -1.45434914e-02

In [20]:


nb_games = config.nb_games
len_seq = config.len_seq

vmap_reset = jax.vmap(jax.jit(env.reset))


In [21]:
def gather_data_policy(
    model_policy: PolicyModel,
    model_worldmodel: RubikTransformer,
    env,
    vmap_reset,
    batch_size,
    len_seq,
    key,):
    keys = jax.random.split(key, batch_size)
    state, timestep = vmap_reset(keys)

    one_hot = jax.nn.one_hot(state.cube, 6)
    state_first_policy = jnp.reshape(
        one_hot, (batch_size, 1, -1)
    )

    state_pred = jnp.copy(state_first_policy)
    action_list = None

    state_pred_list = []
    uniform0_list = []
    uniform1_list = []

    # Collect a batch of rollouts
    for i in range(len_seq):
        keys = jax.random.split(key, batch_size)
        key_uniform = jax.random.split(keys[0], 2)
        key = keys[1]
        
        # generate random values 
        # random_uniform0, random_uniform1
        # should be of size (batch_size, 6) and (batch_size, 3) 
        uniform0 = jax.random.uniform(key_uniform[0], (batch_size, 1, 6))
        uniform1 = jax.random.uniform(key_uniform[1], (batch_size, 1, 3))

        # apply the policy
        action_result = model_policy(state_pred, uniform0, uniform1)

        if action_list is None:
            action_list = action_result
        else:
            action_list = jnp.concatenate((action_list, action_result), axis=1)

        # save data into a list
        state_pred_list.append(state_pred)
        uniform0_list.append(uniform0)
        uniform1_list.append(uniform1)

        # now we can apply the world model to sample next state
        state_logits, reward = model_worldmodel(state_pred, action_list)

        # reshape then argmax
        state_logits = state_logits.reshape(
            (state_logits.shape[0], state_logits.shape[1], 54, 6)
        )

        state_pred = jnp.argmax(state_logits, axis=3)

        # onehot
        state_pred = jax.nn.one_hot(state_pred, 6)

        # shape to flatten
        state_pred = state_pred.reshape((state_pred.shape[0], state_pred.shape[1], -1))

        # take the last state
        state_pred = state_pred[:, -1, :]

        # add a dimension on axis 1
        state_pred = jnp.expand_dims(state_pred, axis=1)

    # here we create the dataset in a proper format
    state_pred_histo = jnp.concatenate(state_pred_list, axis=1)
    uniform0_histo = jnp.concatenate(uniform0_list, axis=1)
    uniform1_histo = jnp.concatenate(uniform1_list, axis=1)

    return state_pred_histo, uniform0_histo, uniform1_histo, action_list


key = jax.random.PRNGKey(48)

state_pred_histo, uniform0_histo, uniform1_histo, action_list = gather_data_policy(
    policy,
    transformer,
    env,
    vmap_reset,
    config.batch_size,
    config.len_seq,
    key,)


In [22]:
policy.training

True

In [23]:
state_pred_histo, uniform0_histo, uniform1_histo, action_list = gather_data_policy(
    policy,
    transformer,
    env,
    vmap_reset,
    config.batch_size,
    config.len_seq,
    key,)

In [24]:
nnx.display(transformer)

In [25]:

def reward_hacking(reward):
    """
    reward is an array of value of shape (batch_size, len_seq, 1) with value between -1 and 1
    we want to apply to every element the funciton
    f(x) = 0.1 * jnp.exp(4 * x)
    """

    return 0.1 * jnp.exp(4. * reward)

def loss_fn_transformer_policy(model_policy: PolicyModel, model: RubikTransformer, batch):
    action_plan = model_policy(batch["states"], batch["uniform0"], batch["uniform1"])

    states_next, reward_value = model(batch["state_first"], action_plan) 

    # modify the reward learning dynamics (end goal is very important)
    reward_value = reward_hacking(reward_value)

    loss_reward = - (reward_value[:, 1:, :]).sum(axis=1).mean()

    loss = loss_reward

    return loss, (loss_reward)

@nnx.jit
def train_step_transformer_policy(
    model_policy: PolicyModel, model: RubikTransformer, optimizer: nnx.Optimizer, metrics: nnx.MultiMetric, batch
):
    """Train for a single step."""
    grad_fn = nnx.value_and_grad(loss_fn_transformer_policy, has_aux=True)
    (loss, (loss_reward)), grads = grad_fn(model_policy, model, batch)
    metrics.update(
        sum_reward_policy=loss
    )
    optimizer.update(grads)


In [26]:
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

# transformer model calibration
for idx_step in tqdm(range(15000)):
    # gather data from policy :
    key, subkey = jax.random.split(config.jax_key)
    config.jax_key = key
    
    state_pred_histo, uniform0_histo, uniform1_histo, action_list = gather_data_policy(
        policy,
        transformer,
        env,
        vmap_reset,
        config.batch_size,
        config.len_seq,
        config.jax_key)

    batch = {
        "states": state_pred_histo,
        "uniform0": uniform0_histo,
        "uniform1": uniform1_histo,
        "state_first": state_pred_histo[:, 0, :],
    }

    batch["state_first"] = jnp.expand_dims(batch["state_first"], axis=1)

    train_step_transformer_policy(
        policy,
        transformer,
        optimizer_policy,
        metrics_policy,
        batch
    )

    if idx_step % config.log_policy_reward_every_step == 0:
        result_metrics = metrics_policy.compute()

        wandb.log(result_metrics, step=idx_step)

        metrics_policy.reset()



 18%|█▊        | 2655/15000 [51:17<3:58:30,  1.16s/it]


KeyboardInterrupt: 

In [27]:
transformer.train()

action_plan = policy(batch["states"], batch["uniform0"], batch["uniform1"])
states_next, reward_value = transformer(batch["state_first"], action_plan) 

In [29]:
print(action_plan[0, 0, :])

[1.8593616e-07 6.2573171e-01 2.0037910e-04 7.8923050e-03 5.7217587e-08
 3.6617535e-01 9.9623764e-01 3.7623902e-03 3.7600452e-09]


In [44]:
reward_value[13, 1:, :]

Array([[-0.4022617 ],
       [-0.42012608],
       [-0.4109369 ],
       [-0.35496473],
       [-0.42604265],
       [-0.4018938 ],
       [-0.38917527],
       [-0.36491984],
       [-0.33325246],
       [-0.32490718],
       [-0.321507  ],
       [-0.33208442]], dtype=float32)

In [19]:
action_plan[0, 0]

Array([1.5827626e-09, 3.5747697e-05, 3.9419392e-06, 9.9820602e-07,
       9.9995935e-01, 2.2563261e-10, 7.2900742e-01, 2.7099249e-01,
       5.1224045e-08], dtype=float32)

In [75]:
reward_hacking(reward_value[41, 1:, :])

Array([[0.02246363],
       [0.02130145],
       [0.0240391 ],
       [0.03464752],
       [0.03399757],
       [0.04170119],
       [0.04662441],
       [0.05167986],
       [0.05111608],
       [0.05709394],
       [0.0562774 ],
       [0.053836  ]], dtype=float32)

In [76]:
init_result = jnp.argmax(batch["state_first"][41, 0, :].reshape(54, 6), axis=1).reshape(6, 3, 3)

reward = jnp.where(init_result != dataset.GOAL_OBSERVATION, -1.0, 1.0)


reward_hacking(reward.mean())

Array(0.01960022, dtype=float32)

In [50]:
reward_hacking(reward.mean())

Array(0.02273007, dtype=float32)

In [77]:
jnp.argmax(batch["state_first"][41, 0, :].reshape(54, 6), axis=1).reshape((6, 3, 3))


Array([[[0, 5, 3],
        [1, 0, 3],
        [0, 0, 5]],

       [[1, 3, 2],
        [2, 1, 0],
        [2, 3, 1]],

       [[1, 5, 4],
        [2, 2, 0],
        [5, 1, 2]],

       [[0, 1, 2],
        [4, 3, 1],
        [1, 4, 4]],

       [[3, 2, 4],
        [0, 4, 5],
        [5, 4, 5]],

       [[3, 2, 4],
        [3, 5, 4],
        [3, 5, 0]]], dtype=int32)

In [78]:
jnp.argmax(states_next[41, -1, :].reshape(54, 6), axis=1).reshape((6, 3, 3))

Array([[[0, 0, 5],
        [1, 0, 2],
        [4, 2, 0]],

       [[0, 1, 2],
        [1, 1, 3],
        [0, 1, 1]],

       [[3, 5, 5],
        [2, 2, 1],
        [1, 1, 2]],

       [[1, 4, 3],
        [2, 3, 3],
        [3, 3, 1]],

       [[4, 0, 3],
        [5, 4, 2],
        [2, 5, 4]],

       [[1, 2, 4],
        [2, 5, 4],
        [5, 5, 5]]], dtype=int32)

In [142]:
reward_hacking(-0.4444)

Array(0.01690434, dtype=float32, weak_type=True)

In [106]:
jax.nn.softmax(states_next[0, 1, :].reshape((54, 6)))[1, :]

Array([3.9461483e-08, 1.9702059e-06, 1.0005269e-04, 9.9987459e-01,
       2.8885726e-12, 2.3303615e-05], dtype=float32)

In [85]:
transformer.transformer

List(
  0=TransformerBlock(
    causal=True,
    dropout=Dropout(rate=0.05, broadcast_dims=(), deterministic=False, rng_collection='dropout', rngs=Rngs(
      default=RngStream(
        count=RngCount(
          tag='default',
          value=Array(786031, dtype=uint32)
        ),
        key=RngKey(
          tag='default',
          value=Array((), dtype=key<fry>) overlaying:
          [ 0 45]
        )
      )
    )),
    feedforward=FeedForward(
      linear1=Linear(
        bias=Param(
          value=Array(shape=(1024,), dtype=float32)
        ),
        bias_init=<function zeros at 0x7f7ef8f0b7f0>,
        dot_general=<function dot_general at 0x7f7ef9447910>,
        dtype=None,
        in_features=512,
        kernel=Param(
          value=Array(shape=(512, 1024), dtype=float32)
        ),
        kernel_init=<function variance_scaling.<locals>.init at 0x7f7ef874c040>,
        out_features=1024,
        param_dtype=<class 'jax.numpy.float32'>,
        precision=None,
        us