### Install initial environment in Google Colab

In [1]:
import sys
import os

# One-time environment bootstrap. Only runs when the notebook is executing
# inside Google Colab (detected via the `google.colab` module being loaded).
if 'google.colab' in sys.modules:
  # The marker file written at the end of this branch makes the expensive
  # clone/install steps run only once per Colab VM, even if the cell is re-run.
  if not os.path.exists('/content/.already_installed'):
    # Project sources; the `dt.*` and `extract_cnn` imports further down are
    # presumably resolved from this checkout -- TODO confirm.
    !git clone https://github.com/FlutterbaseDotCom/hdt

    # System package first, then Python packages.
    # NOTE(review): swig is presumably required to build box2d-py -- confirm.
    !apt-get install -y swig
    !pip install box2d-py
    !pip install 'gymnasium[box2d]'
    !pip install 'stable-baselines3[extra]'
    !pip install toml
    !pip install wandb
    !pip install datasets
    !pip install transformers
    !pip install torchviz
    # NOTE(review): none of the packages above are version-pinned, so a fresh
    # install may pull different versions than the ones this notebook was run with.
    !pip install accelerate -U

    # Drop the marker so subsequent runs skip the installation branch.
    with open('/content/.already_installed', 'w') as f:
        f.write('done')
  # Always switch into the checkout (also on re-runs).
  %cd /content/hdt


Cloning into 'hdt'...
remote: Enumerating objects: 104, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 104 (delta 24), reused 19 (delta 8), pack-reused 64[K
Receiving objects: 100% (104/104), 47.57 MiB | 11.49 MiB/s, done.
Resolving deltas: 100% (52/52), done.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 19 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetc

### Imports

In [2]:
import os
import random
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torchviz
import wandb
from datasets import Dataset, load_dataset
from stable_baselines3 import DQN
from stable_baselines3.common.torch_layers import NatureCNN
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from torch.utils.data import Subset
from transformers import Trainer, TrainingArguments

from dt.configuration_decision_transformer import DecisionTransformerConfig
from dt.modeling_decision_transformer import DecisionTransformerModel
from extract_cnn import prepare_observation_array
from dt.trainable_dt import DecisionTransformerGymDataCollator, TrainableDT





  from tensorflow.tsl.python.lib.core import pywrap_ml_dtypes


### Generate Data

In [3]:
# Rollout budget for the data-generation cell below.
NUM_EPISODES: int = 16          # number of episodes to record
MAX_EPISODE_STEPS: int = 1_000  # hard cap on environment steps per episode

  and should_run_async(code)


In [4]:
# Roll out the pretrained DQN policy in CarRacing and record one trajectory
# (observations, actions, rewards, dones) per episode into `features`.

# Stall detection: once an episode is longer than STALL_MIN_STEPS, it is cut
# short if none of the last STALL_WINDOW rewards was positive.
STALL_WINDOW = 50
STALL_MIN_STEPS = 100

env = gym.make('CarRacing-v2', continuous=False)  # add render_mode='human' to watch
model = DQN
tmp_model_path = './models/dql_pretrained/dql_rl_11.zip'
loaded_model = model.load(tmp_model_path)

# Column-oriented storage: each key holds one list per episode.
features = {
    "observations": [],
    "actions": [],
    "rewards": [],
    "dones": [],
}
for episode in range(NUM_EPISODES):
    print(f"Episode: {episode} of {NUM_EPISODES}:")
    obs, _ = env.reset()
    done = False

    # Per-episode buffers: observations, actions, rewards, dones.
    o, a, r, d = [], [], [], []
    total_reward = 0
    # At most MAX_EPISODE_STEPS environment steps are taken per episode.
    for _step in range(MAX_EPISODE_STEPS):
        action, _states = loaded_model.predict(obs, deterministic=True)
        new_obs, reward, terminated, truncated, _info = env.step(action)
        done = terminated
        total_reward = total_reward + reward

        # The observation the action was chosen from is stored (not `new_obs`).
        # `prepare_observation_array` is a project helper; its result is
        # flattened to a 1-D vector before being stored.
        oarr = prepare_observation_array(obs)
        o.append(oarr.flatten())
        a.append(action.item())
        r.append(reward)
        d.append(done)
        obs = new_obs
        print(".", end="")

        # Stalled episode: no positive reward in the trailing window.
        if len(r) > STALL_MIN_STEPS and max(r[-STALL_WINDOW:]) <= 0:
            # Drop the unproductive tail from *every* buffer (the observations
            # must be trimmed too, otherwise the columns end up with different
            # lengths) and mark the new last step as terminal.
            o = o[:-STALL_WINDOW]
            a = a[:-STALL_WINDOW]
            r = r[:-STALL_WINDOW]
            d = d[:-STALL_WINDOW]
            d[-1] = True
            print(f'\nstopping: no positive reward in the last {STALL_WINDOW} steps')
            break

        # Gymnasium requires a reset after either flag is raised, so the
        # episode ends on truncation as well as on termination.
        if terminated or truncated:
            break
    # NOTE: total_reward still includes the rewards of any steps dropped above.
    print(f"\nTotal reward: {total_reward} episodes steps: {len(o)}")

    features["observations"].append(o)
    features["actions"].append(a)
    features["rewards"].append(r)
    features["dones"].append(d)

env.close()
print(len(features["actions"]))






Episode: 0 of 16:
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

### Persist Dataset

In [5]:
# Turn the collected rollouts into a `datasets.Dataset`; the keys of
# `features` ("observations", "actions", "rewards", "dones") are passed as-is.
dataset = Dataset.from_dict(features)
# Disabled: write the freshly generated dataset to disk.
#dataset.save_to_disk('datasets/car_racing_0070_0040/')

# Disabled: 90/10 train/validation split by position.
# dataset_size = len(dataset)
# split_point = int(0.9 * dataset_size)
# NOTE(review): the original remark here claimed the dataset is already
# shuffled, but nothing visible in this notebook shuffles the episodes --
# confirm before re-enabling a positional split.
# train_dataset = Subset(dataset, range(split_point))
# val_dataset = Subset(dataset, range(split_point, dataset_size))

# Disabled: load a previously saved dataset instead of regenerating it.
#from datasets import load_from_disk
#dataset = load_from_disk('datasets/car_racing_002/')


### Split dataset

In [6]:
# Expose the data under a split name so later cells can use dataset["train"].
# The isinstance guard keeps this cell idempotent: because it rebinds
# `dataset`, re-running it without the guard would wrap the mapping in a
# second mapping and dataset["train"] would no longer be the actual data.
if not isinstance(dataset, dict):
    dataset = {
        'train': dataset,
        # 'validation': val_dataset
    }


In [7]:
import toml

# Training hyperparameters, kept as a TOML string and parsed into a dict.
# The same dict is logged as the W&B run config and read by the training cell.
config_toml = """
PREFIX              = 'DT'
LOG_INTERVAL        = 1
save_steps          = 1
num_train_epochs    = 4
per_device_train_batch_size=16
learning_rate       = 0.0001
weight_decay        = 0.0001
warmup_ratio        = 0.1
max_grad_norm       = 0.25
"""

config_toml = toml.loads(config_toml)

LOAD_SAVED_MODEL    = False

# Run bookkeeping: bump RUN_NUM to start a new W&B run.
RUN_NUM = 20
WANDB_ID            = "dt_"+str(RUN_NUM)
WNDB_NAME           = "DT_"+str(RUN_NUM)
MODEL_SAVE_NAME     = WNDB_NAME
SAVED_MODEL_VERSION = "latest"

# W&B behaviour is configured through environment variables so that the
# Hugging Face Trainer integration picks it up as well.
os.environ["WANDB_DISABLED"] = "false"
os.environ['WANDB_NOTEBOOK_NAME'] = 'DT.ipynb'
os.environ['WANDB_MODE']='online'
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# SECURITY: never hard-code the API key in the notebook. Export WANDB_API_KEY
# before starting the kernel (or store it as a Colab secret); when it is unset,
# `key=None` makes wandb fall back to stored credentials or an interactive
# prompt. The key that used to be committed here must be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# The run id goes in `id`; `resume` takes a mode, not an id. "allow" resumes
# the run with this id if it already exists and creates it otherwise.
wandb.init(
    id=WANDB_ID,
    resume="allow",
    name=WNDB_NAME,
    mode="online",
    entity="yakiv",
    project="CarRacingDT",
    config=config_toml,
)


# Fresh environment: the one used for data generation was closed.
env =  gym.make('CarRacing-v2', continuous=False) #, render_mode='human'


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myakiv[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Train

In [8]:


# Assemble collator, model and Hugging Face Trainer, then run training.

# The collator is built from the training split and supplies the state and
# action dimensions used to size the model.
collator = DecisionTransformerGymDataCollator(dataset["train"])

config = DecisionTransformerConfig(
    state_dim=collator.state_dim,
    act_dim=collator.act_dim,
)
print(config.to_dict())

model = TrainableDT(config)

# Hyperparameters whose TrainingArguments name equals their config_toml key
# are forwarded as-is; LOG_INTERVAL is the only one that needs renaming.
_passthrough_keys = (
    "save_steps",
    "num_train_epochs",
    "per_device_train_batch_size",
    "learning_rate",
    "weight_decay",
    "warmup_ratio",
    "max_grad_norm",
)
training_args = TrainingArguments(
    output_dir="output/",
    report_to="wandb",
    remove_unused_columns=False,
    optim="adamw_torch",
    logging_steps=config_toml["LOG_INTERVAL"],
    **{key: config_toml[key] for key in _passthrough_keys},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=collator,
)

trainer.train()


  and should_run_async(code)


{'action_num': 5, 'state_dim': 27648, 'act_dim': 1, 'hidden_size': 128, 'max_ep_len': 4096, 'action_tanh': True, 'vocab_size': 1, 'n_positions': 1024, 'n_layer': 3, 'n_head': 1, 'n_inner': None, 'activation_function': 'relu', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diver

Step,Training Loss
1,1.6678


[34m[1mwandb[0m: Adding directory to artifact (./output/checkpoint-1)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./output/checkpoint-2)... Done. 0.1s


if true


ValueError: ignored