### Install initial environment in Google Colab

In [10]:
import sys
import os

# One-time environment bootstrap; everything below is skipped outside Google Colab.
if 'google.colab' in sys.modules:
  # The marker file written at the end of this branch makes the install steps
  # run only once for as long as /content/.already_installed exists.
  if not os.path.exists('/content/.already_installed'):
    # Clone the project repository (presumably the source of the local `dt` and
    # `extract_cnn` modules imported later -- TODO confirm).
    !git clone https://github.com/FlutterbaseDotCom/hdt

    # swig is installed before box2d-py (presumably a build prerequisite -- TODO confirm).
    !apt-get install -y swig
    !pip install box2d-py
    !pip install 'gymnasium[box2d]'
    !pip install 'stable-baselines3[extra]'
    !pip install toml
    !pip install wandb
    !pip install datasets
    !pip install transformers
    !pip install torchviz
    !pip install accelerate -U

    # Record that setup finished so re-running this cell skips the installs.
    with open('/content/.already_installed', 'w') as f:
        f.write('done')
  # Runs on every execution in Colab (not only the first): switch the working
  # directory to the cloned repository.
  %cd /content/hdt


### Imports

In [11]:
import os
import random
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torchviz
import wandb
from datasets import Dataset, load_dataset
from stable_baselines3 import DQN
from stable_baselines3.common.torch_layers import NatureCNN
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from torch.utils.data import Subset
from transformers import Trainer, TrainingArguments

from dt.configuration_decision_transformer import DecisionTransformerConfig
from dt.modeling_decision_transformer import DecisionTransformerModel
from extract_cnn import prepare_observation_array
from dt.trainable_dt import DecisionTransformerGymDataCollator, TrainableDT





### Generate Data

In [12]:
# Number of episodes rolled out by the data-generation loop below.
NUM_EPISODES = 70
# Upper bound on the number of steps recorded per episode; the rollout loop
# breaks once its step counter exceeds this value.
MAX_EPISODE_STEPS = 40

In [13]:
# Roll out a saved DQN policy in CarRacing and record, per episode, the
# preprocessed observations, the chosen actions, the rewards and the done flags.
# The result is `features`: a dict of four parallel lists with one entry
# (itself a list over steps) per episode.
env =  gym.make('CarRacing-v2', continuous=False) #, render_mode='human'
model = DQN
tmp_model_path ='./models/dql_pretrained/dql_rl_11.zip'
loaded_model = model.load(tmp_model_path)

# Early-stop heuristic: once an episode has more than MIN_STEPS_BEFORE_CUTOFF
# recorded steps and none of the last NO_REWARD_WINDOW rewards is positive,
# that trailing window is discarded and the episode is ended.
MIN_STEPS_BEFORE_CUTOFF = 100
NO_REWARD_WINDOW = 50

features = {
    "observations": [],
    "actions": [],
    "rewards": [],
    "dones": [],
}
for episode in range(NUM_EPISODES):
    print(f"Episode: {episode} of {NUM_EPISODES}:" )
    obs, _ = env.reset()
    done = False

    # Per-episode buffers; they must always have the same length.
    o, a, r, d = [], [], [], []
    total_reward = 0
    sti = 0
    while not done:
        sti = sti + 1
        if sti > MAX_EPISODE_STEPS:
            break

        # if random.random() < epsilon:
        #     action = 3# env.action_space.sample()
        # else:
        action, _states = loaded_model.predict(obs, deterministic=True)
        new_obs, reward, terminated, truncated, _info = env.step(action)
        # Gymnasium reports the end of an episode through two flags; either one
        # means the environment must be reset before stepping again.
        done = terminated or truncated
        total_reward = total_reward + reward

        # Store the observation the action was computed from, not the new one.
        oarr = prepare_observation_array(obs)
        o.append(oarr.flatten())
        a.append(action.item())
        r.append(reward)
        d.append(done)
        obs = new_obs
        print(".", end="")

        # Stop when the trailing window contains no positive reward.
        if len(r) > MIN_STEPS_BEFORE_CUTOFF and max(r[-NO_REWARD_WINDOW:]) <= 0:
            # Drop the unproductive tail from ALL four buffers so observations
            # stay aligned with actions/rewards/dones, then mark the new last
            # step as terminal.
            o = o[:-NO_REWARD_WINDOW]
            a = a[:-NO_REWARD_WINDOW]
            r = r[:-NO_REWARD_WINDOW]
            d = d[:-NO_REWARD_WINDOW]
            d[-1] = True
            print(f'\nstopping: no positive reward in the last {NO_REWARD_WINDOW} steps')
            break
    print(f"\nTotal reward: {total_reward} episodes steps: {len(o)}")

    features["observations"].append(o)
    features["actions"].append(a)
    features["rewards"].append(r)
    features["dones"].append(d)

env.close()
print(len(features["actions"]))






Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>


Episode: 0 of 70:
........................................
Total reward: 12.000000000000014 episodes steps: 40
Episode: 1 of 70:
........................................
Total reward: 5.966777408637888 episodes steps: 40
Episode: 2 of 70:
........................................
Total reward: 3.462686567164191 episodes steps: 40
Episode: 3 of 70:
........................................
Total reward: 8.383900928792585 episodes steps: 40
Episode: 4 of 70:
........................................
Total reward: 11.267175572519099 episodes steps: 40
Episode: 5 of 70:
........................................
Total reward: 5.55414012738855 episodes steps: 40
Episode: 6 of 70:
........................................
Total reward: 8.698412698412712 episodes steps: 40
Episode: 7 of 70:
........................................
Total reward: 9.745704467353965 episodes steps: 40
Episode: 8 of 70:
........................................
Total reward: 10.2857142857143 episodes steps: 40
Episode: 9

### Persist Dataset

In [14]:
# Convert the collected rollouts into a Hugging Face Dataset and save it to disk.
dataset = Dataset.from_dict(features)

# The directory name encodes the rollout settings (episodes, max steps per
# episode) so that changing NUM_EPISODES / MAX_EPISODE_STEPS cannot overwrite or
# mislabel a dataset generated with other settings. With the current values
# (70, 40) this is 'datasets/car_racing_0070_0040/'.
dataset_dir = f'datasets/car_racing_{NUM_EPISODES:04d}_{MAX_EPISODE_STEPS:04d}/'
dataset.save_to_disk(dataset_dir)

# dataset_size = len(dataset)
# split_point = int(0.9 * dataset_size)
# #dataset is already shuffled
# train_dataset = Subset(dataset, range(split_point))
# val_dataset = Subset(dataset, range(split_point, dataset_size))

#from datasets import load_from_disk
#dataset = load_from_disk('datasets/car_racing_002/')


Saving the dataset (1/1 shards): 100%|██████████| 70/70 [00:04<00:00, 17.21 examples/s]


### Split dataset

In [15]:
# Wrap the dataset in a split mapping so later cells can use dataset["train"].
# The isinstance guard keeps the cell idempotent: `dataset` is rebound from a
# Dataset to a dict here, so re-running the cell without the guard would nest
# the dict inside another dict and dataset["train"] would no longer be a Dataset.
if not isinstance(dataset, dict):
    dataset = {
        'train': dataset,
       # 'validation': val_dataset
    }


In [16]:
import toml

# Training hyperparameters, kept as TOML text so the parsed dict can be handed
# to Weights & Biases as the run config and to TrainingArguments below.
CONFIG_TOML_TEXT = """
PREFIX              = 'DT'
LOG_INTERVAL        = 5
save_steps          = 50
num_train_epochs    = 1
per_device_train_batch_size=64
learning_rate       = 0.0001
weight_decay        = 0.0001
warmup_ratio        = 0.1
max_grad_norm       = 0.25
"""

# Parsed configuration (dict). Parsing from a separately named string keeps this
# cell safe to re-run: `config_toml` is always a dict, never the raw text.
config_toml = toml.loads(CONFIG_TOML_TEXT)

LOAD_SAVED_MODEL    = False

RUN_NUM = 17
WANDB_ID            = "dt_"+str(RUN_NUM)
WNDB_NAME           = "DT_"+str(RUN_NUM)
MODEL_SAVE_NAME     = WNDB_NAME
SAVED_MODEL_VERSION = "latest"

os.environ["WANDB_DISABLED"] = "false"
os.environ['WANDB_NOTEBOOK_NAME'] = 'DT.ipynb'
os.environ['WANDB_MODE']='online'
os.environ["WANDB_LOG_MODEL"] = "checkpoint"


# SECURITY: never commit an API key. The key is taken from the WANDB_API_KEY
# environment variable; when it is unset, key=None lets wandb fall back to its
# stored credentials or an interactive prompt.
# NOTE(review): a key was previously hard-coded in this cell -- it should be
# revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# `id` identifies the run to continue; `resume` is the policy ("allow" resumes
# the run when the id already exists, otherwise starts a new one).
wandb.init(id=WANDB_ID,
           resume="allow",
           name=WNDB_NAME,
           mode="online",
           entity="yakiv",
           project="CarRacingDT",
           config=config_toml,
           )


env =  gym.make('CarRacing-v2', continuous=False) #, render_mode='human'




### Train

In [17]:


# Build the collator from the training split; its state/action dimensions are
# then used to size the Decision Transformer configuration.
collator = DecisionTransformerGymDataCollator(dataset["train"])

config = DecisionTransformerConfig(
    state_dim=collator.state_dim,
    act_dim=collator.act_dim,
)
print(config.to_dict())

model = TrainableDT(config)

# Hyperparameters whose TrainingArguments keyword equals their key in config_toml.
_toml_backed_args = {
    key: config_toml[key]
    for key in (
        "save_steps",
        "num_train_epochs",
        "per_device_train_batch_size",
        "learning_rate",
        "weight_decay",
        "warmup_ratio",
        "max_grad_norm",
    )
}

training_args = TrainingArguments(
    output_dir="output/",
    report_to="wandb",
    remove_unused_columns=False,
    optim="adamw_torch",
    # The logging interval is stored under a different key in the TOML config.
    logging_steps=config_toml["LOG_INTERVAL"],
    **_toml_backed_args,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=collator,
)

trainer.train()


{'action_num': 5, 'state_dim': 27648, 'act_dim': 1, 'hidden_size': 128, 'max_ep_len': 4096, 'action_tanh': True, 'vocab_size': 1, 'n_positions': 1024, 'n_layer': 3, 'n_head': 1, 'n_inner': None, 'activation_function': 'relu', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diver

