### Install initial environment in Google Colab

In [1]:
import sys
import os

if 'google.colab' in sys.modules:
  if not os.path.exists('/content/.already_installed'):
    !git clone https://github.com/FlutterbaseDotCom/hdt

    !apt-get install -y swig
    !pip install box2d-py
    !pip install 'gymnasium[box2d]'
    !pip install 'stable-baselines3[extra]'
    !pip install toml
    !pip install wandb
    !pip install datasets
    !pip install transformers
    !pip install torchviz
    !pip install accelerate -U

    with open('/content/.already_installed', 'w') as f:
        f.write('done')
  %cd /content/hdt


### Imports

In [2]:
import os
import random
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torchviz
import wandb
from datasets import Dataset, load_dataset
from stable_baselines3 import DQN
from stable_baselines3.common.torch_layers import NatureCNN
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from torch.utils.data import Subset
from transformers import Trainer, TrainingArguments

from dt.configuration_decision_transformer import DecisionTransformerConfig
from dt.modeling_decision_transformer import DecisionTransformerModel
from extract_cnn import prepare_observation_array
from dt.trainable_dt import DecisionTransformerGymDataCollator, TrainableDT





  from .autonotebook import tqdm as notebook_tqdm


### Generate Data

In [4]:
from utils.config import MAX_EPISODE_STEPS, NUM_EPISODES

# Roll out a pretrained DQN policy in CarRacing (discrete actions) and record,
# per episode, the flattened observations, actions, rewards and done flags.
# The result is accumulated in `features`: one list entry per episode under each key.

# Stall cut-off: once an episode has more than STALL_MIN_STEPS recorded steps and
# none of the last STALL_WINDOW rewards is positive, those last STALL_WINDOW steps
# are discarded and the episode is ended.
STALL_MIN_STEPS = 100
STALL_WINDOW = 50

env = gym.make('CarRacing-v2', continuous=False)  # pass render_mode='human' to watch the rollout
model = DQN
tmp_model_path = './models/dql_pretrained/dql_rl_11.zip'
loaded_model = model.load(tmp_model_path)

features = {
    "observations": [],
    "actions": [],
    "rewards": [],
    "dones": [],
}
try:
    for episode in range(NUM_EPISODES):
        print(f"Episode: {episode} of {NUM_EPISODES}:")
        obs, _ = env.reset()
        done = False

        # Per-episode buffers: observations, actions, rewards, done flags.
        o, a, r, d = [], [], [], []
        total_reward = 0
        sti = 0
        while not done:
            sti = sti + 1
            if sti > MAX_EPISODE_STEPS:
                break

            action, _states = loaded_model.predict(obs, deterministic=True)
            # Gymnasium's step() returns separate `terminated` and `truncated`
            # flags; the episode is over when either one is set.
            new_obs, reward, terminated, truncated, _info = env.step(action)
            done = bool(terminated or truncated)
            total_reward = total_reward + reward
            # Store the observation the action was chosen from (not the successor).
            oarr = prepare_observation_array(obs)
            o.append(oarr.flatten())
            a.append(action.item())
            r.append(reward)
            d.append(done)
            obs = new_obs
            print(".", end="")

            # Stop when the last STALL_WINDOW steps contain no positive reward.
            if len(r) > STALL_MIN_STEPS and max(r[-STALL_WINDOW:]) <= 0:
                # Drop the stalled tail from ALL four buffers so they stay aligned,
                # then mark the new last step as the end of the episode.
                o = o[:-STALL_WINDOW]
                a = a[:-STALL_WINDOW]
                r = r[:-STALL_WINDOW]
                d = d[:-STALL_WINDOW]
                d[-1] = True
                print(f'\nstopping: no positive reward in the last {STALL_WINDOW} steps')
                break
        # NOTE: total_reward also counts rewards of steps discarded by the stall cut-off.
        print(f"\nTotal reward: {total_reward} episodes steps: {len(o)}")

        assert len(o) == len(a) == len(r) == len(d), "episode buffers are misaligned"
        features["observations"].append(o)
        features["actions"].append(a)
        features["rewards"].append(r)
        features["dones"].append(d)
finally:
    # Release the environment even if the rollout is interrupted or raises.
    env.close()
print(len(features["actions"]))






Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>


Episode: 0 of 70:
.............................................
Total reward: 6.2142857142857295 episodes steps: 45
Episode: 1 of 70:
.............................................
Total reward: 9.785714285714302 episodes steps: 45
Episode: 2 of 70:
.............................................
Total reward: 9.437282229965172 episodes steps: 45
Episode: 3 of 70:
.............................................
Total reward: 6.7781954887218205 episodes steps: 45
Episode: 4 of 70:
.............................................
Total reward: 9.245704467353967 episodes steps: 45
Episode: 5 of 70:
.............................................
Total reward: 4.875000000000007 episodes steps: 45
Episode: 6 of 70:
.............................................
Total reward: 5.601010101010118 episodes steps: 45
Episode: 7 of 70:
.............................................
Total reward: 3.43650793650795 episodes steps: 45
Episode: 8 of 70:
.............................................
Total reward: 9

### Build Dataset (optionally persist to disk)

In [5]:
# Build a Hugging Face `Dataset` from the collected rollouts. Every key of
# `features` holds one list entry per recorded episode.
dataset = Dataset.from_dict(features)
# Uncomment to save the freshly generated dataset to disk.
#dataset.save_to_disk('datasets/car_racing_0070_0045/')

# Disabled: 90/10 train/validation split using torch `Subset`.
# dataset_size = len(dataset)
# split_point = int(0.9 * dataset_size)
# #dataset is already shuffled
# train_dataset = Subset(dataset, range(split_point))
# val_dataset = Subset(dataset, range(split_point, dataset_size))

# Disabled: load a previously saved dataset instead of regenerating it.
#from datasets import load_from_disk
#dataset = load_from_disk('datasets/car_racing_002/')


### Wrap dataset into splits (train only)

In [6]:
# Expose the dataset under a 'train' key, as the training cell expects
# (`dataset["train"]`). The guard keeps this cell idempotent: re-running it must
# not wrap the mapping a second time ({'train': {'train': ...}}).
if not isinstance(dataset, dict):
    dataset = {
        'train': dataset,
        # 'validation': val_dataset
    }


In [7]:
from utils.config import CONFIG, WANDB_ID, WNDB_NAME

# Weights & Biases configuration: enable online logging and upload model
# checkpoints produced by the Hugging Face Trainer.
os.environ["WANDB_DISABLED"] = "false"
os.environ['WANDB_NOTEBOOK_NAME'] = 'DT.ipynb'
os.environ['WANDB_MODE'] = 'online'
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# SECURITY: never hard-code the API key in the notebook. With no explicit key,
# wandb.login() takes it from the WANDB_API_KEY environment variable or the
# local netrc file, and otherwise prompts for it interactively.
# The key that was previously committed here must be treated as leaked and revoked.
wandb.login()

# NOTE(review): the run identifier is passed via `resume`; wandb documents `id=`
# for the run ID and `resume` for the resume policy ("allow", "must", ...).
# Left unchanged because the value of WANDB_ID is not visible here - confirm.
wandb.init(
    resume=WANDB_ID,
    name=WNDB_NAME,
    mode="online",
    entity="yakiv",
    project="CarRacingDT",
    # resume="allow"
    config=CONFIG,
)


env = gym.make('CarRacing-v2', continuous=False)  # pass render_mode='human' to watch


[34m[1mwandb[0m: Currently logged in as: [33myakiv[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/jacob/.netrc


### Train

In [8]:

# Batch builder for the training split; sequence window and episode-length cap
# both come from CONFIG.
train_split = dataset["train"]
collator = DecisionTransformerGymDataCollator(
    train_split,
    max_len=CONFIG["max_length"],
    max_ep_len=CONFIG["max_ep_len"],
)

# Model configuration: state/action dimensions are taken from the collator.
dt_config = DecisionTransformerConfig(
    state_dim=collator.state_dim,
    act_dim=collator.act_dim,
    max_length=CONFIG["max_length"],
    max_ep_len=CONFIG["max_ep_len"],
)
print(dt_config.to_dict())

model = TrainableDT(dt_config)

# Hyperparameters whose TrainingArguments keyword is identical to the CONFIG key.
passthrough_keys = (
    "save_steps",
    "num_train_epochs",
    "per_device_train_batch_size",
    "learning_rate",
    "weight_decay",
    "warmup_ratio",
    "max_grad_norm",
)
training_args = TrainingArguments(
    output_dir="output/",
    report_to="wandb",
    remove_unused_columns=False,
    optim="adamw_torch",
    **{key: CONFIG[key] for key in passthrough_keys},
    logging_steps=CONFIG["LOG_INTERVAL"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    data_collator=collator,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


{'action_num': 6, 'state_dim': 27648, 'act_dim': 1, 'hidden_size': 128, 'max_ep_len': 1000, 'action_tanh': True, 'vocab_size': 1, 'n_positions': 1024, 'n_layer': 3, 'n_head': 1, 'n_inner': None, 'activation_function': 'relu', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 10, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diver

  0%|          | 0/9 [00:00<?, ?it/s]

Collator sz:8: started at 10:21:16
Collator sz:8: finished at 10:21:22 and took 5465ms
Collator sz:8: started at 10:21:27
Collator sz:8: finished at 10:21:32 and took 5369ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:21:33
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:21:34 and took 1383ms
Trainable Forward pass: started at 10:21:34
loss: 1.6033169031143188
Trainable Forward pass: finished at 10:21:34 and took 219ms


 11%|█         | 1/9 [00:25<03:23, 25.41s/it]

Collator sz:8: started at 10:21:40
Collator sz:8: finished at 10:21:46 and took 5321ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:21:46
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:21:46 and took 91ms
Trainable Forward pass: started at 10:21:46
loss: 1.6081148386001587
Trainable Forward pass: finished at 10:21:46 and took 87ms


 22%|██▏       | 2/9 [00:36<01:58, 16.91s/it]

Collator sz:8: started at 10:21:51
Collator sz:8: finished at 10:21:56 and took 4966ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:21:56
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:21:56 and took 95ms
Trainable Forward pass: started at 10:21:56
loss: 1.5031492710113525
Trainable Forward pass: finished at 10:21:57 and took 91ms


 33%|███▎      | 3/9 [00:46<01:23, 13.91s/it]

Collator sz:8: started at 10:22:02
Collator sz:8: finished at 10:22:06 and took 4876ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:22:07
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:22:07 and took 91ms
Trainable Forward pass: started at 10:22:07
loss: 1.4670196771621704
Trainable Forward pass: finished at 10:22:07 and took 97ms


 44%|████▍     | 4/9 [00:56<01:02, 12.43s/it]

Collator sz:8: started at 10:22:12
Collator sz:8: finished at 10:22:17 and took 4957ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:22:17
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:22:17 and took 91ms
Trainable Forward pass: started at 10:22:17
loss: 1.3986486196517944
Trainable Forward pass: finished at 10:22:17 and took 13ms


 56%|█████▌    | 5/9 [01:07<00:46, 11.69s/it]

{'loss': 1.516, 'learning_rate': 5e-05, 'epoch': 0.56}
Collator sz:8: started at 10:22:22
Collator sz:8: finished at 10:22:27 and took 4875ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:22:27
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:22:27 and took 95ms
Trainable Forward pass: started at 10:22:27
loss: 1.416059136390686
Trainable Forward pass: finished at 10:22:27 and took 86ms


 67%|██████▋   | 6/9 [01:17<00:33, 11.20s/it]

Collator sz:8: started at 10:22:32
Collator sz:8: finished at 10:22:37 and took 5057ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:22:38
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:22:38 and took 90ms
Trainable Forward pass: started at 10:22:38
loss: 1.3473986387252808
Trainable Forward pass: finished at 10:22:38 and took 82ms


 78%|███████▊  | 7/9 [01:27<00:21, 10.92s/it]

Collator sz:6: started at 10:22:42
Collator sz:6: finished at 10:22:45 and took 3741ms
DecisionTransformerModel.forward batch sz:8 seq len: 10 : started at 10:22:45
DecisionTransformerModel.forward batch sz:8 seq len: 10 : finished at 10:22:46 and took 87ms
Trainable Forward pass: started at 10:22:46
loss: 1.5513800382614136
Trainable Forward pass: finished at 10:22:46 and took 46ms


 89%|████████▉ | 8/9 [01:35<00:09,  9.95s/it]

DecisionTransformerModel.forward batch sz:6 seq len: 10 : started at 10:22:46
DecisionTransformerModel.forward batch sz:6 seq len: 10 : finished at 10:22:46 and took 326ms
Trainable Forward pass: started at 10:22:46
loss: 1.4486445188522339
Trainable Forward pass: finished at 10:22:46 and took 116ms


100%|██████████| 9/9 [01:36<00:00,  7.10s/it]

{'train_runtime': 96.5809, 'train_samples_per_second': 0.725, 'train_steps_per_second': 0.093, 'train_loss': 1.4826368755764432, 'epoch': 1.0}


100%|██████████| 9/9 [01:38<00:00, 10.91s/it]


TrainOutput(global_step=9, training_loss=1.4826368755764432, metrics={'train_runtime': 96.5809, 'train_samples_per_second': 0.725, 'train_steps_per_second': 0.093, 'train_loss': 1.4826368755764432, 'epoch': 1.0})