### Install initial environment in Google Colab

In [1]:
import sys
import os

# Only bootstrap the environment when this notebook runs inside Google Colab.
if 'google.colab' in sys.modules:
  # The marker file written at the end of this branch makes the installs run
  # only once; re-running the cell afterwards skips straight to the %cd.
  if not os.path.exists('/content/.already_installed'):
    # Fetch the project repository (presumably provides the `dt`, `utils` and
    # `extract_cnn` modules imported by later cells — confirm).
    !git clone https://github.com/FlutterbaseDotCom/hdt

    # System package first, then Python packages.
    # NOTE(review): none of the packages are version-pinned, so a fresh install
    # may resolve to different versions than the ones this notebook was run with.
    !apt-get install -y swig
    !pip install box2d-py
    !pip install 'gymnasium[box2d]'
    !pip install 'stable-baselines3[extra]'
    !pip install toml
    !pip install wandb
    !pip install datasets
    !pip install transformers
    !pip install torchviz
    !pip install accelerate -U

    # Record that setup completed so the branch above is skipped next time.
    with open('/content/.already_installed', 'w') as f:
        f.write('done')
  # Always work from inside the cloned repository when on Colab.
  %cd /content/hdt


### Imports

In [2]:
import os
import random
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torchviz
import wandb
from datasets import Dataset, load_dataset
from stable_baselines3 import DQN
from stable_baselines3.common.torch_layers import NatureCNN
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from torch.utils.data import Subset
from transformers import Trainer, TrainingArguments

from dt.configuration_decision_transformer import DecisionTransformerConfig
from dt.modeling_decision_transformer import DecisionTransformerModel
from extract_cnn import prepare_observation_array
from dt.trainable_dt import DecisionTransformerGymDataCollator, TrainableDT





  from .autonotebook import tqdm as notebook_tqdm


In [5]:
!pip install h5py


Collecting h5py
  Downloading h5py-3.10.0-cp310-cp310-macosx_10_9_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: h5py
Successfully installed h5py-3.10.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import h5py
import numpy as np

def save_to_hdf5(data, filename, use_compression=False):
    """Write a dictionary of array sequences to an HDF5 file.

    Every key of ``data`` becomes an HDF5 group; each element of the key's
    sequence is stored inside that group as a dataset named after its
    position ("0", "1", ...).

    Args:
        data: Mapping from group name to an iterable of array-like values.
        filename: Path of the HDF5 file, opened in 'w' mode.
        use_compression: When true, datasets are created with gzip compression.
    """
    # Only pass the compression keyword when it was actually requested.
    dataset_options = {"compression": "gzip"} if use_compression else {}

    with h5py.File(filename, 'w') as out_file:
        for group_name, arrays in data.items():
            target_group = out_file.create_group(group_name)
            for position, item in enumerate(arrays):
                target_group.create_dataset(str(position), data=item, **dataset_options)

def load_from_hdf5(filename):
    """Read back a file laid out as one group per key with datasets "0", "1", ...

    Args:
        filename: Path of the HDF5 file to open read-only.

    Returns:
        dict mapping each top-level group name to a list of numpy arrays,
        ordered by the integer dataset name within the group.
    """
    with h5py.File(filename, 'r') as in_file:
        # Arrays are materialised with np.array before the file is closed.
        return {
            group_name: [np.array(group[str(position)]) for position in range(len(group))]
            for group_name, group in in_file.items()
        }


In [12]:
# NOTE(review): `features` is only created by the "Generate Data" cell further
# down, so this cell fails on a fresh top-to-bottom run. It also writes
# 'features_unk.hdf5', while the next cell reads 'features.hdf5'.
save_to_hdf5(features, './temp_data/features_unk.hdf5', use_compression=False)

In [8]:
# Load a previously saved feature file back into a dict of lists of arrays.
f1 = load_from_hdf5('./temp_data/features.hdf5')

In [9]:
# Debug dump of the whole loaded dict (very large output).
print(f1)

{'actions': [array([3, 4, 4, 4, 2, 2, 0, 2, 3, 2, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 3, 3, 1, 1, 0, 2, 2, 2, 2, 2, 0, 2,
       0, 0, 3, 0, 0, 0, 0, 3, 3, 3, 3, 3, 1, 1, 1, 3, 0, 0, 3, 3, 0, 0,
       0, 1, 1, 3, 3, 0, 3, 0, 0, 0, 0, 1, 1, 3, 0, 2, 0, 0, 0, 1, 1, 1,
       3, 3, 2, 2, 2, 3, 2, 0, 4, 1, 3, 3]), array([3, 4, 4, 4, 4, 3, 0, 3, 3, 2, 3, 2, 3, 2, 3, 0, 3, 3, 3, 3, 3, 1,
       1, 1, 1, 3, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 2, 2, 2, 2, 2, 0,
       2, 1, 2, 1, 2, 1, 2, 3, 3, 3, 0, 0, 3, 0, 0, 0, 1, 1, 1, 1, 1, 3,
       3, 2, 2, 0, 3, 0, 0, 0, 0, 1, 1, 1, 3, 3, 2, 3, 0, 0, 0, 0, 1, 1,
       1, 3, 3, 2, 0, 2, 0, 0, 0, 2, 2, 0]), array([4, 3, 2, 2, 4, 2, 0, 2, 3, 3, 2, 1, 2, 0, 0, 4, 4, 0, 2, 2, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 4, 1, 1, 1, 1, 2, 1, 3, 3,
       3, 1, 3, 1, 3, 3, 3, 1, 1, 1, 2, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 3,
       1, 1, 1, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 3, 0, 1, 1, 1, 0, 0,
       3, 3, 3, 2, 2,

In [4]:
# Debug dump of the whole in-memory feature dict (very large output).
print(features)

{'observations': [[array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.39215687, 0.39215687, 0.39215687, ..., 0.        , 0.    

### Generate Data

In [3]:
from utils.config import MAX_EPISODE_STEPS, NUM_EPISODES, RTG_GAMMA


# Discrete-action CarRacing environment used for collecting rollouts
# (add render_mode='human' to the gym.make call to watch the episodes).
env = gym.make('CarRacing-v2', continuous=False)

# Load the DQN policy stored at `tmp_model_path`.
tmp_model_path = './models/dql_pretrained/dql_rl_11.zip'
model = DQN
loaded_model = model.load(tmp_model_path)

# One list per trajectory field; the rollout loop appends one entry per episode.
features = {
    field: []
    for field in ("observations", "actions", "rewards", "dones", "rtg")
}

def calculate_rtg(rewards, gamma):
    """Compute the discounted return-to-go for every step of one episode.

    ``rtg[i] = rewards[i] + gamma * rtg[i + 1]`` with ``rtg[-1] = rewards[-1]``.

    Args:
        rewards: Sequence (list or 1-D array) of per-step rewards.
        gamma: Discount factor applied to the following step's return-to-go.

    Returns:
        A numpy array with the same length as ``rewards``. Floating inputs keep
        their dtype; any other dtype is promoted to float64.
    """
    rewards_arr = np.asarray(rewards)
    # Integer (or bool) rewards would make the output array integer-typed and
    # silently truncate the discounted values, so promote them to float.
    if not np.issubdtype(rewards_arr.dtype, np.floating):
        rewards_arr = rewards_arr.astype(np.float64)

    rtg = np.zeros_like(rewards_arr)
    if rewards_arr.size == 0:
        # Nothing to accumulate for an empty episode.
        return rtg

    rtg[-1] = rewards_arr[-1]
    # Walk backwards so each step can reuse the already computed successor.
    for i in reversed(range(len(rewards_arr) - 1)):
        rtg[i] = rewards_arr[i] + gamma * rtg[i + 1]
    return rtg

# Roll out the loaded policy for NUM_EPISODES episodes and record, per episode,
# the flattened observations, actions, rewards, done flags and returns-to-go.
for episode in range(NUM_EPISODES):
    print(f"Episode: {episode} of {NUM_EPISODES}:" )
    obs, _ = env.reset()
    done = False

    # Per-episode buffers: observations, actions, rewards, done flags.
    o, a, r, d = [], [], [], []
    total_reward = 0
    sti = 0
    while not done:
        sti = sti + 1
        # Hard cap on the number of recorded steps per episode.
        if sti > MAX_EPISODE_STEPS:
            break

        action, _states = loaded_model.predict(obs, deterministic=True)
        new_obs, reward, terminated, truncated, _info = env.step(action)
        # Gymnasium reports episode end through two flags; stepping on after
        # either of them is set is not supported, so both end the episode.
        done = terminated or truncated
        total_reward = total_reward + reward
        # The observation stored is the one the action was chosen from.
        oarr = prepare_observation_array(obs)
        o.append(oarr.flatten())
        a.append(action.item())
        r.append(reward)
        d.append(done)
        obs = new_obs
        print(".", end="")

        # check if last 50 steps does not contain a single positive reward
        if len(r) > 100 and max(r[-50:]) <= 0:
            # Cut the last 50 steps from EVERY buffer (observations included)
            # so all fields keep the same length, then mark the episode done.
            o = o[:-50]
            r = r[:-50]
            d = d[:-50]
            a = a[:-50]
            d[-1] = True
            print('\nstopping due to the last 50 steps not negative rewards')
            break
    # NOTE: total_reward still includes the reward of any steps cut above.
    print(f"\nTotal reward: {total_reward} episodes steps: {len(o)}")

    features["observations"].append(o)
    features["actions"].append(a)
    features["rewards"].append(r)
    features["dones"].append(d)
    episode_rtg = calculate_rtg(r, RTG_GAMMA)
    features["rtg"].append(episode_rtg)

env.close()
#print(len(features["actions"]))






Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>


Episode: 0 of 20:
....................................................................................................
Total reward: 22.25806451612903 episodes steps: 100
Episode: 1 of 20:
....................................................................................................
Total reward: 16.49006622516558 episodes steps: 100
Episode: 2 of 20:
....................................................................................................
Total reward: 6.835016835016857 episodes steps: 100
Episode: 3 of 20:
....................................................................................................
Total reward: 9.455252918287968 episodes steps: 100
Episode: 4 of 20:
....................................................................................................
Total reward: 23.003300330033 episodes steps: 100
Episode: 5 of 20:
....................................................................................................
Total reward: 14.73498233215

### Persist Dataset

In [16]:
#dataset = Dataset.from_dict(features)

class FeatureDataset(Dataset):
    """Index-addressable view over the per-episode ``features`` dictionary.

    Integer indices return one episode as a dict with the keys
    "observations", "actions", "rewards", "dones" and "rtg"; string indices
    return the whole list stored under that key in the source dict.

    NOTE(review): ``Dataset`` resolves to whatever the imports cell bound
    (``from datasets import Dataset``) and ``__init__`` does not call the
    base-class initialiser — confirm this is intended.
    """

    def __init__(self, src):
        # Episode count comes from "observations"; the other lists are
        # assumed to have the same length.
        self.size = len(src["observations"])
        self.src = src

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        """Single lookup; delegates to ``_item``."""
        return self._item(index)

    def __getitems__(self, index):
        """Batched lookup: ``index`` is an iterable of individual indices."""
        return list(map(self._item, index))

    def _item(self, idx):
        """Internal lookup shared by ``__getitem__`` and ``__getitems__``."""
        # A string selects a whole column of the source dict.
        if isinstance(idx, str):
            return self.src[idx]

        episode_fields = ("observations", "actions", "rewards", "dones", "rtg")
        return {name: self.src[name][idx] for name in episode_fields}

# Wrap the collected rollouts so they can be looked up per episode.
feature_dataset = FeatureDataset(src=features)
# Sanity check: length of the first (flattened) observation of the first episode.
len(feature_dataset[0]["observations"][0])
#dataset.save_to_disk('datasets/car_racing_0070_0045/')

# dataset_size = len(dataset)
# split_point = int(0.9 * dataset_size)
# #dataset is already shuffled
# train_dataset = Subset(dataset, range(split_point))
# val_dataset = Subset(dataset, range(split_point, dataset_size))

#from datasets import load_from_disk
#dataset = load_from_disk('datasets/car_racing_002/')


27648

### Configure Weights & Biases logging

In [6]:
from utils.config import CONFIG, WANDB_ID, WNDB_NAME
# TOML-formatted string


# Weights & Biases settings read by wandb and the HF Trainer integration.
os.environ["WANDB_DISABLED"] = "false"
os.environ['WANDB_NOTEBOOK_NAME'] = 'DT.ipynb'
os.environ['WANDB_MODE']='online'
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# SECURITY: never hard-code the API key in the notebook. Export WANDB_API_KEY
# before starting the kernel; when it is unset, key=None lets wandb fall back
# to its own credential lookup (netrc / interactive prompt).
# Any key that was previously committed here should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# NOTE(review): WANDB_ID is passed as `resume`; if it holds a run id rather
# than a resume mode such as "allow", it probably belongs in `id=` — confirm.
wandb.init(
    resume=WANDB_ID,
    name=WNDB_NAME,
    mode="online",
    entity="yakiv",
    project="CarRacingDT",
    config=CONFIG,
)


env =  gym.make('CarRacing-v2', continuous=False) #, render_mode='human'


[34m[1mwandb[0m: Currently logged in as: [33myakiv[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/jacob/.netrc


### Train

In [17]:

# The collator is built over the full feature dataset and exposes the
# state/action dimensions used to configure the model.
collator = DecisionTransformerGymDataCollator(
    feature_dataset,
    max_len=CONFIG["max_length"],
    max_ep_len=CONFIG["max_ep_len"],
)

dt_config = DecisionTransformerConfig(
    state_dim=collator.state_dim,
    act_dim=collator.act_dim,
    max_length=CONFIG["max_length"],
    max_ep_len=CONFIG["max_ep_len"],
)
print(dt_config.to_dict())

model = TrainableDT(dt_config)

# Options whose TrainingArguments name matches the CONFIG key are forwarded
# in bulk; the remaining options are spelled out explicitly.
training_args = TrainingArguments(
    output_dir="output/",
    report_to="wandb",
    remove_unused_columns=False,
    optim="adamw_torch",
    logging_steps=CONFIG["LOG_INTERVAL"],
    **{
        option: CONFIG[option]
        for option in (
            "save_steps",
            "num_train_epochs",
            "per_device_train_batch_size",
            "learning_rate",
            "weight_decay",
            "warmup_ratio",
            "max_grad_norm",
        )
    },
)

class DummyDataset(Dataset):
    """Placeholder dataset that hands back the requested index unchanged.

    Both the single-item and the batched accessor print the index they were
    given and return it as-is, so whatever consumes the batch receives
    indices rather than samples.
    """

    def __init__(self, size):
        self.size = size
        self.arr = [0] * size

    def __len__(self):
        return self.size

    def _echo(self, index):
        """Print ``index`` and return it unchanged."""
        print(index)
        return index

    def __getitem__(self, index):
        return self._echo(index)

    def __getitems__(self, index):
        return self._echo(index)
        
        

    

# The train dataset only yields indices (one per entry of feature_dataset);
# the collator, which was constructed with feature_dataset, builds the batches.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=DummyDataset(len(feature_dataset)),
)

trainer.train()


{'action_num': 6, 'state_dim': 27648, 'act_dim': 1, 'hidden_size': 128, 'max_ep_len': 1000, 'action_tanh': True, 'vocab_size': 1, 'n_positions': 1024, 'n_layer': 3, 'n_head': 1, 'n_inner': None, 'activation_function': 'relu', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 10, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diver

  0%|          | 0/2 [00:00<?, ?it/s]

[8, 0, 9, 1, 4, 5, 6, 3]
Collator sz:8: started at 14:38:52
Collator sz:8: finished at 14:38:52 and took 15ms
[2, 7]
Collator sz:2: started at 14:38:52
Collator sz:2: finished at 14:38:52 and took 2ms
DecisionTransformerModel.forward batch on device: mps:0 sz:8 seq len: 10 : started at 14:38:52
DecisionTransformerModel.forward batch on device: mps:0 sz:8 seq len: 10 : finished at 14:38:52 and took 565ms
Trainable Forward pass: started at 14:38:52
loss: 1.6918296813964844
Trainable Forward pass: finished at 14:38:52 and took 129ms


 50%|█████     | 1/2 [00:01<00:01,  2.00s/it]

DecisionTransformerModel.forward batch on device: mps:0 sz:2 seq len: 10 : started at 14:38:54
DecisionTransformerModel.forward batch on device: mps:0 sz:2 seq len: 10 : finished at 14:38:54 and took 364ms
Trainable Forward pass: started at 14:38:54
loss: 1.633987307548523
Trainable Forward pass: finished at 14:38:54 and took 59ms


100%|██████████| 2/2 [00:02<00:00,  1.38s/it]

{'train_runtime': 2.9874, 'train_samples_per_second': 3.347, 'train_steps_per_second': 0.669, 'train_loss': 1.6629085540771484, 'epoch': 1.0}


100%|██████████| 2/2 [00:04<00:00,  2.30s/it]


TrainOutput(global_step=2, training_loss=1.6629085540771484, metrics={'train_runtime': 2.9874, 'train_samples_per_second': 3.347, 'train_steps_per_second': 0.669, 'train_loss': 1.6629085540771484, 'epoch': 1.0})

In [None]:
#play
import matplotlib.pyplot as plt
from IPython.display import display as ipy_display, clear_output
#import gymnasium as gym
# build the environment
# Evaluation rollout: drive the trained model in CarRacing for one episode,
# rendering a frame every `print_every` steps.
max_ep_len = 1000
device = 'cpu'
model = model.to(device)
scale = 1000.0  # normalization for rewards/returns
TARGET_RETURN = 900 / scale  # evaluation is conditioned on a raw return of 900, scaled accordingly

env = gym.make('CarRacing-v2', render_mode='rgb_array', continuous=False)

state_dim = 96*96*3
act_dim = 1

# Episode bookkeeping and the growing history tensors fed to the model.
episode_return, episode_length = 0, 0
state, _ = env.reset()
state = prepare_observation_array(state)
target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
actions = torch.zeros((0, act_dim), device=device, dtype=torch.long)
rewards = torch.zeros(0, device=device, dtype=torch.float32)
print_every = 10

timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)
try:
    for t in range(max_ep_len):
        # Append placeholders for the step about to be taken; they are
        # overwritten once the action and reward are known.
        actions = torch.cat([actions, torch.zeros((1, act_dim), dtype=torch.long, device=device)], dim=0)
        rewards = torch.cat([rewards, torch.zeros(1, device=device)])

        # NOTE(review): `get_action` is not defined or imported in any cell
        # visible in this notebook — confirm where it comes from.
        action = get_action(
            model,
            states,
            actions,
            rewards,
            target_return,
            timesteps,
        )

        # Pick the highest-scoring discrete action.
        action = torch.argmax(action).item()

        actions[-1] = torch.tensor(action, dtype=torch.long)

        state, reward, terminated, truncated, _ = env.step(action)

        if (t + 1) % print_every == 0:
            image = env.render()
            clear_output(wait=True)
            plt.clf()  # drop the previous frame instead of stacking images on one figure
            plt.imshow(image)
            plt.axis('off')  # Hide the axis
            ipy_display(plt.gcf())

        state = prepare_observation_array(state)
        # Match the dtype of `states` so concatenation never changes it.
        cur_state = torch.from_numpy(state).to(device=device, dtype=torch.float32).reshape(1, state_dim)
        states = torch.cat([states, cur_state], dim=0)
        rewards[-1] = reward

        # Remaining target return after collecting this step's (scaled) reward.
        pred_return = target_return[0, -1] - (reward / scale)
        target_return = torch.cat([target_return, pred_return.reshape(1, 1)], dim=1)
        timesteps = torch.cat([timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

        episode_return += reward
        episode_length += 1

        # Gymnasium signals episode end through two flags; honour both.
        if terminated or truncated:
            break
finally:
    # Release the environment even if the rollout fails part-way.
    env.close()