In [1]:
import d3rlpy
import json
import cv2
import numpy as np
from tqdm.notebook import tqdm
from pathlib import Path
from torch import nn
import torch
import torch.nn.functional as F

## 1. Prepare the count dataset

In [2]:
from torch.utils.data import TensorDataset, DataLoader
import torch
from tqdm.notebook import tqdm

# Load the consolidated dataset from `dataset.npz` if it exists; otherwise build it
# from the per-trajectory files by running every frame through the vision extractor,
# then cache the result so later runs take the fast path.
#
# Produces the module-level arrays: features, cnts, phases, rewards, actions,
# timeouts, terminals.
detection = False  # only used here to select the dataset directory and passed to process_raw_data
data_dir = f'./Dataset/detection_{detection}'
dataset_cache = Path(data_dir) / 'dataset.npz'
traj_list = []

if dataset_cache.exists():
    # Fast path: reuse the arrays written by a previous run of this cell.
    # The context manager closes the underlying archive once the arrays are read.
    with np.load(str(dataset_cache)) as data:
        features, cnts, phases, rewards, actions, timeouts, terminals = (
            data['features'], data['cnts'], data['phases'], data['rewards'],
            data['actions'], data['timeouts'], data['terminals'],
        )
else:
    # NOTE(review): PreTrainedModifiedResNet, process_meta_data and process_raw_data are
    # not defined or imported in the visible cells — they must be in scope before this
    # branch can run on a fresh kernel.
    # NOTE(review): if the extractor contains BatchNorm/Dropout layers it should probably
    # be put in eval mode for feature extraction — confirm whether the class does this itself.
    vision_extractor = PreTrainedModifiedResNet().to("cuda")

    if not Path(data_dir).exists():
        # No processed trajectories yet: generate them from the raw camera metadata.
        processed = process_meta_data('./Dataset/Intersection_camera.json')
        print("Done meta data")
        process_code = process_raw_data(processed, detection=detection)

    # pathlib is used instead of os.listdir because `os` is never imported in this notebook.
    traj_list = [entry.name for entry in Path(data_dir).iterdir() if 'traj' in entry.name]
    print(f'Total trajectories: {len(traj_list)}')
    if not traj_list:
        raise ValueError(f"No trajectory files containing 'traj' were found in {data_dir}")

    features_list = []
    cnts_list = []
    phases_list = []
    rewards_list = []
    actions_list = []
    timeouts_list = []
    terminals_list = []

    for traj in traj_list:
        with np.load(str(Path(data_dir) / traj)) as data:
            imgs, cnts, phases, rewards, actions, timeouts, terminals = (
                data['imgs'], data['cnts'], data['phases'], data['rewards'],
                data['actions'], data['timeouts'], data['terminals'],
            )

        # The images are too large to push through the network at once, so they are
        # fed to the extractor in small batches.
        # NOTE(review): reshape reinterprets the buffer as (N, 3, 1080, 1920) without
        # transposing — confirm the frames are stored channel-first.
        raw_states = imgs.reshape((-1, 3, 1080, 1920))
        raw_data = torch.Tensor(raw_states)
        raw_dataset = TensorDataset(raw_data)
        raw_loader = DataLoader(raw_dataset, batch_size=16)

        # Use the pre-trained model to extract features. Inference only: disabling
        # autograd and moving each batch back to the CPU keeps GPU memory bounded and
        # makes the later `.numpy()` call valid (it fails on CUDA tensors).
        extracted_list = []
        with torch.no_grad():
            for b in tqdm(raw_loader):
                extracted = vision_extractor(b[0].to("cuda"))
                extracted_list.append(extracted.detach().cpu())

        features = torch.concat(extracted_list, dim=0)
        # presumably 4 feature vectors of size 2048 per time step — TODO confirm
        features = features.reshape(-1, 4, 2048)
        features = features.numpy()

        features_list.append(features)
        cnts_list.append(cnts)
        phases_list.append(phases)
        rewards_list.append(rewards)
        actions_list.append(actions)
        timeouts_list.append(timeouts)
        terminals_list.append(terminals)

    # Stitch all trajectories together along the step axis.
    features = np.concatenate(features_list, axis=0)
    cnts = np.concatenate(cnts_list, axis=0)
    phases = np.concatenate(phases_list, axis=0)
    rewards = np.concatenate(rewards_list, axis=0)
    actions = np.concatenate(actions_list, axis=0)
    timeouts = np.concatenate(timeouts_list, axis=0)
    terminals = np.concatenate(terminals_list, axis=0)
    # Report the shape instead of dumping the full feature arrays into the cell output.
    print(f'Extracted features for {len(features_list)} trajectories, shape {features.shape}')

    np.savez(str(dataset_cache), features=features, cnts=cnts, phases=phases,
             rewards=rewards, actions=actions, timeouts=timeouts, terminals=terminals)

# Relabel every action as binary: 0 when the recorded action equals the phase at the
# same step, 1 otherwise (presumably "keep" vs. "change" phase — confirm).
# Both arrays are flattened first so the comparison stays strictly elementwise even if
# one of them is stored as a column vector.
# NOTE: this overwrites `actions`, so the cell is not idempotent — re-running it without
# reloading the dataset compares the already-relabelled 0/1 values against `phases`.
actions = np.where(
    np.asarray(actions).reshape(-1) == np.asarray(phases).reshape(-1), 0, 1
)

## 2. Rebuild the dataset with counts and phases

In [3]:
from utils.transform import one_hot

# Encoding of the phases via the project helper (presumably one-hot over 4 classes —
# confirm); it is computed here but not used by the state built below.
onehot_phases = one_hot(phases, 4)

# Observation vector = count features followed by the phase id shifted down by one.
phase_column = phases.reshape(-1, 1) - 1
states = np.concatenate((cnts, phase_column), axis=1, dtype=np.float32)

# Rewards are negated before being handed to d3rlpy.
# NOTE(review): the preceding cell relabels `actions` to 0/1, so `actions - 1` evaluates
# to -1/0 here, whereas the recorded output reports action_size=4 — confirm which action
# encoding is intended before training.
mdp_kwargs = dict(
    observations=states,
    actions=actions - 1,
    rewards=-rewards,
    terminals=terminals,
    timeouts=timeouts,
)
dataset = d3rlpy.dataset.MDPDataset(**mdp_kwargs)

2024-04-05 20:14.01 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(9,)]) reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)])
2024-04-05 20:14.01 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2024-04-05 20:14.01 [info     ] Action size has been automatically determined. action_size=4


## 3. Train a count-based offline RL model

In [24]:
from d3rlpy.algos import  DQN, DQNConfig
from d3rlpy.metrics.evaluators import TDErrorEvaluator
from d3rlpy.metrics.evaluators import AverageValueEstimationEvaluator
from d3rlpy.preprocessing import StandardObservationScaler


In [39]:
# Discrete CQL with a quantile-regression Q-function (200 quantiles); rewards are
# clipped to [-1, 1]. No observation scaler is configured.
cql_config = d3rlpy.algos.DiscreteCQLConfig(
    learning_rate=5e-5,
    optim_factory=d3rlpy.models.optimizers.AdamFactory(eps=1e-2 / 32),
    batch_size=32,
    alpha=4.0,
    q_func_factory=d3rlpy.models.q_functions.QRQFunctionFactory(n_quantiles=200),
    # observation_scaler=
    target_update_interval=2000,
    reward_scaler=d3rlpy.preprocessing.ClipRewardScaler(-1.0, 1.0),
)
cql = cql_config.create(device="cuda:0")

# env_scorer = d3rlpy.metrics.EnvironmentEvaluator(env, epsilon=0.001)

# Single epoch of 5000 gradient steps; only the average value estimate is tracked.
cql_evaluators = {
    # 'td_error': TDErrorEvaluator(),
    'value_scale': AverageValueEstimationEvaluator(),
}
cql.fit(
    dataset,
    n_steps=5000,
    n_steps_per_epoch=5000,
    evaluators=cql_evaluators,
    show_progress=True,
)

# Export the trained policy to disk.
cql.save_policy("cnt_cql_policy3_action_final.pt")

2024-03-27 16:52.35 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(9,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)
2024-03-27 16:52.35 [info     ] Directory is created at d3rlpy_logs\DiscreteCQL_20240327165235
2024-03-27 16:52.35 [debug    ] Building models...            
2024-03-27 16:52.35 [debug    ] Models have been built.       
2024-03-27 16:52.35 [info     ] Parameters                     params={'observation_shape': [9], 'action_size': 4, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'clip', 'params': {'low': -1.0, 'high': 1.0, 'multiplier': 1.0}}, 'learning_rate': 5e-05, 'optim_factory':

Epoch 1/1:   0%|          | 0/5000 [00:00<?, ?it/s]

2024-03-27 16:53.08 [info     ] DiscreteCQL_20240327165235: epoch=1 step=5000 epoch=1 metrics={'time_sample_batch': 0.0008152896881103515, 'time_algorithm_update': 0.005668901443481446, 'loss': 8.324113044834137, 'td_loss': 3.4253884801864625, 'conservative_loss': 1.2246811416983605, 'time_step': 0.006611674165725708, 'value_scale': -1.9055350453160746} step=5000
2024-03-27 16:53.08 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20240327165235\model_5000.d3


In [44]:

# DQN baseline using the same learning rate, Adam epsilon, target update interval and
# reward clipping as the CQL run; no observation scaler is configured.
dqn_config = DQNConfig(
    learning_rate=5e-5,
    optim_factory=d3rlpy.models.optimizers.AdamFactory(eps=1e-2 / 32),
    target_update_interval=2000,
    reward_scaler=d3rlpy.preprocessing.ClipRewardScaler(-1.0, 1.0),
    # observation_scaler=StandardObservationScaler()
)
dqn = dqn_config.create(device="cuda:0")

# Single epoch of 50000 gradient steps; only the average value estimate is tracked.
dqn_evaluators = {
    # 'td_error': TDErrorEvaluator(),
    'value_scale': AverageValueEstimationEvaluator(),
}
dqn.fit(
    dataset,
    # eval_episodes=test_episodes,
    n_steps=50000,
    n_steps_per_epoch=50000,
    evaluators=dqn_evaluators,
)

# Export the trained policy to disk.
dqn.save_policy("cnt_policy1_dqn_action_final_50000.pt")

2024-03-27 17:31.26 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(9,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)
2024-03-27 17:31.26 [info     ] Directory is created at d3rlpy_logs\DQN_20240327173126
2024-03-27 17:31.26 [debug    ] Building models...            
2024-03-27 17:31.26 [debug    ] Models have been built.       
2024-03-27 17:31.26 [info     ] Parameters                     params={'observation_shape': [9], 'action_size': 4, 'config': {'type': 'dqn', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'clip', 'params': {'low': -1.0, 'high': 1.0, 'multiplier': 1.0}}, 'learning_rate': 5e-05, 'optim_factory': {'type': 'adam',

Epoch 1/1:   0%|          | 0/50000 [00:00<?, ?it/s]

2024-03-27 17:36.16 [info     ] DQN_20240327173126: epoch=1 step=50000 epoch=1 metrics={'time_sample_batch': 0.0011114589023590088, 'time_algorithm_update': 0.004289542679786682, 'loss': 0.18937634423396085, 'time_step': 0.005757480177879333, 'value_scale': -2.7958215391057983} step=50000
2024-03-27 17:36.16 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20240327173126\model_50000.d3


In [45]:
from d3rlpy.algos import DoubleDQNConfig

# Double DQN with the library's default hyper-parameters.
double_dqn_config = DoubleDQNConfig()
double_dqn = double_dqn_config.create(device="cuda:0")

# Train for a single epoch of 50000 steps, tracking the average value estimate.
double_dqn_evaluators = {'value_scale': AverageValueEstimationEvaluator()}
double_dqn.fit(
    dataset,
    n_steps=50000,
    n_steps_per_epoch=50000,
    evaluators=double_dqn_evaluators,
)

# Export the trained policy to disk.
double_dqn.save_policy("cnt_policy1_double_dqn_action_final_50000.pt")

2024-03-27 20:00.03 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(9,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)
2024-03-27 20:00.03 [info     ] Directory is created at d3rlpy_logs\DoubleDQN_20240327200003
2024-03-27 20:00.03 [debug    ] Building models...            
2024-03-27 20:00.04 [debug    ] Models have been built.       
2024-03-27 20:00.04 [info     ] Parameters                     params={'observation_shape': [9], 'action_size': 4, 'config': {'type': 'double_dqn', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'learning_rate': 6.25e-05, 'optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 

Epoch 1/1:   0%|          | 0/50000 [00:00<?, ?it/s]

2024-03-27 20:03.35 [info     ] DoubleDQN_20240327200003: epoch=1 step=50000 epoch=1 metrics={'time_sample_batch': 0.0007376984310150147, 'time_algorithm_update': 0.0032367820024490354, 'loss': 0.040612651078351776, 'time_step': 0.004196608924865722, 'value_scale': -3.2678763409557123} step=50000
2024-03-27 20:03.35 [info     ] Model parameters are saved to d3rlpy_logs\DoubleDQN_20240327200003\model_50000.d3


In [46]:
from d3rlpy.algos import DiscreteSACConfig

# Discrete SAC with the library's default hyper-parameters.
sac_config = DiscreteSACConfig()
sac = sac_config.create(device="cuda:0")

# Train for a single epoch of 30000 steps, tracking the average value estimate.
sac_evaluators = {'value_scale': AverageValueEstimationEvaluator()}
sac.fit(
    dataset,
    n_steps=30000,
    n_steps_per_epoch=30000,
    evaluators=sac_evaluators,
)

# Export the trained policy to disk.
sac.save_policy("cnt_policy1_sac_action_final_30000.pt")

2024-03-27 20:22.07 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(9,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)
2024-03-27 20:22.07 [info     ] Directory is created at d3rlpy_logs\DiscreteSAC_20240327202207
2024-03-27 20:22.07 [debug    ] Building models...            
2024-03-27 20:22.07 [debug    ] Models have been built.       
2024-03-27 20:22.07 [info     ] Parameters                     params={'observation_shape': [9], 'action_size': 4, 'config': {'type': 'discrete_sac', 'params': {'batch_size': 64, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'actor_learning_rate': 0.0003, 'critic_learning_rate': 0.0003, 'temp_learning_rate'

Epoch 1/1:   0%|          | 0/30000 [00:00<?, ?it/s]

2024-03-27 20:27.13 [info     ] DiscreteSAC_20240327202207: epoch=1 step=30000 epoch=1 metrics={'time_sample_batch': 0.0012546623547871906, 'time_algorithm_update': 0.008601936809221904, 'temp_loss': -0.0013539146754672402, 'temp': 2.1042203056494393, 'critic_loss': 0.012549203067651251, 'actor_loss': -6.126500767902534, 'time_step': 0.010161516888936361, 'value_scale': 8.96070490943061} step=30000
2024-03-27 20:27.13 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteSAC_20240327202207\model_30000.d3
