In [1]:
from experiments.pre_train_machine_policy import run_pre_train_machine
from experiments.utils import *
from agent.agents import *
from agent.switching_agents import *
from environments.env import * 
from definitions import ROOT_DIR

from torch.optim import RMSprop
import numpy as np
import random 
import torch
import pickle

# One shared seed for every RNG this notebook touches (NumPy, the stdlib
# `random` module and PyTorch). Keeping it in a single constant prevents the
# three generators from silently being seeded with different values.
SEED = 12345678

np.random.seed(SEED)
random.seed(SEED)
# Kept as the last expression of the cell: torch.manual_seed returns the global
# torch Generator, which is what the notebook shows as this cell's output.
torch.manual_seed(SEED)


<torch._C.Generator at 0x6373f10>

## Train baselines and 2-stage algorithm

In [2]:
%%time
# --- Grid-world configuration ---
env_generator = Environment()

width, height = 3, 20
depth = height // 3  # integer division: 20 // 3 == 6
init_traffic_level = 'light'

# Keyword arguments shared by generate_grid_world and gather_human_trajectories.
env_params = dict(
    width=width,
    height=height,
    init_traffic_level=init_traffic_level,
    depth=depth,
)


def env_generator_fn():
    """Return env_generator.generate_grid_world(**env_params).

    `env_generator` and `env_params` are looked up at call time, so the
    function always uses the current notebook-level values.
    """
    return env_generator.generate_grid_world(**env_params)

# n_traj is the count handed to gather_human_trajectories below;
# n_episodes is the episode count later passed to run_pre_train_machine.
n_traj = 10000
n_episodes = 5000

# Both noise settings of the simulated human driver use the same value.
estimation_noise = switching_noise = 2.0

human = NoisyDriverAgent(
    env_generator,
    noise_sd=estimation_noise,
    noise_sw=switching_noise,
    c_H=0.0,
)

# Collect n_traj trajectories driven by the noisy human agent.
trajectories = gather_human_trajectories(human, env_generator, n_traj, **env_params)


# State size for the string-based state representation.
# NOTE(review): presumably one leading feature plus (width + 1) features for
# each of the `depth` rows — confirm against the environment's state encoding.
n_state_features_strings = depth * (width + 1) + 1

# State size for the 1-hot encoded representation of the same state.
n_cell_types = len(env_generator.cell_types)
n_traffic_levels = len(env_generator.traffic_levels)
n_state_features_1hot = n_cell_types + depth * (
    width * (n_cell_types + 1) + n_traffic_levels + 1
)

# The agents receive both sizes as a (strings, 1-hot) pair.
n_state_features = n_state_features_strings, n_state_features_1hot

n_actions = 3

# Learning rate used by the optimiser factory below.
lr = 1e-4


def optimizer_fn(params):
    """Build an RMSprop optimiser over `params` with the notebook-level `lr`."""
    return RMSprop(params, lr)


# The machine driver and the switching agent both receive the same optimiser
# factory and the same (strings, 1-hot) state sizes.
# NOTE(review): FixedSwitchingMachine presumably always hands control to the
# machine (hence the name `fully_automated`) — confirm in agent.switching_agents.
machine = MachineDriverAgent(n_state_features, n_actions, optimizer_fn, c_M=0.2)
fully_automated = FixedSwitchingMachine(n_state_features, optimizer_fn)



# Persist the gathered human trajectories. The file name records the noise
# levels, the initial traffic level and the trajectory count that produced them.
trajectories_path = f'{ROOT_DIR}/outputs/human_{estimation_noise}_{switching_noise}_{init_traffic_level}_trajectories_{n_traj}'
with open(trajectories_path, 'wb') as out_file:
    pickle.dump(trajectories, out_file, protocol=pickle.HIGHEST_PROTOCOL)

9999
Wall time: 31.5 s


In [3]:
%%time
# Pre-training run that is given the gathered human trajectories. The call
# hands back the switching agent, the (human, machine) pair and the costs,
# which rebind the notebook-level names.
(
    fully_automated,
    (human, machine),
    fully_automated_costs,
) = run_pre_train_machine(
    fully_automated,
    [human, machine],
    trajectories,
    env_generator_fn,
    n_episodes,
)

2021-05-17 20:45:13.320152, Episode 1000, Fully automated on-policy algorithm cumulative cost: 35535.8
2021-05-17 20:47:13.406020, Episode 2000, Fully automated on-policy algorithm cumulative cost: 72123.8
2021-05-17 20:49:16.207044, Episode 3000, Fully automated on-policy algorithm cumulative cost: 109591.80000000002
2021-05-17 20:51:19.102073, Episode 4000, Fully automated on-policy algorithm cumulative cost: 147059.8
Wall time: 29min 1s


In [51]:
%%time
# Run n_episodes (5000) more on-policy episodes: both agents are marked
# trainable and an empty trajectory list is passed instead of human data.
# Note that this rebinds fully_automated_costs, replacing the previous run's value.
fully_automated.trainable = machine.trainable = True

(
    fully_automated,
    (human, machine),
    fully_automated_costs,
) = run_pre_train_machine(
    fully_automated,
    [human, machine],
    [],
    env_generator_fn,
    n_episodes,
)


2021-05-17 22:38:36.704283, Episode 1000, Fully automated on-policy algorithm cumulative cost: 38895.8
2021-05-17 22:40:36.331125, Episode 2000, Fully automated on-policy algorithm cumulative cost: 77009.8
2021-05-17 22:42:36.204982, Episode 3000, Fully automated on-policy algorithm cumulative cost: 115533.80000000002
2021-05-17 22:44:37.588925, Episode 4000, Fully automated on-policy algorithm cumulative cost: 153841.8
Wall time: 10min 17s


In [56]:
from plot.plot_path import PlotPath
from experiments.utils import *
# Evaluate (is_learn=False) on a single generated grid world and plot both
# paths: the trained machine versus a freshly constructed MachineDriverAgent.
gird_world = env_generator_fn()

# One plotter per evaluated agent, both attached to the same grid world.
plt_path = PlotPath(gird_world, n_try=1)
plt_path1 = PlotPath(gird_world, n_try=1)

cost = learn_evaluate(
    fully_automated,
    [human, machine],
    gird_world,
    is_learn=False,
    plt_path=plt_path,
)

# The untrained agent is deliberately built only after the first evaluation,
# so any random draws happen in the same order as before.
untrained_machine = MachineDriverAgent(n_state_features, n_actions, optimizer_fn, c_M=0.2)
cost1 = learn_evaluate(
    fully_automated,
    [human, untrained_machine],
    gird_world,
    is_learn=False,
    plt_path=plt_path1,
)

plt_path.plot('../machine_only_light.png')
plt_path1.plot('../machine_dummy_only_light.png')

print(cost, cost1)

49.800000000000026 79.80000000000004


In [48]:
# Head-to-head check of the trained machine against a freshly constructed
# MachineDriverAgent. Each round evaluates both on the same generated grid
# worlds and prints the fraction of worlds where the fresh agent's cost is
# greater than or equal to the trained agent's cost.
n_rounds = 10
# Used both as the loop bound and as the divisor, so the two cannot diverge.
n_envs_per_round = 100

for _ in range(n_rounds):
    is_better = 0
    for i in range(n_envs_per_round):
        gird_world = env_generator_fn()
        cost = learn_evaluate(
            fully_automated,
            [human, machine],
            gird_world,
            is_learn=False,
            plt_path=None,
        )

        # Built after the first evaluation, as before, so random draws keep
        # their original order.
        untrained_machine = MachineDriverAgent(n_state_features, n_actions, optimizer_fn, c_M=0.2)
        cost1 = learn_evaluate(
            fully_automated,
            [human, untrained_machine],
            gird_world,
            is_learn=False,
            plt_path=None,
        )

        # Equal costs are counted as a win for the trained machine (>=).
        is_better += (cost1 >= cost)

    print(is_better / n_envs_per_round)

0.51
0.44
0.44
0.46
0.46
0.46
0.39
0.49
0.5
0.48


In [53]:
# Head-to-head check of the trained machine against a freshly constructed
# MachineDriverAgent. Each round evaluates both on the same generated grid
# worlds; the per-round win rates are printed, collected in `res`, and their
# mean is printed at the end.
n_rounds = 10
# Used both as the loop bound and as the divisor, so the two cannot diverge.
n_envs_per_round = 100

res = []
for _ in range(n_rounds):
    is_better = 0
    for i in range(n_envs_per_round):
        gird_world = env_generator_fn()
        cost = learn_evaluate(
            fully_automated,
            [human, machine],
            gird_world,
            is_learn=False,
            plt_path=None,
        )

        # Built after the first evaluation, as before, so random draws keep
        # their original order.
        untrained_machine = MachineDriverAgent(n_state_features, n_actions, optimizer_fn, c_M=0.2)
        cost1 = learn_evaluate(
            fully_automated,
            [human, untrained_machine],
            gird_world,
            is_learn=False,
            plt_path=None,
        )

        # Equal costs are counted as a win for the trained machine (>=).
        is_better += (cost1 >= cost)

    # Compute the round's rate once so the printed and stored values match.
    win_rate = is_better / n_envs_per_round
    print(win_rate)
    res.append(win_rate)

print(np.mean(res))

0.56
0.46
0.47
0.55
0.52
0.55
0.51
0.44
0.57
0.6
0.523
