In [1]:
from experiments.train_experiments import train_experiments
from experiments.utils import *
from agent.agents import *
from agent.switching_agents import *
from environments.env import * 
from definitions import ROOT_DIR

from torch.optim import RMSprop
import numpy as np
import random 
import torch
import pickle
from copy import deepcopy


# Seed NumPy, Python's `random` and PyTorch with one shared value so that
# stochastic steps in this notebook are repeatable across kernel restarts.
SEED = 12345678
for seed_fn in (np.random.seed, random.seed):
    seed_fn(SEED)
# Kept as the last expression: the cell displays the returned torch Generator.
torch.manual_seed(SEED)


<torch._C.Generator at 0x6733f50>

## Train baselines and 2-stage algorithm

In [2]:
%%time
# ---- Environment ----------------------------------------------------------
env_generator = Environment()
width, height = 3, 20
depth = height // 3
init_traffic_level = 'light'
env_params = dict(
    width=width,
    height=height,
    init_traffic_level=init_traffic_level,
    depth=depth,
)


def env_generator_fn():
    """Build a grid world from the shared generator using `env_params`."""
    return env_generator.generate_grid_world(**env_params)


# `n_traj` is handed to the trajectory-gathering cell, `n_episodes` to the
# machine pre-training cell.
n_traj = n_episodes = 20000

# ---- Human agent ----------------------------------------------------------
estimation_noise = switching_noise = 2.0
human = NoisyDriverAgent(
    env_generator,
    noise_sd=estimation_noise,
    noise_sw=switching_noise,
    c_H=0.0,
)

# ---- Switching / machine agents -------------------------------------------
# State size in the string representation.
n_state_features_strings = depth * (width + 1) + 1

# State size in the 1-hot representation.
n_cell_types = len(env_generator.cell_types)
n_traffic_levels = len(env_generator.traffic_levels)
n_state_features_1hot = n_cell_types + depth * (
    n_traffic_levels + 1 + width * (n_cell_types + 1)
)

# Both sizes travel together as a (strings, 1-hot) pair.
n_state_features = (n_state_features_strings, n_state_features_1hot)

n_actions = 3

lr = 1e-4


def optimizer_fn(params):
    """Create an RMSprop optimizer over `params` with learning rate `lr`."""
    return RMSprop(params, lr)


machine = MachineDriverAgent(n_state_features, n_actions, optimizer_fn, c_M=0.2)
fully_automated = FixedSwitchingMachine(n_state_features, optimizer_fn)





Wall time: 18 ms


In [3]:

# Reload the trajectories and the human agent that an earlier run of the
# gathering cell pickled under ROOT_DIR/outputs.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook.
trajectories_file = f'{ROOT_DIR}/outputs/human_{estimation_noise}_{switching_noise}_{init_traffic_level}_trajectories_{n_traj}'
agent_file = f'{ROOT_DIR}/outputs/human_{estimation_noise}_{switching_noise}_{init_traffic_level}_trajectories_{n_traj}_agent'

with open(trajectories_file, 'rb') as file:
    trajectories = pickle.load(file)

# Replaces the `human` object built in the configuration cell.
with open(agent_file, 'rb') as file:
    human = pickle.load(file)

In [3]:
%%time
# Gather human trajectories with the current agent and environment settings.
trajectories = gather_human_trajectories(human, env_generator, n_traj, **env_params)

# Save both the trajectories and the agent for later reuse; the file names
# encode the noise levels, the initial traffic level and `n_traj`.
trajectories_file = f'{ROOT_DIR}/outputs/human_{estimation_noise}_{switching_noise}_{init_traffic_level}_trajectories_{n_traj}'
agent_file = f'{ROOT_DIR}/outputs/human_{estimation_noise}_{switching_noise}_{init_traffic_level}_trajectories_{n_traj}_agent'
for target_path, payload in ((trajectories_file, trajectories), (agent_file, human)):
    with open(target_path, 'wb') as file:
        pickle.dump(payload, file, pickle.HIGHEST_PROTOCOL)
    

19999
Wall time: 1min 44s


In [4]:
# Switching agent used together with the fixed (human, machine) policies.
switch_fixed_policies = SwitchingAgent(
    n_state_features,
    optimizer_fn,
    c_M=0.2,
    c_H=0.0,
    eps=0.1,
)

# Second switching agent starting from the same initialisation.
switch_full = deepcopy(switch_fixed_policies)

# Independent copy of the machine; must be taken before `machine` is trained.
switch_machine = deepcopy(machine)

# name -> (switching agent, [human, machine]) for each algorithm to train.
algos_to_train = {}
algos_to_train['fixed_policies'] = (switch_fixed_policies, [human, machine])
algos_to_train['switching'] = (switch_full, [human, switch_machine])


In [5]:
# Independent snapshot of `fully_automated` as it is at this point; the later
# evaluation cells pair it with `switch_machine` as a comparison baseline.
f_a = deepcopy(fully_automated)

In [6]:
%%time

# Train the fully automated switching agent together with the machine on the
# first 1000 stored trajectories.
algos = dict(pre_trained_machine=(fully_automated, [human, machine]))
algos, fully_automated_costs = train_experiments(
    algos,
    trajectories[:1000],
    env_generator_fn,
    0,  # NOTE(review): meaning of this positional argument is not visible here
)

# Mark the machine as no longer trainable for the experiments that follow.
machine.trainable = False



Wall time: 2min 3s


In [3]:
%%time
# Pre-train the machine under the fully automated switching agent and rebind
# the returned agents and costs to the notebook-level names.
(
    fully_automated,
    (human, machine),
    fully_automated_costs,
) = run_pre_train_machine(
    fully_automated,
    [human, machine],
    trajectories,
    env_generator_fn,
    n_episodes,
)

2021-05-17 20:45:13.320152, Episode 1000, Fully automated on-policy algorithm cumulative cost: 35535.8
2021-05-17 20:47:13.406020, Episode 2000, Fully automated on-policy algorithm cumulative cost: 72123.8
2021-05-17 20:49:16.207044, Episode 3000, Fully automated on-policy algorithm cumulative cost: 109591.80000000002
2021-05-17 20:51:19.102073, Episode 4000, Fully automated on-policy algorithm cumulative cost: 147059.8
Wall time: 29min 1s


In [15]:
from plot.plot_path import PlotPath
from experiments.utils import *
# One grid world shared by both evaluations (the name `gird_world` is kept
# as-is because a later cell uses the same spelling).
gird_world = env_generator_fn()

# One path plotter per evaluated configuration.
plt_path = PlotPath(gird_world, n_try=1)
plt_path1 = PlotPath(gird_world, n_try=1)

# Evaluation only (is_learn=False): trained agents, machine_only=True.
cost = learn_evaluate(
    fully_automated,
    [human, machine],
    gird_world,
    is_learn=False,
    plt_path=plt_path,
    machine_only=True,
)

# Evaluation only: the deep-copied agents `f_a` and `switch_machine`.
cost1 = learn_evaluate(
    f_a,
    [human, switch_machine],
    gird_world,
    is_learn=False,
    plt_path=plt_path1,
)

# Write one figure per configuration.
plt_path.plot('../machine_only_light.png')
plt_path1.plot('../machine_dummy_only_light.png')

print(cost, cost1)

41.80000000000002 63.80000000000001


In [14]:
# Repeatedly compare two configurations on freshly generated grid worlds:
#   cost  - `fully_automated` with [human, machine], machine_only=True
#   cost1 - the deep-copied `f_a` with [human, switch_machine]
# For each round, print and record the fraction of worlds where cost1 >= cost,
# then print the mean of those fractions over all rounds.
#
# The loop sizes are named once so the divisor can never drift from the
# number of worlds actually evaluated.
n_rounds = 10
n_worlds_per_round = 100

# NOTE(review): `plt_path` / `plt_path1` come from the plotting cell and were
# built for a different grid world; they are passed unchanged here. Confirm
# whether `learn_evaluate` can be called without a plotter instead.
res = []
for _ in range(n_rounds):
    is_better = 0
    for i in range(n_worlds_per_round):
        gird_world = env_generator_fn()
        cost = learn_evaluate(
            fully_automated,
            [human, machine],
            gird_world,
            is_learn=False,
            plt_path=plt_path,
            machine_only=True,
        )

        cost1 = learn_evaluate(
            f_a,
            [human, switch_machine],
            gird_world,
            is_learn=False,
            plt_path=plt_path1,
        )

        # Count worlds where the second configuration costs at least as much.
        is_better += (cost1 >= cost)

    win_rate = is_better / n_worlds_per_round
    print(win_rate)
    res.append(win_rate)
    # Costs of the last world evaluated in this round.
    print(cost, cost1)
print(np.mean(res))

AttributeError: 'NoneType' object has no attribute 'add_line'

10000 off 15000 on
0.56
0.46
0.47
0.55
0.52
0.55
0.51
0.44
0.57
0.6
0.523

In [8]:
from copy import deepcopy
import torch
# NOTE(review): `a` and `b` are created but not used by the comparison below,
# which looks at `switch_machine` and `machine` instead - confirm whether the
# fresh agent and its copy are still needed.
a = MachineDriverAgent(n_state_features, n_actions, optimizer_fn, c_M=0.2)
b = deepcopy(a)

# Element-wise equality of the first parameter tensor of each network,
# reduced to a single boolean tensor (displayed as the cell output).
first_switch_param = list(switch_machine.network.parameters())[0]
first_machine_param = list(machine.network.parameters())[0]
(first_switch_param == first_machine_param).all()

tensor(False)

In [24]:
# Show the gradient currently stored on the parameter at index 3 of the
# machine's network (`.grad` is None if no backward pass has populated it).
(list(machine.network.parameters())[3]).grad

tensor([4.2675e-14, 0.0000e+00, 2.3021e-14])

In [12]:
# Display, side by side, the parameter at index 3 of the copied machine
# (`switch_machine`) and of `machine`, to see whether they still match.
list(switch_machine.network.parameters())[3],list(machine.network.parameters())[3]

(Parameter containing:
 tensor([0., 0., 0.], requires_grad=True),
 Parameter containing:
 tensor([ 0.0031,  0.0362, -0.0103], requires_grad=True))