In [1]:
from env import *
import random



In [2]:
### generate train environments ###
# Configuration shared by every training environment built in this cell.
env_train_num = 100   # number of environments in each training benchmark
opt_num = 10
sup_dim = 100
ob_num = 10
limit = np.array([10,20])
lr = 0.1              # gradient-descent step size (also read by later cells)

### generate hard train environments ###
# A candidate is kept as "hard" when 200 plain gradient-descent steps from its
# initial point end on an iterate for which `collision` is falsy. (Later cells
# count a truthy `collision` as a gradient-descent success.)
random.seed(0)
env_train_hard_list1 = []
count = 0
while count < env_train_num:
    env_try = generate_env(ob_num,limit,opt_num,sup_dim)
    x0 = env_try.obj.initial()
    for _ in range(200):
        x0 = x0 - lr*env_try.obj.ob_der_fun(x0)
    if not env_try.obj.collision(x0):
        env_train_hard_list1.append(env_try)
        count += 1
        if count % 10 == 0:
            print(count,'hard cases for training found')
    # NOTE(review): close() is also called on environments that were just
    # kept -- presumably harmless for this env class; confirm.
    env_try.close()
env_train_hard_list = [make_env(env,i) for i,env in enumerate(env_train_hard_list1)]
env_train_hard = DummyVecEnv(env_train_hard_list)

### generate easy train environments ###
# Easy environments are taken as generated, without any filtering.
env_train_easy_list1 = [
    generate_env(ob_num,limit,opt_num,sup_dim) for _ in range(env_train_num)
]
env_train_easy_list = [make_env(env,i) for i,env in enumerate(env_train_easy_list1)]
env_train_easy = DummyVecEnv(env_train_easy_list)

### generate mixed train environments ###
# Half easy, half hard. The split is derived from env_train_num instead of a
# hard-coded 50 so the mixed benchmark keeps env_train_num environments when
# the benchmark size is changed.
easy_share = env_train_num // 2
hard_share = env_train_num - easy_share
env_train_mix_list1 = env_train_easy_list1[:easy_share] + env_train_hard_list1[:hard_share]
env_train_mix_list = [make_env(env,i) for i,env in enumerate(env_train_mix_list1)]
env_train_mix = DummyVecEnv(env_train_mix_list)

10 hard cases for training found
20 hard cases for training found
30 hard cases for training found
40 hard cases for training found
50 hard cases for training found
60 hard cases for training found
70 hard cases for training found
80 hard cases for training found
90 hard cases for training found
100 hard cases for training found


In [3]:
### generate supervision data ###
# Pick the benchmark the agent is trained on (easy here; the hard and mixed
# vectorised environments built earlier can be substituted).
env_train = env_train_easy

# `supervision` is invoked on every sub-environment; from each returned item
# index 0 is collected as observations and index 1 as actions.
exp_data = env_train.env_method('supervision')
exp_obs = np.concatenate([pair[0] for pair in exp_data]).astype(np.float32)
exp_act = np.concatenate([pair[1] for pair in exp_data]).astype(np.float32)

# Wrap the arrays in a dataset and split it 80% / 20% into train and test.
exp_data = ExpertDataSet(exp_obs, exp_act)
train_size = int(0.8 * len(exp_data))
test_size = len(exp_data) - train_size
exp_train, exp_test = random_split(exp_data, [train_size, test_size])

In [4]:
# Switch: True trains and saves a fresh agent, False loads a stored one.
train_new_model = True
if not train_new_model:
    # load existing model
    student = PPO.load('obs_orig/mix_reward_0.1_PN64')
else:
    student = PPO(CustomActorCriticPolicy, env_train, n_steps=40, gamma=1, verbose=1)

    # Stage 1: supervised pre-training on the expert train/test split.
    pretrain_settings = dict(
        exp_train=exp_train,
        exp_test=exp_test,
        epochs=1,
        scheduler_gamma=0.7,
        learning_rate=1.0,
        log_interval=1000,
        no_cuda=False,
        seed=1,
        batch_size=64,
    )
    pretrain_agent(student, **pretrain_settings)
    student.policy.float()

    # Stage 2: PPO training for 50000 timesteps, then persist the model.
    student.learn(50000)
    student.save('obs_orig/easy_0.1_128_50k')

Using cuda device
Test set: Average loss: 0.0002
-----------------------------
| time/              |      |
|    fps             | 214  |
|    iterations      | 1    |
|    time_elapsed    | 18   |
|    total_timesteps | 4000 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 195       |
|    iterations           | 2         |
|    time_elapsed         | 40        |
|    total_timesteps      | 8000      |
| train/                  |           |
|    approx_kl            | 0.0405347 |
|    clip_fraction        | 0.3       |
|    clip_range           | 0.2       |
|    entropy_loss         | -20.7     |
|    explained_variance   | -1.77e+04 |
|    learning_rate        | 0.0003    |
|    loss                 | 3.44e+03  |
|    n_updates            | 10        |
|    policy_gradient_loss | -0.0372   |
|    std                  | 0.678     |
|    value_loss           | 1.01e+04  |
---------------------

In [5]:
## runtime evaluation
# Settings for the runtime benchmark. These rebind the globals of the same
# name, so later cells see sup_dim == 0.
runtime_env_num = 100
ob_num = 10
limit = np.array([10,20])
opt_num = 10
sup_dim = 0

# generate run time evaluation environments
# An environment is kept only if gradient descent reaches a truthy `collision`
# at a loop index above 10 (i.e. after at least 12 updates) AND the trained
# policy reports `done` within 100 steps.
random.seed(1)
time_env_list1 = []
count = 0
while len(time_env_list1) < runtime_env_num:
    env_try = generate_env(ob_num,limit,opt_num,sup_dim)
    x0 = env_try.obj.initial()
    obs = env_try.reset()

    # Gradient-descent probe with a fixed step size of 0.1.
    gd_solved = False
    for gd_iter in range(100):
        x0 = x0 - 0.1*env_try.obj.ob_der_fun(x0)
        if env_try.obj.collision(x0):
            gd_solved = True
            break

    # Policy probe: deterministic rollout of the trained student.
    rl_solved = False
    for rl_iter in range(100):
        action, _ = student.predict(obs, deterministic=True)
        obs, reward, rl_solved, info = env_try.step(action)
        if rl_solved:
            break

    if gd_solved and rl_solved and gd_iter > 10:
        time_env_list1.append(env_try)
        count = len(time_env_list1)
        if count % 20 == 0:
            print(count,'run time evalution env found')
    env_try.close()

20 run time evalution env found
40 run time evalution env found
60 run time evalution env found
80 run time evalution env found
100 run time evalution env found


In [6]:
import time

# Micro-benchmark on one runtime-evaluation environment: wall time of 1000
# gradient-descent updates versus 1000 policy-driven updates.
env = time_env_list1[1]
x0 = env.obj.initial()
obs = env.reset()
ob_der_fun = env.obj.ob_der_fun

# time.perf_counter() is monotonic and high-resolution, so the measured
# intervals cannot be skewed by system clock adjustments (time.time() can).
t1 = time.perf_counter()
for j in range(1000):
    x0 = x0 - lr*ob_der_fun(x0)
t_gd = time.perf_counter()-t1

t1 = time.perf_counter()
for j in range(1000):
    action, _ = student.predict(obs, deterministic=True)
    # Apply the action directly to the positions instead of calling env.step.
    # NOTE(review): the (10, 2) shape is hard-coded -- presumably
    # (opt_num, 2); confirm before changing opt_num.
    env.pos = action.reshape(10,2) + env.pos

    # Rebuild the observation by hand: flattened gradient, flattened cost,
    # and the mean cost as a single trailing entry.
    current_der = env.obj.ob_der_fun(env.pos).flatten()
    current_cost = env.environment.cost_fun(env.pos).flatten()
    current_cost_value = np.array([np.mean(current_cost)])
    obs = np.concatenate((current_der, current_cost, current_cost_value), axis=0).astype(np.float32)

t_rl = time.perf_counter()-t1
print(t_gd,t_rl)
runtime(student, time_env_list1)

1.2557759284973145 1.9830152988433838

GD success rate: 1.0 GD average iterations= 28.59
RL success rate: 1.0 RL average iterations= 20.23



In [7]:
# successful rate evaluation
env_test_easy_num = 1000
env_test_hard_num = 100
# Both test benchmarks are built with a supervision dimension of 0. Using one
# explicit constant keeps easy and hard consistent and removes the dependence
# on whichever cell last assigned the global `sup_dim`.
test_sup_dim = 0

# generate easy test environments
random.seed(2)
env_test_easy_list1 = [
    generate_env(ob_num,limit,opt_num,test_sup_dim) for _ in range(env_test_easy_num)
]

# generate hard test environments
# A candidate is "hard" when none of the 200 gradient-descent iterates makes
# `collision` truthy (a truthy value is counted as a GD success elsewhere in
# this notebook).
env_test_hard_list1 = []
count = 0
while count < env_test_hard_num:
    env_try = generate_env(ob_num,limit,opt_num,test_sup_dim)
    x0 = env_try.obj.initial()
    gd_solved = False
    for _ in range(200):
        x0 = x0 - lr*env_try.obj.ob_der_fun(x0)
        if env_try.obj.collision(x0):
            gd_solved = True
            break
    if not gd_solved:
        env_test_hard_list1.append(env_try)
        count += 1
        # Report every 10th hit (same cadence as the training-set search)
        # instead of flooding the output with one line per environment.
        if count % 10 == 0:
            print(count,'hard cases for test found')
    env_try.close()

1 hard cases for test found
2 hard cases for test found
3 hard cases for test found
4 hard cases for test found
5 hard cases for test found
6 hard cases for test found
7 hard cases for test found
8 hard cases for test found
9 hard cases for test found
10 hard cases for test found
11 hard cases for test found
12 hard cases for test found
13 hard cases for test found
14 hard cases for test found
15 hard cases for test found
16 hard cases for test found
17 hard cases for test found
18 hard cases for test found
19 hard cases for test found
20 hard cases for test found
21 hard cases for test found
22 hard cases for test found
23 hard cases for test found
24 hard cases for test found
25 hard cases for test found
26 hard cases for test found
27 hard cases for test found
28 hard cases for test found
29 hard cases for test found
30 hard cases for test found
31 hard cases for test found
32 hard cases for test found
33 hard cases for test found
34 hard cases for test found
35 hard cases for test 

In [8]:
### easy test benchmark ###
n_steps = 200
lr = 0.1
# result_easy[k] ends up as the fraction of environments in outcome class k:
#   0: RL and GD both succeed      1: both fail
#   2: only RL succeeds            3: only GD succeeds
result_easy = np.zeros((4,))
for i, env_test in enumerate(env_test_easy_list1):
    obs = env_test.reset()
    x0 = env_test.pos

    # Gradient descent: always run the full n_steps budget.
    for step in range(n_steps):
        x0 = x0 - lr*env_test.obj.ob_der_fun(x0)

    # Policy rollout. `done` is reset per environment so a stale value from
    # the previous environment can never leak into this one's tally.
    done = False
    for step in range(n_steps):
        action, _ = student.predict(obs, deterministic=True)
        obs, reward, done, info = env_test.step(action)
        if done:
            break

    # Evaluate each outcome exactly once, then tally one of four exclusive
    # classes (the original re-ran collision(x0) for every class test).
    rl_ok = bool(done)
    gd_ok = bool(env_test.obj.collision(x0))
    if rl_ok and gd_ok:
        result_easy[0] += 1
    elif not rl_ok and not gd_ok:
        result_easy[1] += 1
    elif rl_ok:
        result_easy[2] += 1
    else:
        result_easy[3] += 1

    env_test.close()
    if (i+1) % 50 == 0:
        print((i+1)/len(env_test_easy_list1),'complete')

result_easy /= len(env_test_easy_list1)
print("result_list_easy:", result_easy)
rl_success = result_easy[0]+result_easy[2]
gd_success = result_easy[0]+result_easy[3]
print('success_rl: %.2f%%'  % (rl_success*100))
print('success_gd: %.2f%%'  % (gd_success*100))

0.05 complete
0.1 complete
0.15 complete
0.2 complete
0.25 complete
0.3 complete
0.35 complete
0.4 complete
0.45 complete
0.5 complete
0.55 complete
0.6 complete
0.65 complete
0.7 complete
0.75 complete
0.8 complete
0.85 complete
0.9 complete
0.95 complete
1.0 complete
result_list_easy: [0.744 0.153 0.082 0.021]
success_rl: 82.60%
success_gd: 76.50%


In [9]:
### hard test benchmark ###
# For every hard test environment, check whether gradient descent and the
# trained policy each succeed within n_steps; report the two success rates.
n_steps = 200
lr = 0.1
result_hard_GD = 0
result_hard_RL = 0
for env_test in env_test_hard_list1:
    obs = env_test.reset()
    x0 = env_test.pos

    # Gradient descent: stop at the first iterate with a truthy `collision`.
    gd_hit = False
    for step in range(n_steps):
        x0 = x0 - lr * env_test.obj.ob_der_fun(x0)
        if env_test.obj.collision(x0):
            gd_hit = True
            break
    if gd_hit:
        result_hard_GD += 1

    # Policy rollout: stop as soon as the environment reports `done`.
    rl_hit = False
    for step in range(n_steps):
        action, _ = student.predict(obs, deterministic=True)
        obs, reward, done, info = env_test.step(action)
        if done:
            rl_hit = True
            break
    if rl_hit:
        result_hard_RL += 1

    env_test.close()

# Turn the success counts into rates.
result_hard_GD /= len(env_test_hard_list1)
result_hard_RL /= len(env_test_hard_list1)
print("result_list_hard_GD", result_hard_GD)
print("result_list_hard_RL", result_hard_RL)

result_list_hard_GD 0.0
result_list_hard_RL 0.29


In [10]:
### multi start on easy test benchmark
# Each solver gets up to multi_start_num attempts per environment: the first
# from the default reset, the rest from reset(random_start=True). Uses n_steps
# and lr as set by the single-start benchmark cells.
# result_multi_start[k] ends up as the fraction of environments in class k:
#   0: GD and RL both succeed      1: both fail
#   2: only RL succeeds            3: only GD succeeds
multi_start_num = 5
result_multi_start = np.zeros((4,))

for count, env_test in enumerate(env_test_easy_list1):
    # --- gradient descent with restarts ---
    GD = False
    for trial in range(multi_start_num):
        if trial == 0:
            obs = env_test.reset()
        else:
            obs = env_test.reset(random_start = True)
        x0 = env_test.pos
        for step in range(n_steps):
            x0 = x0 - lr*env_test.obj.ob_der_fun(x0)
            if env_test.obj.collision(x0):
                GD = True
                break
        if GD:
            break

    # --- trained policy with restarts ---
    # (the original also stored env_test.pos in x0 here but never read it)
    RL = False
    for trial in range(multi_start_num):
        if trial == 0:
            obs = env_test.reset()
        else:
            obs = env_test.reset(random_start = True)
        for step in range(n_steps):
            action, _ = student.predict(obs, deterministic=True)
            obs, reward, done, info = env_test.step(action)
            if done:
                RL = True
                break
        if RL:
            break

    # Tally exactly one of the four exclusive outcome classes.
    if GD and RL:
        result_multi_start[0] += 1
    elif not GD and not RL:
        result_multi_start[1] += 1
    elif RL:
        result_multi_start[2] += 1
    else:
        result_multi_start[3] += 1
    if (count+1) % 50 == 0:
        print((count+1)/len(env_test_easy_list1),'complete')
    env_test.close()

result_multi_start /= len(env_test_easy_list1)
print("result_list_easy_ms:", result_multi_start)
rl_success_multi = result_multi_start[0]+result_multi_start[2]
gd_success_multi = result_multi_start[0]+result_multi_start[3]
print('success_rl: %.2f%% ' % (rl_success_multi*100))
print('success_gd: %.2f%%'  % (gd_success_multi*100))

0.05 complete
0.1 complete
0.15 complete
0.2 complete
0.25 complete
0.3 complete
0.35 complete
0.4 complete
0.45 complete
0.5 complete
0.55 complete
0.6 complete
0.65 complete
0.7 complete
0.75 complete
0.8 complete
0.85 complete
0.9 complete
0.95 complete
1.0 complete
result_list_easy_ms: [0.83  0.088 0.066 0.016]
success_rl: 89.60% 
success_gd: 84.60%


In [11]:
### multi start on hard test benchmark ###
# Each solver gets up to multi_start_num attempts per environment: the first
# from the default reset, the rest from reset(random_start=True). Uses n_steps
# and lr as set by the single-start benchmark cells.
# result_multi_start[k] ends up as the fraction of environments in class k:
#   0: GD and RL both succeed      1: both fail
#   2: only RL succeeds            3: only GD succeeds
multi_start_num = 5
result_multi_start = np.zeros((4,))

for count, env_test in enumerate(env_test_hard_list1):
    # --- gradient descent with restarts ---
    GD = False
    for trial in range(multi_start_num):
        if trial == 0:
            obs = env_test.reset()
        else:
            obs = env_test.reset(random_start = True)
        x0 = env_test.pos
        for step in range(n_steps):
            x0 = x0 - lr*env_test.obj.ob_der_fun(x0)
            if env_test.obj.collision(x0):
                GD = True
                break
        if GD:
            break

    # --- trained policy with restarts ---
    # (the original also stored env_test.pos in x0 here but never read it)
    RL = False
    for trial in range(multi_start_num):
        if trial == 0:
            obs = env_test.reset()
        else:
            obs = env_test.reset(random_start = True)
        for step in range(n_steps):
            action, _ = student.predict(obs, deterministic=True)
            obs, reward, done, info = env_test.step(action)
            if done:
                RL = True
                break
        if RL:
            break

    # Tally exactly one of the four exclusive outcome classes.
    if GD and RL:
        result_multi_start[0] += 1
    elif not GD and not RL:
        result_multi_start[1] += 1
    elif RL:
        result_multi_start[2] += 1
    else:
        result_multi_start[3] += 1
    if (count+1) % 50 == 0:
        print((count+1)/len(env_test_hard_list1),'complete')
    env_test.close()

result_multi_start /= len(env_test_hard_list1)
print("result_list_hard_ms:", result_multi_start)
rl_success_multi = result_multi_start[0]+result_multi_start[2]
gd_success_multi = result_multi_start[0]+result_multi_start[3]
print('success_rl: %.2f%% ' % (rl_success_multi*100))
print('success_gd: %.2f%%'  % (gd_success_multi*100))

0.5 complete
1.0 complete
result_list_hard_ms: [0.24 0.45 0.28 0.03]
success_rl: 52.00% 
success_gd: 27.00%
