In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.ndimage as ndimage

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data.dataset import Dataset, random_split
from typing import Callable

import gym
from gym import spaces
from stable_baselines3 import PPO, A2C, SAC
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.policies import ActorCriticPolicy

In [None]:
from motion_env import MPEnv
from RL_env import RLEnv,CustomCNN,CustomActorCriticPolicy
from generate_env import generate_env, make_env, ExpertDataSet
from pretrain import pretrain_agent

In [None]:
# Settings for the training set: how many environments to build, and the
# arguments forwarded to generate_env for each of them.
env_train_num = 1000
ob_num = 10
limit = np.array([10,20])
opt_num = 10
sup_dim = 30

### generate easy train environments ###
# Raw environment objects; later cells index into this list directly.
env_train_easy_list1 = [
    generate_env(ob_num, limit, opt_num, sup_dim=sup_dim)
    for _ in range(env_train_num)
]
# One make_env(environment, index) entry per environment, in the same order,
# passed as a list to DummyVecEnv.
env_train_easy_list = [
    make_env(raw_env, rank)
    for rank, raw_env in enumerate(env_train_easy_list1)
]
env_train_easy = DummyVecEnv(env_train_easy_list)

In [None]:
# Sanity check: reset one generated training environment and display element 1
# of the observation it returns.
# NOTE(review): obs[1] is assumed to be a 2-D/image-like array because it is
# handed to imshow -- confirm against the environment's observation layout.
preview_index = 2
env = env_train_easy_list1[preview_index]
obs = env.reset()
preview_slice = obs[1]
plt.imshow(preview_slice)

In [None]:
### generate supervision data ###
# Builds the expert (observation, action) dataset used for supervised
# pre-training, then splits it 95% / 5% into train and test subsets.
#
# mode == 'generate': call `supervision` on every training environment and
#                     cache the stacked arrays to obs1e6.npy / act1e6.npy.
# mode == 'load'    : read the arrays cached by a previous 'generate' run.
env_train = env_train_easy
mode = 'generate'
if mode == 'generate':
    # env_method returns one result per wrapped environment; element 0 of each
    # result is used as observations and element 1 as actions.
    exp_rollouts = env_train.env_method('supervision')
    exp_obs = [rollout[0] for rollout in exp_rollouts]
    exp_act = [rollout[1] for rollout in exp_rollouts]
    # Stack per-environment arrays, then merge the leading (environment) axis
    # into the next one so all samples sit along axis 0.
    exp_obs = np.concatenate(np.stack(exp_obs)).astype(np.uint8)
    exp_act = np.concatenate(np.stack(exp_act)).astype(np.float32)
    # np.save appends the '.npy' suffix, matching the names read in 'load' mode.
    np.save('obs1e6',exp_obs)
    np.save('act1e6',exp_act)
elif mode == 'load':
    exp_obs = np.load('obs1e6.npy')
    exp_act = np.load('act1e6.npy')
else:
    # Fail here with a clear message instead of a NameError on exp_obs below.
    raise ValueError("mode must be 'generate' or 'load', got %r" % (mode,))
exp_data = ExpertDataSet(exp_obs, exp_act)
train_size = int(0.95 * len(exp_data))
test_size = len(exp_data) - train_size
exp_train, exp_test = random_split(exp_data, [train_size, test_size])
### generate supervision data ###

In [None]:
# Keyword arguments handed to the policy when the PPO student is built:
# CustomCNN is the features extractor, constructed with features_dim=128.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": 128},
}

In [None]:
# Build the PPO student on the vectorised training environments.
student = PPO(
    CustomActorCriticPolicy,
    env_train_easy,
    n_steps=40,
    policy_kwargs=policy_kwargs,
    verbose=1,
)

# Settings for the supervised pre-training pass over the expert dataset,
# collected in one place so they are easy to scan and tweak.
pretrain_settings = {
    "batch_size": 64,
    "epochs": 10,
    "scheduler_gamma": 0.7,
    "learning_rate": 1.0,
    "log_interval": 100,
    "no_cuda": False,
    "seed": 1,
    "test_batch_size": 16,
}
pretrain_agent(student, exp_train, exp_test, **pretrain_settings)

In [None]:
# Continue training the pre-trained student with PPO for 80000 timesteps.
student.learn(total_timesteps=80000)

In [None]:
# Qualitative check on a single freshly generated environment: draw the
# gradient-descent result and the student's result on top of the `dis` map.
# (To inspect a training environment instead, assign e.g.
# env_train_easy_list1[12] to env_test_sample.)
env_test_sample = generate_env(10,np.array([10,20]),10)
obs = env_test_sample.reset()
sample_mp = env_test_sample.MPEnv
plt.imshow(sample_mp.dis)

# Baseline: 50 steps of size 0.1 against ob_der_fun, starting from the
# position the environment holds right after reset.
gd_pos = env_test_sample.pos
for gd_step in range(50):
    gd_pos = gd_pos - 0.1*sample_mp.ob_der_fun(gd_pos)
gd_pix = sample_mp.real2pix(sample_mp.all_points(gd_pos,0))
plt.plot(gd_pix.T[1],gd_pix.T[0],'v-w')

# Student: always roll out the full 50 deterministic steps; `done` is not used
# to stop early in this cell.
for step in range(50):
    action, _ = student.predict(obs, deterministic=True)
    obs, reward, done, info = env_test_sample.step(action)
rl_pos = env_test_sample.pos
rl_pix = sample_mp.real2pix(sample_mp.all_points(rl_pos,0))
plt.plot(rl_pix.T[1],rl_pix.T[0],'o-r')

In [None]:
# success rate evaluation: build the easy and hard test sets
env_test_easy_num = 1000
env_test_hard_num = 100

# generate easy test environments (kept unconditionally)
env_test_easy_list1 = [
    generate_env(10,np.array([10,20]),10) for _ in range(env_test_easy_num)
]

# generate hard test environments
# A candidate is kept only when collision(x0) is falsy after 200 gradient
# steps of size 0.1 from MPEnv.initial(). The easy benchmark counts a truthy
# collision(x0) as a gradient-descent success, so these are presumably the
# cases the baseline fails on -- confirm against MPEnv.collision.
env_test_hard_list1 = []
count = 0
while count < env_test_hard_num:
    env_try = generate_env(10,np.array([10,20]),10)
    x0 = env_try.MPEnv.initial()
    for j in range(200):
        x0 = x0 - 0.1*env_try.MPEnv.ob_der_fun(x0)
    if not env_try.MPEnv.collision(x0):
        env_test_hard_list1.append(env_try)
        count += 1
        print(count,'hard cases for test found')
    else:
        # Only discard rejected candidates. Kept environments are reset and
        # stepped again by the hard benchmark, which closes them after use.
        env_try.close()

In [None]:
### easy test benchmark ###
# For every easy test environment, run the gradient-descent baseline and the
# trained student from the same reset state, then tally the joint outcome.
n_steps = 200
lr = 0.1

# result_easy[k] is the fraction of environments in outcome bucket k, keyed by
# (student reached `done`, collision(x0) truthy for the gradient-descent point):
#   [0] both   [1] neither   [2] student only   [3] gradient descent only
outcome_bucket = {
    (True, True): 0,
    (False, False): 1,
    (True, False): 2,
    (False, True): 3,
}
result_easy = np.zeros((4,))
for i, env_test in enumerate(env_test_easy_list1):
    obs = env_test.reset()

    # Gradient-descent baseline: n_steps updates of size lr from the reset position.
    x0 = env_test.pos
    for step in range(n_steps):
        x0 = x0 - lr*env_test.MPEnv.ob_der_fun(x0)

    # Student rollout: stop at the first step that reports done.
    done = False  # defined even if the rollout loop runs zero times
    for step in range(n_steps):
        action, _ = student.predict(obs, deterministic=True)
        obs, reward, done, info = env_test.step(action)
        if done:
            break

    # Evaluate the baseline check once instead of once per bucket test.
    gd_flag = env_test.MPEnv.collision(x0)
    result_easy[outcome_bucket[(bool(done), bool(gd_flag))]] += 1

    env_test.close()
    if (i+1) % 50 == 0:
        print((i+1)/len(env_test_easy_list1),'complete')

result_easy /= len(env_test_easy_list1)
print("result_list_easy:", result_easy)
# Buckets 0 and 2 are the ones where the student finished; buckets 0 and 3 are
# the ones where collision(x0) was truthy for the baseline.
rl_success = result_easy[0]+result_easy[2]
gd_success = result_easy[0]+result_easy[3]
print('success_rl: %.2f%%'  % (rl_success*100))
print('success_gd: %.2f%%'  % (gd_success*100))

In [None]:
### hard test benchmark ###
# Roll the trained student out on every hard test environment and report the
# fraction of environments in which it reaches `done` within n_steps.
n_steps = 200
lr = 0.1            # not used by the rollout below; kept so the cell defines the same names
result_hard_GD = 0  # never updated or reported in this cell
hard_success_count = 0
for env_test in env_test_hard_list1:
    obs = env_test.reset()
    for step in range(n_steps):
        action, _ = student.predict(obs, deterministic=True)
        obs, reward, done, info = env_test.step(action)
        if done:
            hard_success_count += 1
            break
    env_test.close()
# Keep the integer tally separate from the reported rate.
result_hard_RL = hard_success_count / len(env_test_hard_list1)
print("result_list_hard_RL", result_hard_RL)