In [1]:
#!/usr/bin/env python3
import cv2
import random
import numpy as np
import argparse
from DRL.evaluator import Evaluator
from utils.util import *
from utils.tensorboard import TensorBoard
import time


In [2]:
# The experiment is named after the directory this notebook runs from.
exp = os.path.basename(os.path.abspath('.'))
log_dir = '../train_log/{}'.format(exp)
writer = TensorBoard(log_dir)

# Expose the experiment's log directory as ./log.
# This used to shell out to `ln -sf`, which (a) interpolated the directory
# name into a shell command, breaking on names with spaces/metacharacters,
# and (b) followed an already existing ./log symlink on re-run, dropping a
# nested link inside the log directory instead of replacing ./log.
# A stale link (or plain file) is replaced; a real ./log directory is left
# untouched rather than deleted.
if os.path.islink('./log') or os.path.isfile('./log'):
    os.remove('./log')
if not os.path.lexists('./log'):
    os.symlink(log_dir, './log')
# os.system('mkdir ./model')


0

In [3]:
def _lr_schedule(step, max_step):
    """Return the learning-rate pair used for policy updates at ``step``.

    The thresholds are expressed in episodes (1000 and 2000) scaled by
    ``max_step``. The caller logs element 0 as ``train/critic_lr`` and
    element 1 as ``train/actor_lr``.
    """
    if step < 1000 * max_step:
        return (3e-4, 1e-3)
    if step < 2000 * max_step:
        return (1e-4, 3e-4)
    return (3e-5, 1e-5)


def train(agent, env, evaluate):
    """Run the interaction / training loop.

    Settings are read from the module-level ``args`` object and metrics are
    written to the module-level ``writer``.

    Each iteration takes one environment step and hands the transition to the
    agent. Whenever ``max_step`` steps have elapsed the episode is closed:
    once past ``args.warmup`` steps the agent is optionally validated (every
    ``args.validate_interval`` episodes, followed by ``agent.save_model``) and
    then updated ``args.episode_train_times`` times.

    Args:
        agent: object providing ``reset``, ``select_action``, ``observe``,
            ``update_policy`` and ``save_model``.
        env: object providing ``reset()`` and ``step(action)``; ``step`` must
            return a 4-tuple ``(observation, reward, done, info)``.
        evaluate: callable ``evaluate(env, policy, debug=...)`` returning a
            ``(reward, dist)`` pair that ``np.mean`` / ``np.var`` accept.

    Returns:
        None.
    """
    train_times = args.train_times
    validate_interval = args.validate_interval
    max_step = args.max_step
    debug = args.debug
    episode_train_times = args.episode_train_times
    output = args.output
    noise_factor = args.noise_factor
    warmup = args.warmup

    time_stamp = time.time()
    step = episode = episode_steps = 0
    observation = None
    # NOTE: the bound is inclusive, so the body runs train_times + 1 times
    # (behaviour kept as it was).
    while step <= train_times:
        step += 1
        episode_steps += 1
        # reset if it is the start of episode
        if observation is None:
            observation = env.reset()
            agent.reset(observation, noise_factor)
        action = agent.select_action(observation, noise_factor=noise_factor)
        observation, reward, done, _ = env.step(action)
        agent.observe(reward, observation, done, step)

        # Nothing more to do until the episode is over; a falsy max_step
        # means episodes are never closed.
        if not (max_step and episode_steps >= max_step):
            continue

        past_warmup = step > warmup

        # [optional] evaluate
        if past_warmup and episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
            eval_reward, eval_dist = evaluate(env, agent.select_action, debug=debug)
            if debug:
                prRed('Step_{:07d}: mean_reward:{:.3f} mean_dist:{:.3f} var_dist:{:.3f}'.format(
                    step - 1, np.mean(eval_reward), np.mean(eval_dist), np.var(eval_dist)))
            writer.add_scalar('validate/mean_reward', np.mean(eval_reward), step)
            writer.add_scalar('validate/mean_dist', np.mean(eval_dist), step)
            writer.add_scalar('validate/var_dist', np.var(eval_dist), step)
            agent.save_model(output)

        # Time spent since the previous episode finished training; the stamp
        # is restarted so the update phase below is timed separately.
        train_time_interval = time.time() - time_stamp
        time_stamp = time.time()

        if past_warmup:
            lr = _lr_schedule(step, max_step)
            tot_Q = 0.
            tot_value_loss = 0.
            for _ in range(episode_train_times):
                Q, value_loss = agent.update_policy(lr)
                tot_Q += Q.data.cpu().numpy()
                tot_value_loss += value_loss.data.cpu().numpy()
            writer.add_scalar('train/critic_lr', lr[0], step)
            writer.add_scalar('train/actor_lr', lr[1], step)
            # With zero updates there is no average to report (this used to
            # raise ZeroDivisionError).
            if episode_train_times > 0:
                writer.add_scalar('train/Q', tot_Q / episode_train_times, step)
                writer.add_scalar('train/critic_loss', tot_value_loss / episode_train_times, step)

        if debug:
            prBlack('#{}: steps:{} interval_time:{:.2f} train_time:{:.2f}'.format(
                episode, step, train_time_interval, time.time() - time_stamp))
        time_stamp = time.time()

        # reset
        observation = None
        episode_steps = 0
        episode += 1

In [4]:
import sys
class Arg():
    """Container of training hyper-parameters (notebook stand-in for argparse).

    Every setting is stored as an instance attribute and read elsewhere as
    ``args.<name>``. Calling ``Arg()`` yields the defaults below; individual
    settings can be overridden by keyword, e.g. ``Arg(debug=False)``.

    Raises:
        TypeError: if an override names a setting that does not exist.
    """

    def __init__(self, **overrides):
        self.batch_size = 96            # minibatch size
        self.max_step = 40              # max length for episode
        self.warmup = 400               # timesteps spent only filling the replay memory
        self.discount = 0.95**5         # discount factor
        self.rmsize = 800               # replay memory size
        self.env_batch = 96             # concurrent environment number
        self.tau = 0.001                # moving average for target network
        self.noise_factor = 0           # noise level for parameter space noise
        self.validate_interval = 50     # how many episodes between validations
        self.validate_episodes = 5      # how many episodes to run during validation
        self.train_times = 196000       # total number of training steps
        self.episode_train_times = 10   # policy updates per episode
        self.resume = None              # path of a model to resume from
        self.debug = True               # print progress information
        self.output = './model'         # where models are saved
        self.seed = 1234                # random seed

        # Only known settings may be overridden, so a typo fails loudly
        # instead of silently creating an attribute nobody reads.
        unknown = sorted(name for name in overrides if name not in vars(self))
        if unknown:
            raise TypeError('unknown setting(s): {}'.format(', '.join(unknown)))
        for name, value in overrides.items():
            setattr(self, name, value)


args = Arg()
# parser = argparse.ArgumentParser(description='Learning to Paint')

# # hyper-parameter
# parser.add_argument('--warmup', default=400, type=int, help='timestep without training but only filling the replay memory')
# parser.add_argument('--discount', default=0.95**5, type=float, help='discount factor')
# parser.add_argument('--batch_size', default=96, type=int, help='minibatch size')
# parser.add_argument('--rmsize', default=800, type=int, help='replay memory size')
# parser.add_argument('--env_batch', default=96, type=int, help='concurrent environment number')
# parser.add_argument('--tau', default=0.001, type=float, help='moving average for target network')
# parser.add_argument('--max_step', default=40, type=int, help='max length for episode')
# parser.add_argument('--noise_factor', default=0, type=float, help='noise level for parameter space noise')
# parser.add_argument('--validate_interval', default=50, type=int, help='how many episodes to perform a validation')
# parser.add_argument('--validate_episodes', default=5, type=int, help='how many episode to perform during validation')
# parser.add_argument('--train_times', default=2000000, type=int, help='total traintimes')
# parser.add_argument('--episode_train_times', default=10, type=int, help='train times for each episode')    
# parser.add_argument('--resume', default=None, type=str, help='Resuming model path for testing')
# parser.add_argument('--output', default='./model', type=str, help='Output folder for saved models')
# parser.add_argument('--debug', dest='debug', action='store_true', help='print some info')
# parser.add_argument('--seed', default=1234, type=int, help='random seed')

# args = parser.parse_args()    
# args.output = get_output_folder(args.output, "Paint")


In [None]:
def _seed_all(seed):
    """Seed NumPy, PyTorch (CPU and, when present, CUDA) and ``random``."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)


# Seed every RNG from the one configured seed before anything is built.
_seed_all(args.seed)

# NOTE(review): PyTorch's reproducibility notes pair `deterministic = True`
# with `benchmark = False`; confirm that cuDNN autotuning is wanted here.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

from DRL.ddpg import DDPG
from DRL.multi import fastenv

# Build the batched environment, the agent and the validation helper.
fenv = fastenv(args.max_step, args.env_batch, writer)
agent = DDPG(
    args.batch_size, args.env_batch, args.max_step,
    args.tau, args.discount, args.rmsize,
    writer, args.resume, args.output,
)
evaluate = Evaluator(args, writer)

print('observation_space', fenv.observation_space, 'action_space', fenv.action_space)

# Start the training loop.
train(agent, fenv, evaluate)


loaded 10000 images
loaded 20000 images
loaded 30000 images
loaded 40000 images
loaded 50000 images
loaded 60000 images
loaded 70000 images
loaded 80000 images
loaded 90000 images
loaded 100000 images
loaded 110000 images
loaded 120000 images
loaded 130000 images
loaded 140000 images
loaded 150000 images
loaded 160000 images
loaded 170000 images
loaded 180000 images
loaded 190000 images
loaded 200000 images
finish loading data, 197999 training images, 2001 testing images
observation_space (96, 128, 128, 7) action_space 13


  s0 = torch.tensor(self.state, device='cpu')
  s1 = torch.tensor(state, device='cpu')


[98m #0: steps:40 interval_time:6.17 train_time:0.00[00m
[98m #1: steps:80 interval_time:4.52 train_time:0.00[00m
[98m #2: steps:120 interval_time:4.41 train_time:0.00[00m
[98m #3: steps:160 interval_time:4.40 train_time:0.00[00m
[98m #4: steps:200 interval_time:4.53 train_time:0.00[00m
[98m #5: steps:240 interval_time:4.53 train_time:0.00[00m
[98m #6: steps:280 interval_time:4.40 train_time:0.00[00m
[98m #7: steps:320 interval_time:4.40 train_time:0.00[00m
[98m #8: steps:360 interval_time:4.26 train_time:0.00[00m
[98m #9: steps:400 interval_time:4.16 train_time:0.00[00m
[98m #10: steps:440 interval_time:4.15 train_time:21.76[00m
[98m #11: steps:480 interval_time:4.16 train_time:18.84[00m
[98m #12: steps:520 interval_time:4.15 train_time:18.87[00m
[98m #13: steps:560 interval_time:4.15 train_time:18.88[00m
[98m #14: steps:600 interval_time:4.18 train_time:18.89[00m
[98m #15: steps:640 interval_time:4.17 train_time:18.89[00m
[98m #16: steps:680 interval_