# 1. Simulation

In [39]:
from string import ascii_uppercase
from draw_utils import *
from pyglet.gl import *
import numpy as np
import pandas as pd
import os
import copy


# Reward constants read by Simulator.get_reward.
move_reward = -0.1    # ordinary move onto a free cell
obs_reward = -0.5     # move rejected by an obstacle or by the grid boundary
goal_reward = 10      # stepping onto a pending target that is not the start point
finish_reward = 20    # stepping onto the start point (9, 4) while it is a target
print('reward:' , move_reward, obs_reward, goal_reward)

# Root directory that contains the ``data/`` folder with the CSV inputs.
# The default is the original author's machine-specific location; set the
# YC_LOCAL_PATH environment variable to point elsewhere without editing code.
# (``os.path.dirname(__file__)`` is not usable here because ``__file__`` is
# undefined inside a notebook cell.)
local_path = os.environ.get("YC_LOCAL_PATH", '/home/zlxlekta924/YC')


class Simulator:
    """Grid-world environment in which an agent collects ordered items.

    The agent starts at ``(9, 4)``, has to step onto every cell that holds an
    item of the current order, and finally has to return to the start point.

    Cell codes written to ``self.grid``:
        1    free cell
        0    blocked cell: a shelf slot without a pending item, an obstacle,
             or a target cell the agent has just left
        100  current agent position
        200  pending target (ordered item, or the start point once every
             item has been collected)
    """

    # (row, col) of the start / end point.
    _START = (9, 4)

    def __init__(self):
        """Load the order list and set the fixed grid dimensions.

        Attributes set here:
            files: order table, one row per episode (the test split is
                loaded; the original source mentions
                ``factory_order_train.csv`` as the alternative).
            height: number of grid rows.
            width: number of grid columns.
            inds: item labels 'A' .. 'Q'.
            total_ac: counter returned by ``ac()``; never incremented in
                this class.
        """
        self.files = pd.read_csv(os.path.join(local_path, "./data/factory_order_test.csv"))
        self.height = 10
        self.width = 9
        self.inds = list(ascii_uppercase)[:17]

        self.total_ac = 0

    def _load_table(self, relative_path):
        """Return the CSV at ``local_path/relative_path``, reading it once per instance."""
        # setdefault on __dict__ keeps this working even when __init__ was bypassed.
        cache = self.__dict__.setdefault("_table_cache", {})
        if relative_path not in cache:
            cache[relative_path] = pd.read_csv(os.path.join(local_path, relative_path))
        return cache[relative_path]

    def _is_start(self, x, y):
        """Return True when ``(x, y)`` is the start / end point."""
        return (x, y) == self._START

    def set_box(self):
        """Mark shelf slots on the grid and collect this episode's targets.

        Every slot listed in ``data/box.csv`` is first written as 0. Slots
        whose item belongs to the current order (``self.items`` intersected
        with ``self.inds``) are then overwritten with 200 and their
        ``[row, col]`` is appended to ``self.local_target``.
        """
        box_data = self._load_table("data/box.csv")

        # Every slot that could hold an item starts out blocked.
        for row, col in zip(box_data["row"], box_data["col"]):
            self.grid[row][col] = 0

        # Slots that hold an ordered item become targets.
        order_item = list(set(self.inds) & set(self.items))
        order_rows = box_data[box_data["item"].isin(order_item)]
        for row, col in zip(order_rows["row"], order_rows["col"]):
            self.grid[row][col] = 200
            # Plain ints so the list comparisons in step() stay simple.
            self.local_target.append([int(row), int(col)])

    def set_obstacle(self):
        """Write 0 into every cell listed in ``data/obstacles.csv``."""
        obstacles_data = self._load_table("data/obstacles.csv")
        for row, col in zip(obstacles_data["row"], obstacles_data["col"]):
            self.grid[row][col] = 0

    def reset(self, epi):
        """Prepare the environment for episode ``epi`` and return the grid.

        :param epi: row index into ``self.files`` selecting the order.
        :return: the freshly initialised grid
        :rtype: numpy.ndarray

        Attributes (re)initialised here:
            items: first column of the selected order row.
            terminal_location: snapshot of the targets taken at the start
                of each step; None until the first step.
            local_target: coordinates still to be visited.
            actions: positions visited so far, kept for visualisation.
            item_loc: True while the agent stands on a cell it reached as a
                target.
            curloc: current agent position.
            done: True once the agent has returned to the start point.
        """
        self.epi = epi
        self.items = list(self.files.iloc[self.epi])[0]
        self.cumulative_reward = 0
        self.terminal_location = None
        self.local_target = []
        self.actions = []
        self.item_loc = False

        # Free cells are 1; boxes and obstacles overwrite them below.
        self.grid = np.ones((self.height, self.width), dtype="float16")
        self.set_box()
        self.set_obstacle()

        # Place the agent on the start point.
        self.curloc = list(self._START)
        self.grid[self.curloc[0]][self.curloc[1]] = 100

        self.done = False

        return self.grid

    def apply_action(self, action, cur_x, cur_y):
        """Return the coordinates reached by taking ``action`` from (cur_x, cur_y).

        :param action: 0 = up (row - 1), 1 = down (row + 1), 2 = left
            (col - 1); any other value is treated as right (col + 1).
        :param cur_x: current row.
        :param cur_y: current column.
        :return: new row, new column (not bounds-checked)
        :rtype: int, int
        """
        new_x = cur_x
        new_y = cur_y
        if action == 0:
            new_x = cur_x - 1
        elif action == 1:
            new_x = cur_x + 1
        elif action == 2:
            new_y = cur_y - 1
        else:
            new_y = cur_y + 1

        return int(new_x), int(new_y)

    def get_reward(self, new_x, new_y, out_of_boundary):
        """Return the reward for an attempted move to (new_x, new_y).

        :param new_x: row the action points at.
        :param new_y: column the action points at.
        :param out_of_boundary: iterable of booleans; any True value means
            the move leaves the grid.
        :return: ``obs_reward`` for leaving the grid or hitting a 0 cell,
            ``finish_reward`` / ``goal_reward`` for reaching the start point
            / another entry of ``self.terminal_location``, otherwise
            ``move_reward``.
        :rtype: float
        """
        if any(out_of_boundary):
            return obs_reward
        if self.grid[new_x][new_y] == 0:
            return obs_reward
        if [new_x, new_y] in self.terminal_location:
            return finish_reward if self._is_start(new_x, new_y) else goal_reward
        return move_reward

    def step(self, action):
        """Advance the environment by one agent action.

        A move that leaves the grid or points at a 0 cell is rejected: the
        agent stays where it is and the episode continues. Otherwise the
        agent moves; reaching a target removes it from ``self.local_target``,
        and once the last item is collected the start point becomes the
        final target. Reaching the start point as a target ends the episode.

        :param action: agent action, see ``apply_action``.
        :return:
            grid, the updated grid
            reward, reward of this step
            cumulative_reward, sum of rewards since ``reset``
            done, True once the start point has been reached as a target
            goal_ob_reward, True when this step reached a target
        :rtype: numpy.ndarray, float, float, bool, bool

        (Hint from the original author: from the start cell (9, 4) every
        action except "up" runs into an obstacle, so fixing the first action
        is advisable.)
        """
        # Snapshot of the targets before this step; get_reward() reads it
        # after local_target has already been updated.
        self.terminal_location = copy.deepcopy(self.local_target)
        cur_x, cur_y = self.curloc
        self.actions.append((cur_x, cur_y))

        goal_ob_reward = False

        new_x, new_y = self.apply_action(action, cur_x, cur_y)
        out_of_boundary = [new_x < 0, new_x >= self.height, new_y < 0, new_y >= self.width]

        # Moves off the grid or into a blocked cell leave the state untouched.
        if not any(out_of_boundary) and self.grid[new_x][new_y] != 0:
            reached_target = [new_x, new_y] in self.terminal_location

            if reached_target:
                self.local_target.remove([new_x, new_y])
                if self._is_start(new_x, new_y):
                    self.done = True
                elif not self.local_target:
                    # All items collected: the start point is the last target.
                    self.local_target.append(list(self._START))
                    self.grid[self._START[0]][self._START[1]] = 200

            # A target cell the agent leaves becomes 0; any other cell
            # returns to free (1).
            self.grid[cur_x][cur_y] = 0 if self.item_loc else 1
            self.grid[new_x][new_y] = 100

            goal_ob_reward = reached_target
            self.item_loc = reached_target
            self.curloc = [new_x, new_y]

        reward = self.get_reward(new_x, new_y, out_of_boundary)
        self.cumulative_reward += reward

        return self.grid, reward, self.cumulative_reward, self.done, goal_ob_reward

    def ac(self):
        """Return ``self.total_ac``."""
        return self.total_ac

    def canvas(self):
        """Render the positions stored in ``self.actions`` and save them as a GIF.

        ``Display`` and ``Render`` are not defined in this file; presumably
        they come from the ``draw_utils`` star import -- TODO confirm.
        """
        height = self.height
        width = self.width
        unit = 50
        log_path = "./logs"
        data_path = "./data"

        display = Display(visible=False, size=(width, height))
        display.start()
        try:
            render_cls = Render(width * unit, height * unit, unit, self._START, data_path, log_path)
            try:
                for idx, new_pos in enumerate(self.actions):
                    render_cls.update_movement(new_pos, idx + 1)
                render_cls.save_gif(self.epi)
            finally:
                # Close the viewer even if rendering or saving failed.
                render_cls.viewer.close()
        finally:
            # Always shut the virtual display down again.
            display.stop()

reward: -0.1 -0.5 10


## 2. Agent 구성


## 2-1 PPO

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical
import numpy as np
import time
from tqdm import tqdm



################################## PPO Policy ##################################
class RolloutBuffer:
    """Parallel lists holding one rollout: actions, states, log-probs, rewards, terminal flags."""

    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        """Empty every list in place, so existing references to them stay valid."""
        for store in (
            self.actions,
            self.states,
            self.logprobs,
            self.rewards,
            self.is_terminals,
        ):
            store.clear()


class ActorCritic(nn.Module):
    """Actor and critic networks that share no parameters.

    The discrete actor and the critic use the same convolutional layout and
    differ only in their output head (softmax over ``action_dim`` actions
    versus a single value).
    """

    def __init__(self,action_dim, has_continuous_action_space, action_std_init):
        super().__init__()

        self.has_continuous_action_space = has_continuous_action_space

        # The actor is built before the critic so parameter creation order
        # (and therefore seeded initialisation) is stable.
        if has_continuous_action_space:
            self.action_dim = action_dim
            self.action_var = torch.full((action_dim,), action_std_init * action_std_init)
            self.actor = nn.Sequential(
                nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=2),
                nn.ReLU(),
                nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
                nn.ReLU(),
                nn.Flatten(),
                nn.Linear(9, 64),
                nn.Tanh(),
                nn.Linear(64, 64),
                nn.Tanh(),
                nn.Linear(64, action_dim),
            )
        else:
            self.actor = nn.Sequential(
                *self._grid_trunk(),
                nn.Linear(16, action_dim),
                nn.Softmax(dim=-1),
            )

        self.critic = nn.Sequential(
            *self._grid_trunk(),
            nn.Linear(16, 1),
        )

    @staticmethod
    def _grid_trunk():
        """Return fresh conv/linear layers used by the discrete actor and the critic."""
        return [
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
            nn.ReLU(),
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3),
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64, 16),
            nn.ReLU(),
        ]

    def set_action_std(self, new_action_std):
        """Replace the action variance (continuous policies only); warn otherwise."""
        if not self.has_continuous_action_space:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")
            return
        self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std)

    def forward(self):
        """Not supported; use ``act`` or ``evaluate``."""
        raise NotImplementedError

    def act(self, state):
        """Sample an action for ``state``.

        :return: detached action and detached log-probability of that action.
        """
        actor_out = self.actor(state)
        if self.has_continuous_action_space:
            covariance = torch.diag(self.action_var).unsqueeze(dim=0)
            policy_dist = MultivariateNormal(actor_out, covariance)
        else:
            policy_dist = Categorical(actor_out)

        sampled = policy_dist.sample()
        sampled_logprob = policy_dist.log_prob(sampled)

        return sampled.detach(), sampled_logprob.detach()

    def evaluate(self, state, action):
        """Score ``action`` under the current policy for ``state``.

        For discrete policies ``state`` is reshaped to ``(-1, 1, 10, 9)``
        before it is fed to the actor and the critic.

        :return: log-probabilities of ``action``, critic values, and the
            entropy of the action distribution.
        """
        if self.has_continuous_action_space:
            mean = self.actor(state)
            variance = self.action_var.expand_as(mean)
            policy_dist = MultivariateNormal(mean, torch.diag_embed(variance))

            # Single-action environments need an explicit trailing dimension.
            if self.action_dim == 1:
                action = action.reshape(-1, self.action_dim)
        else:
            state = torch.reshape(state, (-1, 1, 10, 9))
            policy_dist = Categorical(self.actor(state))

        action_logprobs = policy_dist.log_prob(action)
        dist_entropy = policy_dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy


class PPO:
    """Proximal Policy Optimization agent with a clipped surrogate objective.

    Two ``ActorCritic`` copies are kept: ``policy`` is optimised, while
    ``policy_old`` is used to collect rollouts and is synchronised with
    ``policy`` at the end of every ``update``.
    """

    def __init__(self, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):
        """Build both policies, the optimiser and an empty rollout buffer.

        :param action_dim: number of actions (discrete) or action dimensions.
        :param lr_actor: Adam learning rate for the actor parameters.
        :param lr_critic: Adam learning rate for the critic parameters.
        :param gamma: discount factor used for the Monte Carlo returns.
        :param K_epochs: optimisation passes per ``update`` call.
        :param eps_clip: clipping range of the probability ratio.
        :param has_continuous_action_space: selects the policy type.
        :param action_std_init: initial action std (continuous policies).
        """
        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_std = action_std_init

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(action_dim, has_continuous_action_space, action_std_init)
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(action_dim, has_continuous_action_space, action_std_init)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        """Set the action std on both policies (continuous only); warn otherwise."""
        if self.has_continuous_action_space:
            self.action_std = new_action_std
            self.policy.set_action_std(new_action_std)
            self.policy_old.set_action_std(new_action_std)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling PPO::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        """Lower the action std by ``action_std_decay_rate``, never below ``min_action_std``."""
        print("--------------------------------------------------------------------------------------------")
        if self.has_continuous_action_space:
            self.action_std = self.action_std - action_std_decay_rate
            self.action_std = round(self.action_std, 4)
            if (self.action_std <= min_action_std):
                self.action_std = min_action_std
                print("setting actor output action_std to min_action_std : ", self.action_std)
            else:
                print("setting actor output action_std to : ", self.action_std)
            self.set_action_std(self.action_std)

        else:
            print("WARNING : Calling PPO::decay_action_std() on discrete action space policy")
        print("--------------------------------------------------------------------------------------------")

    def select_action(self, state):
        """Sample an action from ``policy_old`` and record it in the buffer.

        The state, action and log-probability are appended to the buffer;
        the caller is expected to append the matching reward and terminal
        flag afterwards.

        :return: a flat numpy array for continuous policies, otherwise the
            action as a Python int.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state)
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        if self.has_continuous_action_space:
            return action.detach().cpu().numpy().flatten()
        return action.item()

    def update(self):
        """Run ``K_epochs`` optimisation passes over the buffered rollout.

        Returns are discounted Monte Carlo sums that restart at every
        terminal flag and are then normalised. The loss is the clipped
        surrogate plus 0.5 * value MSE minus 0.01 * entropy. Afterwards
        ``policy_old`` is synchronised and the buffer is cleared. Does
        nothing when no state has been recorded.
        """
        if not self.buffer.states:
            # Nothing collected since the last update.
            return

        # Monte Carlo estimate of returns, accumulated back to front.
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.append(discounted_reward)
        # One reversal instead of insert(0, ...) on every element.
        rewards.reverse()

        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach()
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach()
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, checkpoint_path):
        """Write the weights of ``policy_old`` to ``checkpoint_path``."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load weights from ``checkpoint_path`` into both policies.

        The checkpoint is deserialised once and kept on the CPU.
        """
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        # load_state_dict copies the values, so both policies can share the dict.
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)

In [57]:
import os
import glob
import time
from datetime import datetime

import torch
import numpy as np
import time


################################### Training ###################################
def train():
    """Run a pretrained PPO agent once over every order and print statistics.

    Despite its name this function performs a single pass over the order
    list held by ``Simulator`` after loading an existing checkpoint. The
    agent is still updated every ``update_timestep`` steps while it runs,
    and the weights are never saved.

    Printed at the end: number of episodes, number of successful episodes
    (start point reached within ``max_ep_len`` steps), success rate, average
    number of steps per episode (failed episodes count as ``max_ep_len``),
    and wall-clock time.

    :raises ValueError: if the order list contains no episodes.
    """
    print("============================================================================================")

    ####### environment hyperparameters ######
    env_name = "Grid World"
    has_continuous_action_space = False  # discrete action space
    max_ep_len = 200                     # max timesteps in one episode
    action_std = 0.6                     # only used by continuous policies
    action_dim = 4

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4     # update policy every n timesteps
    K_epochs = 10                        # optimisation passes per PPO update
    eps_clip = 0.2                       # clip parameter for PPO
    gamma = 0.99                         # discount factor
    lr_actor = 0.00003                   # learning rate for actor network
    lr_critic = 0.0003                   # learning rate for critic network
    random_seed = 0                      # only used in the checkpoint file name

    env = Simulator()
    episode_count = len(env.files)
    if episode_count == 0:
        raise ValueError("order file contains no episodes")

    ################### checkpointing ###################
    run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

    directory = "PPO_preTrained"
    os.makedirs(directory, exist_ok=True)

    checkpoint_name = "PPO_{}_{}_{}3.pth".format(env_name, random_seed, run_num_pretrained)
    checkpoint_path = os.path.join(directory, checkpoint_name)
    # Earlier versions concatenated directory and file name without a
    # separator; keep loading such a checkpoint when it is the only one.
    legacy_checkpoint_path = directory + checkpoint_name
    if not os.path.exists(checkpoint_path) and os.path.exists(legacy_checkpoint_path):
        checkpoint_path = legacy_checkpoint_path
    print("save checkpoint path : " + checkpoint_path)

    # initialize a PPO agent
    ppo_agent = PPO(action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

    # track total run time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    ppo_agent.load(checkpoint_path)  # load the pretrained model

    time_step = 0
    success_count = 0      # episodes that reached the end point
    success_steps = 0      # steps spent in successful episodes
    timeout_count = 0      # episodes that hit max_ep_len without finishing

    start = time.time()
    for epi in range(episode_count):
        state = env.reset(epi)

        for t in range(1, max_ep_len + 1):
            state = torch.from_numpy(state).float()
            state = torch.reshape(state, (-1, 1, 10, 9))

            action = ppo_agent.select_action(state)
            state, reward, _, done, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            if done:
                success_count += 1
                success_steps += t
                break
            if t == max_ep_len:
                timeout_count += 1
                print(epi, f'에서 {max_ep_len}번 했는데도 실패')

    print(f'총 시행 수 : {episode_count}\n성공수 : {success_count}\n성공률 : {(success_count/episode_count)*100}%')
    print(f'평균 행동 수 : {(timeout_count*max_ep_len+success_steps)/episode_count}')
    print("time :", time.time() - start)

    # print total run time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")

# Start the run only when this file is executed as a script, not on import.
if __name__ == '__main__':
    train()


save checkpoint path : PPO_preTrainedPPO_Grid World_0_03.pth
Started training at (GMT) :  2022-06-07 06:57:22
총 시행 수 : 1226
성공수 : 1226
성공률 : 100.0%
평균 행동 수 : 45.451876019575856
time : 46.11190056800842
Started training at (GMT) :  2022-06-07 06:57:22
Finished training at (GMT) :  2022-06-07 06:58:08
Total training time  :  0:00:46
