In [1]:
import tensorflow as tf
import environment as Env
import keras
import numpy as np
import random

In [2]:
# 강화학습 인공신경망
class REINFORCE(tf.keras.Model):
    """Policy network: two 24-unit ReLU hidden layers and a softmax output.

    The output layer has ``action_size`` units, so each output row is a
    probability distribution over the discrete actions.
    """

    def __init__(self, action_size):
        """Create the three dense layers.

        Args:
            action_size: Number of units in the softmax output layer.
        """
        super().__init__()
        # Two hidden layers, 24 units each, ReLU activation.
        self.fc1 = keras.layers.Dense(24, activation="relu")
        self.fc2 = keras.layers.Dense(24, activation="relu")
        # Output layer: one probability per action.
        self.fc_out = keras.layers.Dense(action_size, activation="softmax")

    def call(self, x):
        """Run the forward pass and return the action probabilities for ``x``."""
        hidden = self.fc2(self.fc1(x))
        return self.fc_out(hidden)

In [3]:
# REINFORCE (Monte Carlo policy-gradient) agent for the grid-world example
class REINFORCEAgent:
    """Monte Carlo policy-gradient (REINFORCE) agent.

    One episode of (state, action, reward) samples is collected through
    ``append_sample``; ``train_model`` then performs a single policy update
    from those samples and clears the buffers.
    """

    def __init__(self, state_size, action_size):
        """Create the policy network, its optimizer and empty episode buffers.

        Args:
            state_size: Length of a state vector. Stored on the instance; the
                methods of this class do not read it.
            action_size: Number of discrete actions. Sets the width of the
                policy network output and of the one-hot action encoding.
        """
        self.state_size = state_size
        self.action_size = action_size

        # REINFORCE hyper-parameters.
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.model = REINFORCE(self.action_size)
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        self.optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate)
        # Per-episode buffers, filled by append_sample and reset by train_model.
        self.states, self.actions, self.rewards = [], [], []

    def get_action(self, state):
        """Sample an action index from the current policy.

        Args:
            state: Model input holding the current state as a batch; only the
                first row of the model output is used.

        Returns:
            An index in ``range(action_size)``, drawn with the probabilities
            produced by the policy network.
        """
        policy = np.array(self.model(state)[0])  # tensor -> numpy probabilities
        # Draw exactly once; the original sampled twice and discarded one draw.
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def discount_rewards(self, rewards):
        """Compute the discounted return for every step of an episode.

        The sequence is walked backwards so each return is obtained from the
        next one: ``G_t = r_t + discount_factor * G_{t+1}``.

        Args:
            rewards: Sequence of per-step rewards (ints or floats).

        Returns:
            A float64 numpy array with the same shape as ``rewards``.
        """
        # Force a float dtype: zeros_like would otherwise inherit an integer
        # dtype from integer rewards and truncate the fractional returns.
        discounted_rewards = np.zeros_like(rewards, dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def append_sample(self, state, action, reward):
        """Store one step of the current episode.

        Args:
            state: Batched state; its first row (``state[0]``) is stored.
            action: Index of the action taken; stored one-hot encoded.
            reward: Reward received for the step.
        """
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)  # one-hot encoding of the action
        act[action] = 1
        self.actions.append(act)

    def train_model(self):
        """Update the policy network from the buffered episode.

        The discounted returns are standardised (zero mean, unit variance) and
        used to weight the negative log-probability of each taken action.
        The buffers are cleared afterwards.

        Returns:
            The mean of the element-wise ``-p * log(p)`` terms over all stored
            states and actions, as a diagnostic. Returns ``0.0`` without
            touching the network when no samples are buffered.
        """
        if len(self.rewards) == 0:
            return 0.0

        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
        # A one-step or constant-return episode has zero std; dividing would
        # produce NaN returns and NaN gradients, so skip the scaling then.
        std = np.std(discounted_rewards)
        if std > 0:
            discounted_rewards /= std

        actions = np.array(self.actions, dtype=np.float32)

        with tf.GradientTape() as tape:
            policies = self.model(np.array(self.states))
            # Probability the policy assigned to the action actually taken.
            action_prob = tf.reduce_sum(actions * policies, axis=1)
            cross_entropy = -tf.math.log(action_prob + 1e-5)
            loss = tf.reduce_sum(cross_entropy * discounted_rewards)

        # Read the variables after the forward pass so they exist even if this
        # is the first call of the model; trainable variables are watched by
        # the tape automatically.
        model_params = self.model.trainable_variables
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))

        # Diagnostic only, so it is computed outside the tape. The epsilon
        # keeps 0 * log(0) from turning the result into NaN.
        entropy = -policies * tf.math.log(policies + 1e-5)

        self.states, self.actions, self.rewards = [], [], []
        return np.mean(entropy)

In [None]:
if __name__ == "__main__":
    # Build the environment and the agent.
    # NOTE(review): render_speed is passed straight to the project Env;
    # presumably it is the per-step rendering delay - confirm in environment.py.
    env = Env.Env(render_speed=0.01)
    state_size = 15  # every state is reshaped to (1, state_size) below
    # Action indices. The model notes list them as up, down, left, right, stay;
    # the actual mapping lives in the environment - TODO confirm the order.
    action_space = [0, 1, 2, 3, 4]
    action_size = len(action_space)
    agent = REINFORCEAgent(state_size, action_size)

    # Per-episode history (total reward and episode index).
    scores, episodes = [], []

    EPISODES = 10       # number of training episodes
    SAVE_INTERVAL = 10  # save the weights after every SAVE_INTERVAL episodes

    for e in range(EPISODES):
        done = False
        score = 0
        step = 0

        # Reset the environment and turn the initial state into a batch of one.
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            step += 1

            # Sample an action from the policy and advance one time step.
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Buffer the sample; the update happens once per episode.
            agent.append_sample(state, action, reward)
            score += reward
            state = next_state

        # The loop only exits when the episode is over.
        print("step : {:d}".format(step))

        # One policy update per finished episode.
        entropy = agent.train_model()

        # NOTE(review): {:3d} assumes the environment returns integer rewards.
        print("episode: {:3d} | score: {:3d} | entropy: {:.3f}\n".format(e, score, entropy))
        scores.append(score)
        episodes.append(e)

        # Save after every SAVE_INTERVAL completed episodes. The previous
        # `e % 10 == 0` test fired after episode 0 only when EPISODES is 10,
        # so the trained weights were never written.
        if (e + 1) % SAVE_INTERVAL == 0:
            agent.model.save_weights('save_model/model', save_format='tf')



get action
policy :  [0.35128155 0.09236992 0.10998524 0.03013023 0.41623297]
action :  1


get action
policy :  [0.31490862 0.07844752 0.08564157 0.02398107 0.49702123]
action :  0


get action
policy :  [0.31490862 0.07844752 0.08564157 0.02398107 0.49702123]
action :  0


get action
policy :  [0.3030972  0.05606236 0.06645385 0.0161083  0.55827826]
action :  0


get action
policy :  [0.33270317 0.0523723  0.08545759 0.02260525 0.5068616 ]
action :  2


get action
policy :  [0.26233962 0.06839727 0.14166896 0.02008184 0.5075124 ]
action :  2


get action
policy :  [0.26233962 0.06839727 0.14166896 0.02008184 0.5075124 ]
action :  0


get action
policy :  [0.22015318 0.10803002 0.12907319 0.02217794 0.5205657 ]
action :  0


get action
policy :  [0.22015318 0.10803002 0.12907319 0.02217794 0.5205657 ]
action :  2


get action
policy :  [0.21732093 0.14244615 0.24152943 0.03988928 0.35881412]
action :  0


get action
policy :  [0.22956817 0.14700937 0.18256284 0.02707037 0.4137892 ]




get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  2


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  2


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  0


get action
policy :  [0.4997421  0.06193255 0.20959602 0.06173042 0.16699891]
action :  0


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  2


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  4


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  2


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]




get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  0


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  4


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  4


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  4


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  2


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.48558015 0.06787569 0.21178453 0.04540343 0.18935616]
action :  0


get action
policy :  [0.48558015 0.06787569 0.21178453 0.04540343 0.18935616]
action :  4


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]




get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  2


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  4


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  0


get action
policy :  [0.4457747  0.07695714 0.22036237 0.07430881 0.182597  ]
action :  3


get action
policy :  [0.2264157  0.19240333 0.17953989 0.15379092 0.24785022]
action :  0


get action
policy :  [0.23860502 0.17262892 0.2175451  0.16907358 0.20214735]
action :  2


get action
policy :  [0.3022229  0.21720389 0.17711167 0.134653   0.16880848]
action :  3


get action
policy :  [0.2482496  0.22547993 0.16697821 0.10928475 0.2500075 ]




get action
policy :  [0.33752424 0.14342305 0.1485643  0.10330798 0.2671804 ]
action :  1


get action
policy :  [0.3266262  0.13596882 0.19756013 0.12936853 0.21047623]
action :  2


get action
policy :  [0.30756253 0.16172244 0.19987056 0.09211395 0.2387305 ]
action :  2


get action
policy :  [0.3182661  0.11062127 0.27737436 0.11918794 0.17455037]
action :  4


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  3


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  2


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  4


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  3


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]




get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  1


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  4


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  2


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  2


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  2


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  4


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  2


get action
policy :  [0.48558015 0.06787569 0.21178453 0.04540343 0.18935616]
action :  1


get action
policy :  [0.38711426 0.10759474 0.22496842 0.08378063 0.19654198]




get action
policy :  [0.3182661  0.11062127 0.27737436 0.11918794 0.17455037]
action :  0


get action
policy :  [0.4411158  0.05864149 0.30724826 0.0755465  0.11744794]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  2


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  2


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  0


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  4


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]




get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  0


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  4


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  2


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  1


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  0


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]




get action
policy :  [0.2740519  0.12138977 0.22653458 0.19830309 0.17972073]
action :  1


get action
policy :  [0.31388003 0.0905252  0.22699481 0.13472317 0.23387676]
action :  4


get action
policy :  [0.31388003 0.0905252  0.22699481 0.13472317 0.23387676]
action :  0


get action
policy :  [0.33900347 0.08023636 0.18459103 0.09422676 0.30194232]
action :  3


get action
policy :  [0.30839485 0.11105863 0.14441355 0.08210921 0.35402378]
action :  1


get action
policy :  [0.24282971 0.17213148 0.173012   0.05352458 0.35850227]
action :  2


get action
policy :  [0.28312197 0.16826613 0.14058274 0.03443081 0.37359834]
action :  0


get action
policy :  [0.25284278 0.21856017 0.14510399 0.03470916 0.3487839 ]
action :  4


get action
policy :  [0.3095467  0.18828888 0.15957494 0.05259145 0.28999805]
action :  4


get action
policy :  [0.3830008  0.11647435 0.20620337 0.05655083 0.23777066]
action :  2


get action
policy :  [0.3830008  0.11647435 0.20620337 0.05655083 0.23777066]




get action
policy :  [0.21732093 0.14244615 0.24152943 0.03988928 0.35881412]
action :  4


get action
policy :  [0.2069721  0.1741417  0.25381207 0.05434602 0.31072807]
action :  3


get action
policy :  [0.24490131 0.17586768 0.24217384 0.06842518 0.26863202]
action :  1


get action
policy :  [0.28941402 0.15258566 0.19598688 0.0744272  0.28758624]
action :  0


get action
policy :  [0.28941402 0.15258566 0.19598688 0.0744272  0.28758624]
action :  1


get action
policy :  [0.35075176 0.13305528 0.1908779  0.10019398 0.22512105]
action :  3


get action
policy :  [0.3967814  0.11119939 0.19585502 0.06573029 0.23043393]
action :  0


get action
policy :  [0.46824723 0.08067486 0.20245336 0.05547854 0.19314599]
action :  3


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]




get action
policy :  [0.33463877 0.15953851 0.13510562 0.0913944  0.2793227 ]
action :  4


get action
policy :  [0.3136857  0.18499054 0.14841801 0.04907484 0.30383086]
action :  1


get action
policy :  [0.3136857  0.18499054 0.14841801 0.04907484 0.30383086]
action :  4


get action
policy :  [0.3095467  0.18828888 0.15957494 0.05259145 0.28999805]
action :  0


get action
policy :  [0.25858232 0.2163027  0.15836757 0.07928483 0.2874625 ]
action :  0


get action
policy :  [0.22101034 0.23245451 0.18978676 0.16749494 0.18925346]
action :  3


get action
policy :  [0.22928473 0.1884141  0.25984296 0.11249126 0.20996696]
action :  2


get action
policy :  [0.24963896 0.19585519 0.23179427 0.1686199  0.15409166]
action :  2


get action
policy :  [0.22587025 0.24702595 0.1873425  0.18625516 0.15350619]
action :  2


get action
policy :  [0.20795406 0.18778202 0.21495096 0.2665435  0.12276945]
action :  1


get action
policy :  [0.21524604 0.16892886 0.21119009 0.25111267 0.15352228]




get action
policy :  [0.35596755 0.09821282 0.25654566 0.12776606 0.16150793]
action :  0


get action
policy :  [0.35207987 0.07301585 0.32391405 0.15008175 0.10090838]
action :  4


get action
policy :  [0.3213167  0.09387913 0.27222857 0.16778843 0.1447871 ]
action :  1


get action
policy :  [0.42012864 0.08423207 0.23363242 0.08414589 0.17786099]
action :  3


get action
policy :  [0.46824723 0.08067486 0.20245336 0.05547854 0.19314599]
action :  0


get action
policy :  [0.38318348 0.14574474 0.12224452 0.06429102 0.2845363 ]
action :  4


get action
policy :  [0.33463877 0.15953851 0.13510562 0.0913944  0.2793227 ]
action :  2


get action
policy :  [0.25194332 0.2086483  0.17428458 0.07672794 0.2883958 ]
action :  4


get action
policy :  [0.2147786  0.19579571 0.22493863 0.11209799 0.25238904]
action :  0


get action
policy :  [0.2482496  0.22547993 0.16697821 0.10928475 0.2500075 ]
action :  0


get action
policy :  [0.2482496  0.22547993 0.16697821 0.10928475 0.2500075 ]




get action
policy :  [0.3095467  0.18828888 0.15957494 0.05259145 0.28999805]
action :  0


get action
policy :  [0.3830008  0.11647435 0.20620337 0.05655083 0.23777066]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  1


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  2


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]




get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  2


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  4


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  4


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  4


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  4


get action
policy :  [0.3095467  0.18828888 0.15957494 0.05259145 0.28999805]
action :  0


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  2


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]




get action
policy :  [0.23273961 0.20040974 0.20653647 0.16801907 0.19229515]
action :  3


get action
policy :  [0.28617412 0.12013124 0.30827597 0.14909148 0.13632724]
action :  4


get action
policy :  [0.4411158  0.05864149 0.30724826 0.0755465  0.11744794]
action :  2


get action
policy :  [0.49221426 0.0443634  0.29356077 0.07557977 0.09428186]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  2


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  4


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  4


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  0


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]




get action
policy :  [0.33752424 0.14342305 0.1485643  0.10330798 0.2671804 ]
action :  4


get action
policy :  [0.30756253 0.16172244 0.19987056 0.09211395 0.2387305 ]
action :  4


get action
policy :  [0.30756253 0.16172244 0.19987056 0.09211395 0.2387305 ]
action :  0


get action
policy :  [0.48558015 0.06787569 0.21178453 0.04540343 0.18935616]
action :  2


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  4


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]




get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  0


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  2


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  0


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  0


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  1


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  1


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  0


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  2


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]




get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  4


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  3


get action
policy :  [0.4457747  0.07695714 0.22036237 0.07430881 0.182597  ]
action :  2


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  0


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  4


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  2


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  0


get action
policy :  [0.3266262  0.13596882 0.19756013 0.12936853 0.21047623]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]




get action
policy :  [0.38711426 0.10759474 0.22496842 0.08378063 0.19654198]
action :  0


get action
policy :  [0.4411158  0.05864149 0.30724826 0.0755465  0.11744794]
action :  3


get action
policy :  [0.4411158  0.05864149 0.30724826 0.0755465  0.11744794]
action :  4


get action
policy :  [0.49221426 0.0443634  0.29356077 0.07557977 0.09428186]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  4


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  2


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  1


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  4


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  0


get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]




get action
policy :  [0.38226297 0.13269135 0.15303656 0.06354304 0.26846606]
action :  0


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  3


get action
policy :  [0.40475246 0.12009455 0.14536352 0.06817472 0.26161474]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.46464217 0.08207568 0.1709165  0.07828242 0.2040832 ]
action :  0


get action
policy :  [0.4536964  0.06593764 0.2452751  0.07320448 0.16188638]
action :  0


get action
policy :  [0.48558015 0.06787569 0.21178453 0.04540343 0.18935616]
action :  0


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.557934   0.03579684 0.24747938 0.05242918 0.10636067]
action :  0


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  2


get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]




get action
policy :  [0.5724772  0.03491938 0.24902971 0.046352   0.09722167]
action :  4


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.5395013  0.04467595 0.2509095  0.04144753 0.12346572]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  0


get action
policy :  [0.48794115 0.06868863 0.1992838  0.05298097 0.19110544]
action :  4


get action
policy :  [0.3136857  0.18499054 0.14841801 0.04907484 0.30383086]
action :  0


get action
policy :  [0.3136857  0.18499054 0.14841801 0.04907484 0.30383086]
action :  0


get action
policy :  [0.3095467  0.18828888 0.15957494 0.05259145 0.28999805]
action :  2


get action
policy :  [0.3095467  0.18828888 0.15957494 0.05259145 0.28999805]
action :  4


get action
policy :  [0.3830008  0.11647435 0.20620337 0.05655083 0.23777066]
action :  4


get action
policy :  [0.3830008  0.11647435 0.20620337 0.05655083 0.23777066]
