# Agent 정의
* baseline 을 상속하거나
* 직접 구현한다

In [5]:
from core.common.agent import Agent
from core.common.util import *
from collections import deque
import os
import random
import numpy as np

# Inherit Agent class as a parent class
class DeepSARSAgent(Agent):
    """Deep SARSA agent: epsilon-greedy acting with a one-step on-policy update.

    ``forward`` picks an action for an observation and remembers both;
    ``backward`` stores the resulting (S, A, R, terminal) record and, once a
    previous record exists, fits the model towards
    ``R + discount_factor * Q(S', A')`` where A' is the action that was
    actually taken in S'.
    """

    # Detailed description about input parameters see API Doc
    def __init__(self, action_size, model, load_model=False, discount_factor=0.99, learning_rate=0.001,
                 epsilon=1, epsilon_decay=0.999, epsilon_min=0.01,
                 file_path='', training_mode=True, **kwargs):
        """Store the hyper-parameters and optionally restore saved weights.

        Args:
            action_size: Number of discrete actions (size of the model output).
            model: Model object providing ``predict``, ``fit``, ``compile``,
                ``load_weights`` and ``save_weights``.
            load_model: When true, weights are loaded from ``file_path`` if
                that file exists.
            discount_factor: Discount applied to Q(S', A') in the target.
            learning_rate: Stored on the instance; not used by this class.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Factor multiplied into epsilon on each update.
            epsilon_min: Epsilon stops decaying once it is not above this.
            file_path: Path of the weight file to load.
            training_mode: When false, epsilon is pinned to ``epsilon_min``
                and ``forward`` always acts greedily.
            **kwargs: Forwarded to the parent ``Agent`` constructor.
        """
        # Call constructor of parent's class
        super(DeepSARSAgent, self).__init__(**kwargs)

        # Set parameters from inputs
        self.load_model = load_model
        self.action_size = action_size
        self.model = model
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.training_mode = training_mode
        self.file_path = file_path

        # Set the epsilon as minimum value, if not training mode.
        if not self.training_mode:
            self.epsilon = self.epsilon_min

        # Memory for training (S, A, R, S', A'): holds at most the previous
        # and the current step record.
        self.observations = deque(maxlen=2)
        self.recent_observation = None
        self.recent_action = None

        if self.load_model and os.path.isfile(file_path):
            self.load_weights(file_path)

    # Get an action to be taken from observation
    def forward(self, observation):
        """Choose an action for ``observation`` and remember both.

        Args:
            observation: State as produced by the processor; converted with
                ``np.float32`` before being fed to the model.

        Returns:
            A one-element list containing the chosen action index.
        """
        if self.training_mode and np.random.rand() <= self.epsilon:
            # Take a random action with probability = epsilon
            action = random.randrange(self.action_size)
        else:
            # Take the best action with probability = (1 - epsilon)
            state = np.float32(observation)
            q_values = self.model.predict(np.expand_dims(state, 0))
            action = np.argmax(q_values[0])

        # Remember what was seen and done so backward() can build the record.
        self.recent_observation = observation
        self.recent_action = action

        return [action]

    # Updates the agent's network
    def backward(self, reward, terminal):
        """Record the latest step and train on the previous one.

        Args:
            reward: Reward received for the most recent action.
            terminal: Whether the most recent step ended the episode.

        Returns:
            None.
        """
        self.observations.append([self.recent_observation, self.recent_action, reward, terminal])

        # Nothing to learn from until a previous record exists.
        # NOTE(review): ``self.step`` is not set in this class; presumably it
        # is maintained by the ``Agent`` base class -- confirm.
        if self.step == 0:
            return

        # Decaying the epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # The oldest record is the transition to train on.
        state, action, reward, done = self.observations.popleft()
        state = np.float32(state)

        # S' and A' are the observation and action of the *current* step.
        # SARSA is on-policy, so A' must be the action that was really taken;
        # re-sampling it through forward() would pick a different action,
        # overwrite the remembered state/action, and return a list that is
        # not a valid scalar index.
        next_state = np.float32(self.recent_observation)
        next_action = self.recent_action

        # Compute Q values for the network update
        # Q(S,A) <- Q(S,A) + alpha(R + gamma Q(S',A') - Q(S,A))
        state_batch = np.expand_dims(state, 0)
        target = self.model.predict(state_batch)[0]
        if done:
            # No bootstrap term after a terminal step.
            target[action] = reward
        else:
            next_q = self.model.predict(np.expand_dims(next_state, 0))[0]
            target[action] = reward + self.discount_factor * next_q[next_action]

        target = np.reshape(target, [1, self.action_size])

        self.model.fit(state_batch, target, epochs=1, verbose=0)
        return

    # Compile the model
    def compile(self, optimizer, metrics=None):
        """Compile the wrapped model with an MSE loss.

        Args:
            optimizer: Optimizer passed to ``model.compile``.
            metrics: Accepted for interface compatibility; it is not
                forwarded to the model.
        """
        self.model.compile(optimizer=optimizer, loss='mse')
        return

    # Load trained weights from an HDF5 file.
    def load_weights(self, filepath):
        """Load model weights from ``filepath``."""
        self.model.load_weights(filepath)
        return

    # Save trained weights to an HDF5 file.
    def save_weights(self, filepath, overwrite):
        """Save model weights to ``filepath``.

        Args:
            filepath: Destination path.
            overwrite: Passed through to ``model.save_weights``.
        """
        self.model.save_weights(filepath, overwrite)
        return

Using TensorFlow backend.


# Env 생성
* env 를 생성한다. 
* shared memory 방식 혹은 다른 방식으로 게임과 통신한다

In [2]:
import saida_gym.starcraft.avoidZerglings as gym

In [None]:
# Instantiate the AvoidZerglings environment exposed by saida_gym.
# NOTE(review): the captured cell output suggests this attaches to the game
# through shared memory -- confirm against the saida_gym documentation.
env = gym.AvoidZerglings()

Initialize...
Shared Memory create 
SAIDA_ZVZ26880 Shared memory found.


# 커스터마이징
* state, reward 커스텀
* Processor 모듈을 건드리면 됨

In [1]:
from core.common.processor import Processor
import numpy as np
import math

class ReaverProcessor(Processor):
    """Processor that turns raw observations into a flat state and reshapes rewards."""

    # Process a performed action
    def process_action(self, action):
        """Return the action unchanged (hook for custom action handling)."""
        return action

    # Process the data given from environment after the step finished.
    def process_step(self, observation, reward, done, info):
        """Convert one environment step into (state, reward, done, info).

        The observation goes through ``process_observation`` and the reward
        through ``reward_reshape``; ``done`` and ``info`` pass through as is.
        """
        state_array = self.process_observation(observation)
        reward = self.reward_reshape(reward)
        return state_array, reward, done, info

    # Reshape your reward (Optional)
    def reward_reshape(self, reward):
        """Rescale the raw reward.

        A reward of exactly 1 is doubled and a reward of exactly -1 becomes
        -0.1; any other value is returned unchanged.
        """
        # Maybe I can give more incentive to the agent, when the agent has reached the goal.
        if reward == 1:
            reward = reward * 2
        # To discourage staying at the safe start position rather than moving,
        # a small negative reward is used instead of -1.
        elif reward == -1:
            reward = -0.1

        return reward

    # Process the raw observation given from environment
    def process_observation(self, observation, **kwargs):
        """Build the 23-element state vector from a raw observation.

        Layout: 5 values for the agent's own unit (position x/y, velocity
        x/y, angle) followed by 6 values for each of up to 3 enemy units
        (relative position x/y, velocity x/y, angle, accelerating flag).
        Slots of missing enemies stay zero; enemies beyond the third are
        ignored so the fixed-size array is never overrun.

        Args:
            observation: Raw observation exposing ``my_unit`` and ``en_unit``
                sequences of unit records.
            **kwargs: Unused.

        Returns:
            A 1-D ``numpy`` array of length 23.
        """
        # Raw observation data is the form of JSON(precisely, Protobuf).
        # Therefore, we need to transform it into an array or something can be calculated.
        AGENT_FEATURES = 5
        ENEMY_FEATURES = 6
        MAX_ENEMIES = 3
        STATE_SIZE = AGENT_FEATURES + MAX_ENEMIES * ENEMY_FEATURES
        s = np.zeros(STATE_SIZE)  # Zero-filled state array.
        me = observation.my_unit[0]  # Observation for Dropship (Agent)

        # Scale data set in order to learn fast and efficiently.
        # The scaling helpers are static methods of this class, so they must
        # be reached through ``self`` rather than as bare global names.
        s[0] = self.scale_pos(me.pos_x)  # X of coordinates
        s[1] = self.scale_pos(me.pos_y)  # Y of coordinates
        s[2] = self.scale_velocity(me.velocity_x)  # X of velocity
        s[3] = self.scale_velocity(me.velocity_y)  # Y of velocity
        s[4] = self.scale_angle(me.angle)  # Angle of head of dropship

        # Observation for Reavers (at most 3 of them)
        for ind, ob in enumerate(observation.en_unit):
            if ind >= MAX_ENEMIES:
                # The state only has room for MAX_ENEMIES units.
                break
            base = AGENT_FEATURES + ind * ENEMY_FEATURES
            s[base] = self.scale_pos(ob.pos_x - me.pos_x)  # X of relative coordinates
            s[base + 1] = self.scale_pos(ob.pos_y - me.pos_y)  # Y of relative coordinates
            s[base + 2] = self.scale_velocity(ob.velocity_x)  # X of velocity
            s[base + 3] = self.scale_velocity(ob.velocity_y)  # Y of velocity
            s[base + 4] = self.scale_angle(ob.angle)  # Angle of head of Reavers
            # NOTE(review): the accelerating flag is passed through the angle
            # scaling, as in the original code -- confirm this is intended.
            s[base + 5] = self.scale_angle(1 if ob.accelerating else 0)

        return s

    @staticmethod
    def scale_velocity(v):
        """Return the velocity unchanged."""
        return v

    @staticmethod
    def scale_angle(angle):
        """Return ``(angle - pi) / pi``."""
        return (angle - math.pi) / math.pi

    @staticmethod
    def scale_pos(pos):
        """Return ``pos / 16`` truncated to an int."""
        return int(pos / 16)
        

# 그 다음 할 것들..
* 신경망 만들고 인자로 넘겨주기
* 학습시키기