In [4]:
#Import Libraries:

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

from rl_glue import RLGlue

from SnakeGameEnv import SnakeEnv

from copy import deepcopy

import os

import shutil

from SmartSnake import BaseSnake


In [5]:
class ActionValueNetwork:
    """Fully connected network with one hidden ReLU layer that outputs action values.

    The weights are stored as a list with one dict per layer; each dict holds
    a weight matrix under 'W' and a bias row vector under 'b'.
    """

    def __init__(self, network_config):
        """Build and initialize the network.

        Args:
            network_config: dict read with .get() for the keys "state_dim",
                "num_hidden_units", "num_actions" and (optionally) "seed".
        """
        self.state_dim = network_config.get("state_dim")
        self.num_hidden_units = network_config.get("num_hidden_units")
        self.num_actions = network_config.get("num_actions")

        self.layer_sizes = np.array([self.state_dim, self.num_hidden_units, self.num_actions])

        self.rand_generator = np.random.RandomState(network_config.get("seed"))

        num_layers = len(self.layer_sizes) - 1
        self.weights = [dict() for _ in range(num_layers)]

        for i in range(num_layers):
            ins, out = self.layer_sizes[i], self.layer_sizes[i + 1]
            self.weights[i]['W'] = self.init_saxe(ins, out)
            self.weights[i]['b'] = np.zeros((1, out))

    def get_action_values(self, s):
        """Forward pass: return the action values for a batch of states.

        Args:
            s: array of shape (batch_size, state_dim).

        Returns:
            Array of shape (batch_size, num_actions).
        """
        W0, b0 = self.weights[0]['W'], self.weights[0]['b']

        psi = np.dot(s, W0) + b0
        # Element-wise ReLU. np.max(psi, 0) would instead reduce over the
        # batch axis and collapse the whole batch into a single row.
        x = np.maximum(psi, 0)

        W1, b1 = self.weights[1]['W'], self.weights[1]['b']
        q_values = np.dot(x, W1) + b1

        return q_values

    def td_update(self, s, delta_mat):
        """Back-propagate a matrix of TD errors through the network.

        Args:
            s: array of shape (batch_size, state_dim).
            delta_mat: array of shape (batch_size, num_actions) that is
                multiplied into the output-layer gradient.

        Returns:
            List with one dict per layer ('W' and 'b' entries), each averaged
            over the batch and shaped like the corresponding weight.
        """
        W0, b0 = self.weights[0]['W'], self.weights[0]['b']
        W1, b1 = self.weights[1]['W'], self.weights[1]['b']

        psi = np.dot(s, W0) + b0
        x = np.maximum(psi, 0)
        # Derivative of the ReLU with respect to its input.
        dx = (psi > 0).astype(float)

        updates = [dict() for _ in range(len(self.weights))]
        batch_size = s.shape[0]

        # Output layer.
        v = delta_mat
        updates[1]['W'] = np.dot(x.T, v) * 1. / batch_size
        updates[1]['b'] = np.sum(v, axis=0, keepdims=True) * 1. / batch_size

        # Hidden layer: push the error back through W1 and the ReLU.
        v = np.dot(v, W1.T) * dx
        updates[0]['W'] = np.dot(s.T, v) * 1. / batch_size
        updates[0]['b'] = np.sum(v, axis=0, keepdims=True) * 1. / batch_size

        return updates

    def init_saxe(self, rows, cols):
        """Return a (rows, cols) weight matrix with orthonormal rows or columns.

        A standard-normal matrix is orthogonalized with a QR decomposition
        (orthogonal initialization in the style of Saxe et al.).
        """
        tensor = self.rand_generator.normal(0, 1, (rows, cols))
        if rows < cols:
            # QR needs the tall orientation to yield a full set of
            # orthonormal columns; the result is transposed back below.
            tensor = tensor.T
        # QR decomposition: `tensor` becomes the orthonormal factor Q.
        tensor, r = np.linalg.qr(tensor)

        # Flip each column of Q by the sign of the matching diagonal entry of R.
        d = np.diag(r, 0)
        ph = np.sign(d)
        tensor *= ph

        if rows < cols:
            tensor = tensor.T
        return tensor

    def get_weights(self):
        """Return a deep copy of the weights so callers cannot mutate the network."""
        return deepcopy(self.weights)

    def set_weights(self, weights):
        """Replace the network weights with a deep copy of `weights`."""
        self.weights = deepcopy(weights)


In [6]:
# Sanity check: build a small network and print its layer sizes, which are
# stored as [state_dim, num_hidden_units, num_actions].
network_config = dict(state_dim=5, num_hidden_units=20, num_actions=3)
test_network = ActionValueNetwork(network_config)
print(test_network.layer_sizes)

[ 5 20  3]


In [7]:
#Adam Optimizer 
class Adam():
    """Adam optimizer for a list of per-layer {'W', 'b'} weight dicts."""

    def __init__(self, layer_sizes, optimizer_info):
        '''
        Args:
            layer_sizes: sequence of layer widths; consecutive pairs give the
                shape of each layer's 'W' (ins, out) and 'b' (1, out).
            optimizer_info: dict read with .get() for the keys
                "step_size", "beta_m", "beta_v" and "epsilon".
        '''
        self.layer_sizes = layer_sizes

        self.step_size = optimizer_info.get("step_size")
        self.beta_m = optimizer_info.get("beta_m")
        self.beta_v = optimizer_info.get("beta_v")
        self.epsilon = optimizer_info.get("epsilon")

        num_layers = len(self.layer_sizes) - 1

        # First (m) and second (v) moment estimates, one dict per layer.
        self.m = [dict() for _ in range(num_layers)]
        self.v = [dict() for _ in range(num_layers)]

        for i in range(num_layers):
            ins, out = self.layer_sizes[i], self.layer_sizes[i + 1]
            self.m[i]['W'] = np.zeros((ins, out))
            self.m[i]['b'] = np.zeros((1, out))
            self.v[i]['W'] = np.zeros((ins, out))
            self.v[i]['b'] = np.zeros((1, out))

        # Running products beta_m^t and beta_v^t used for bias correction.
        # They must be scalars seeded with the betas; aliasing them to the
        # moment lists breaks both the division and the in-place multiply.
        self.beta_update_m = self.beta_m
        self.beta_update_v = self.beta_v

    def adam_optimize(self, weights, grads):
        """Apply one Adam step to `weights` in place and return them.

        Args:
            weights: list of per-layer dicts with 'W' and 'b' arrays.
            grads: list with the same structure holding the update direction;
                it is *added* to the weights, scaled by the Adam step.

        Returns:
            The same `weights` list, after the update.
        """
        for i in range(len(self.layer_sizes) - 1):
            for param in weights[i].keys():
                g = grads[i][param]
                self.m[i][param] = self.beta_m * self.m[i][param] + (1 - self.beta_m) * g
                self.v[i][param] = self.beta_v * self.v[i][param] + (1 - self.beta_v) * g ** 2

                # Bias-corrected moment estimates.
                m_hat = self.m[i][param] / (1 - self.beta_update_m)
                v_hat = self.v[i][param] / (1 - self.beta_update_v)

                weights[i][param] += self.step_size * m_hat / (np.sqrt(v_hat) + self.epsilon)

        # Advance the beta powers once per optimization step.
        self.beta_update_m *= self.beta_m
        self.beta_update_v *= self.beta_v

        return weights
        

In [8]:
#Softmax Policy
def softmax(action_val, tau=1.0):
    """Row-wise softmax over action values with temperature `tau`.

    Args:
        action_val: 2-D array of shape (batch_size, num_actions).
        tau: temperature; the action values are divided by it first.

    Returns:
        Array of the same shape whose rows are probability distributions.
    """
    preferences = np.asarray(action_val) / tau

    # Subtract each row's own maximum before exponentiating. A single global
    # maximum makes rows far below it underflow to all zeros, giving 0/0 = NaN.
    max_pref = np.max(preferences, axis=1, keepdims=True)

    exp_pref = np.exp(preferences - max_pref)
    probs = exp_pref / np.sum(exp_pref, axis=1, keepdims=True)

    return probs

In [9]:
# Softmax sanity check: draw a random 2x3 batch of action values, print it,
# then print the resulting per-row action probabilities.
tau = 0.5
action_val = np.random.normal(loc=3, scale=1, size=(2, 3))
print(action_val)

sm = softmax(action_val, tau=tau)
print(sm)

[[2.28379663 3.70149596 2.49853409]
 [3.25031134 3.26791286 1.95532675]]
[[0.05108911 0.87041499 0.0784959 ]
 [0.47374228 0.49071646 0.03554126]]


In [10]:
def Get_TD_error(states, next_states, actions, reward, discount, terminals, network, current_q, tau):
    """Compute the Expected Sarsa TD error for a batch of transitions.

    Args:
        states: batch of states passed to `network.get_action_values`.
        next_states: batch of successor states passed to
            `current_q.get_action_values`.
        actions: per-transition action indices, used to index the columns
            of the current action-value matrix.
        reward: per-transition rewards.
        discount: discount factor applied to the next-state value.
        terminals: per-transition flags; 1 zeroes out the next-state value.
        network: network being trained (provides the current estimates).
        current_q: network used to evaluate the next states (the target).
        tau: softmax temperature of the policy.

    Returns:
        Vector with one TD error per transition.
    """
    # Action values of the successor states under the target network.
    q_next_mat = current_q.get_action_values(next_states)

    # Expected value of each next state under the softmax policy,
    # forced to zero for terminal transitions.
    probs_mat = softmax(q_next_mat, tau)
    v_next_vec = np.sum(q_next_mat * probs_mat, axis=1) * (1 - terminals)

    # Expected Sarsa target: r + gamma * E_pi[Q(s', .)].
    target_vec = reward + discount * v_next_vec

    # Current estimate for the action actually taken in each transition.
    q_mat = network.get_action_values(states)
    batch_indices = np.arange(q_mat.shape[0])
    q_vec = q_mat[batch_indices, actions]

    delta_vec = target_vec - q_vec

    return delta_vec

In [11]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with random minibatch sampling."""

    def __init__(self, size, minibatch_size, seed):
        """
        Args:
            size: maximum number of transitions kept in the buffer.
            minibatch_size: number of transitions returned by `sample`.
            seed: seed for the sampling random generator.
        """
        self.buffer = []
        self.rand_generator = np.random.RandomState(seed)
        self.minibatch_size = minibatch_size
        self.max_size = size

    def append(self, state, action, reward, terminal, next_state):
        """Store one transition, evicting the oldest one when the buffer is full."""
        if len(self.buffer) == self.max_size:
            del self.buffer[0]
        self.buffer.append([state, action, reward, terminal, next_state])

    def sample(self):
        """Return `minibatch_size` transitions drawn uniformly with replacement."""
        index = self.rand_generator.choice(np.arange(len(self.buffer)), size=self.minibatch_size)
        return [self.buffer[i] for i in index]

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [12]:
def optimize_network(experiences, discount, optimizer, network, current_q, tau):
    """Run one optimization step of `network` on a minibatch of transitions.

    Args:
        experiences: list of [state, action, reward, terminal, next_state]
            transitions, as stored by ReplayBuffer.
        discount: discount factor.
        optimizer: Adam instance used to update the weights.
        network: ActionValueNetwork being trained; its weights are replaced.
        current_q: network used to compute the targets.
        tau: softmax temperature.
    """
    states, actions, rewards, terminals, next_states = map(list, zip(*experiences))

    # Each stored state is a (1, state_dim) row, so concatenating stacks
    # them into a (batch_size, state_dim) matrix.
    states = np.concatenate(states)
    next_states = np.concatenate(next_states)
    rewards = np.array(rewards)
    terminals = np.array(terminals)
    batch_size = states.shape[0]

    delta_vec = Get_TD_error(states, next_states, actions, rewards, discount, terminals, network, current_q, tau)

    batch_indices = np.arange(batch_size)

    # Scatter each TD error into the column of the action that was taken;
    # every other action receives a zero error.
    delta_mat = np.zeros((batch_size, network.num_actions))
    delta_mat[batch_indices, actions] = delta_vec

    # Use the method names the classes actually define:
    # ActionValueNetwork.td_update and Adam.adam_optimize.
    td_update = network.td_update(states, delta_mat)

    weights = optimizer.adam_optimize(network.get_weights(), td_update)

    network.set_weights(weights)
    

In [13]:
class Agent(BaseSnake):
    """Expected Sarsa agent with a softmax policy, experience replay and Adam."""

    def __init__(self):
        self.name = "Expected Sarsa Agent"

    def agent_init(self, agent_config):
        """Create the replay buffer, network and optimizer from `agent_config`.

        Required keys: 'buffer_size', 'buffer_minibatch_size',
        'network_config', 'optimizer_config', 'tau', 'gamma', 'num_replay'.
        Optional key: 'seed'.
        """
        self.replay_buffer = ReplayBuffer(agent_config['buffer_size'],
                                          agent_config['buffer_minibatch_size'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])
        # The layer sizes live on the network, and the rest of the class
        # reads the optimizer back as `self.optimizer`.
        self.optimizer = Adam(self.network.layer_sizes, agent_config['optimizer_config'])
        self.tau = agent_config['tau']
        self.discount = agent_config['gamma']
        self.num_replay = agent_config['num_replay']
        self.num_actions = agent_config['network_config']['num_actions']

        self.rand_generator = np.random.RandomState(agent_config.get('seed'))

        self.sum_reward = 0
        self.episode_steps = 0

        self.last_action = None
        self.last_state = None

    def policy(self, state):
        """Sample an action from the softmax distribution over action values.

        Args:
            state: array of shape (1, state_dim).
        """
        action_value = self.network.get_action_values(state)
        probs_batch = softmax(action_value, self.tau)
        action = self.rand_generator.choice(self.num_actions, p=probs_batch.squeeze())

        return action

    def _replay_updates(self):
        """Run `num_replay` minibatch updates once the buffer holds enough data."""
        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            # Freeze a copy of the network so every replay step in this
            # call is trained against the same targets.
            current_q = deepcopy(self.network)
            for _ in range(self.num_replay):
                experiences = self.replay_buffer.sample()
                optimize_network(experiences, self.discount, self.optimizer,
                                 self.network, current_q, self.tau)

    def agent_start(self, state):
        """Begin an episode: reset the counters and return the first action."""
        self.sum_reward = 0
        self.episode_steps = 0

        self.last_state = np.array([state])
        self.last_action = self.policy(self.last_state)

        return self.last_action

    def agent_step(self, reward, state):
        """Record a non-terminal transition, learn from replay, return the next action."""
        self.sum_reward += reward
        self.episode_steps += 1

        state = np.array([state])

        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, 0, state)

        self._replay_updates()

        self.last_action = action
        self.last_state = state

        # Hand the chosen action back to the caller, as agent_start does.
        return action

    def agent_end(self, reward):
        """Record the terminal transition and learn from replay."""
        self.sum_reward += reward
        self.episode_steps += 1

        # Placeholder successor with the same shape as a stored state. Its
        # value is masked out by the terminal flag in the TD target, and the
        # matching shape keeps np.concatenate over next states working.
        state = np.zeros_like(self.last_state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, 1, state)

        self._replay_updates()

    def agent_message(self, message):
        """Answer 'get_sum_reward' with the reward accumulated this episode."""
        if message == 'get_sum_reward':
            return self.sum_reward
        else:
            raise Exception('Unrecognize Message')



In [14]:
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    """Train `agent` on `environment` and save the per-episode reward sums.

    Args:
        environment: environment class handed to RLGlue.
        agent: agent class handed to RLGlue.
        environment_parameters: currently unused; the environment only
            receives a dict holding the run seed.
        agent_parameters: agent configuration. The run seed is written into
            it (top level and under 'network_config') before every run.
        experiment_parameters: dict with 'num_runs', 'num_episodes' and
            'timeout' (forwarded to `rl_episode`).

    Returns:
        Array of shape (num_runs, num_episodes) with the sum of rewards of
        every episode. The same array is saved under results/ and the
        directory is archived to results.zip.
    """
    rl_glue = RLGlue(environment, agent)

    num_runs = experiment_parameters['num_runs']
    num_episodes = experiment_parameters['num_episodes']

    agent_sum_reward = np.zeros((num_runs, num_episodes))

    env_info = {}

    agent_info = agent_parameters

    for run in range(1, num_runs + 1):
        # Seed the agent, its network and the environment with the run index.
        agent_info['seed'] = run
        agent_info['network_config']['seed'] = run
        env_info['seed'] = run

        rl_glue.rl_init(agent_info, env_info)

        # Episodes are 1-based here so `episode - 1` indexes the result row;
        # the upper bound is inclusive so the last column is filled too.
        for episode in tqdm(range(1, num_episodes + 1)):

            rl_glue.rl_episode(experiment_parameters['timeout'])

            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward

    save_name = "{}".format(rl_glue.agent.name)
    os.makedirs('results', exist_ok=True)
    # Persist the reward matrix, not the agent configuration.
    np.save('results/sum_reward_{}'.format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')

    return agent_sum_reward



In [17]:
# Experiment setup: number of independent runs, episodes per run, and the
# per-episode timeout passed to rl_episode.
experiment_parameters = {
    'num_runs': 1,
    'num_episodes': 300,
    'timeout': 1000
}

environment_parameters = {

}

current_env = SnakeEnv

# Keys must match what Agent.agent_init reads: 'buffer_size',
# 'buffer_minibatch_size', 'num_replay', 'gamma' and 'tau'.
agent_parameters = {
    'network_config': {
        'state_dim': 8,
        'num_hidden_units': 256,
        'num_actions': 4,
    },
    'optimizer_config': {
        'step_size': 1e-3,
        'beta_m': 0.9,
        'beta_v': 0.999,
        'epsilon': 1e-8,
    },
    'buffer_size': 50000,
    'num_replay': 8,
    'buffer_minibatch_size': 4,
    'gamma': 0.99,
    # Softmax temperature. This value used to sit under a top-level
    # 'epsilon' key that the agent never reads. NOTE(review): confirm that
    # 0.001 is the intended temperature.
    'tau': 0.001
}

current_agent = Agent

# Launch the experiment once; a second identical call would only retrain
# from scratch and overwrite the saved results.
training_NN = run_experiment(current_env, current_agent, environment_parameters, agent_parameters, experiment_parameters)

TypeError: env_init() takes 1 positional argument but 2 were given