In [1]:
import pickle
from agents.nnAgent import nnModel, train_test_splitter
from agents.policy_gradient import PolicyModelAgent
from agents.rule_based_agent import RuleBasedAgent
from agents.greedy_agent import GreedyAgent
import tensorflow as tf
from helpers import *

Loading environment football failed: No module named 'gfootball'


In [2]:
def compute_advantage(data, model, discount=0.85):
    """Return the one-step temporal-difference advantage for every transition.

    advantage = reward + discount * V(next_state) * (not done) - V(state)

    Parameters
    ----------
    data : mapping
        Must provide 'state' and 'next_state' (whatever ``model.predict``
        accepts), plus 'reward' and 'done' sequences with one entry per
        transition.
    model : object
        Anything exposing ``predict(inputs)`` that returns one value per
        transition.
    discount : float, optional
        Discount factor applied to the bootstrapped next-state value.
        Defaults to 0.85, the value that was previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Column vector of shape (n_transitions, 1).
    """
    # Force every term into a column vector. Without the reshape on the
    # current-state values, a 1-D ``predict`` result would broadcast
    # (N, 1) - (N,) into an (N, N) matrix and yield N*N "advantages".
    cur_state_val = np.asarray(model.predict(data['state'])).reshape(-1, 1)
    next_state_pred = np.asarray(model.predict(data['next_state'])).reshape(-1, 1)
    rewards = np.asarray(data['reward']).reshape(-1, 1)
    # Elementwise comparison: terminal transitions do not bootstrap.
    not_done = (np.asarray(data['done']) == False).reshape(-1, 1)

    td_target = rewards + discount*next_state_pred*not_done
    return (td_target - cur_state_val).reshape(-1, 1)

def compute_target_v(data, model, discount=0.85):
    """Return the one-step bootstrapped value target for every transition.

    target = reward + discount * V(next_state) * (not done)

    Parameters
    ----------
    data : mapping
        Must provide 'next_state' (whatever ``model.predict`` accepts), plus
        'reward' and 'done' sequences with one entry per transition.
    model : object
        Anything exposing ``predict(inputs)`` whose result can be reshaped to
        one value per transition.
    discount : float, optional
        Discount factor applied to the bootstrapped next-state value.
        Defaults to 0.85, the value that was previously hard-coded here.

    Returns
    -------
    list
        Nested list of shape (n_transitions, 1).
    """
    rewards = np.array(data['reward']).reshape(-1, 1)
    next_state_pred = model.predict(data['next_state']).reshape(-1, 1)
    # Elementwise comparison: terminal transitions do not bootstrap.
    not_done = (np.array(data['done']) == False).reshape(-1, 1)

    next_state_val = rewards + discount*next_state_pred*not_done
    return next_state_val.tolist()

In [3]:
# Clear the default TF1 graph before the networks are (re)created.
tf.compat.v1.reset_default_graph()


def _new_model(kind):
    """Create an ``nnModel`` of the given kind and build it.

    The argument 3 passed to ``build_model`` is the same for every network;
    its meaning is defined by ``nnModel.build_model`` (not visible here).
    """
    net = nnModel(kind)
    net.build_model(3)
    return net


# Creation order is kept: policy, the two value networks, then actor-critic.
policy_model = _new_model('policy')
v_model1 = _new_model('state_value')
v_model2 = _new_model('state_value')
actor_critic = _new_model('actor_critic')

In [4]:
# Load previously saved models by name. What ``load`` restores exactly is
# defined in nnModel (not visible here); the cell output shows it returns a
# Keras Functional model.
policy_model.load('rule_based_policy')
# Both value networks start from the same 'state_value' checkpoint: v_model1
# is the one trained in the update loop, v_model2 is the one used to compute
# targets/advantages and only periodically receives v_model1's weights.
v_model1.load('state_value')
v_model2.load('state_value')

<tensorflow.python.keras.engine.functional.Functional at 0x7f18b6f3d670>

In [5]:
# Starting point - weights from model trained on predicting rule-based actions.
# actor_critic is the network that later drives PolicyModelAgent and is
# fitted in the update loop; policy_model itself is not used after this cell
# in the visible notebook.
actor_critic.replace_weights(policy_model)


In [6]:
# NOTE(review): nb_passes and nb_files are not referenced in any visible
# cell - presumably leftovers from an earlier training setup; confirm
# before removing.
nb_passes = 5
nb_files = 40
# Epochs per update for the actor-critic fit and for the value-network fit.
nb_epochs_reinforce = 5
nb_epochs_v = 5
# Mini-batch size shared by both fits.
batch_size = 32

# v_model2 receives v_model1's weights whenever reload_it is a multiple of
# reload_weights; reload_it is incremented once per update in the training
# loop, so its value persists across re-runs of that cell.
reload_weights = 5
reload_it = 1

In [7]:
# Create the Hungry Geese environment. ``make`` is not imported explicitly
# in this notebook - presumably it arrives through ``from helpers import *``
# (TODO confirm). debug=True presumably enables the per-episode messages
# ("Opposite action", "Goose Collision", ...) seen in the training output.
env = make("hungry_geese", debug=True)
# Environment configuration; passed to every agent call next to the observation.
config = env.configuration

In [10]:
# --- Reward shaping -------------------------------------------------------
# Reward recorded for every step after which the episode simply continues.
step_reward = 1
# Base reward for a win; also returned when the goose is the longest at the
# final step.
winning_reward = 10
# Reward recorded when my goose has length 0 (it died); tripled when the
# goose survives to the final step without being strictly the longest.
losing_reward = -10
# Discount factor used by ``discounted`` and passed to ``process``.
# NOTE(review): compute_advantage / compute_target_v carry their own 0.85
# value - keep them in sync if this changes.
discount = 0.85

# --- Episode / run sizes --------------------------------------------------
nb_opponents = 3
# Episode horizon: the training loop plays steps 1 .. steps_per_ep - 1.
steps_per_ep = 200
# NOTE(review): the saved output below lists only 10 episodes per update, so
# this value was probably different when the cell was last executed.
num_episodes = 100
nb_updates = 1000


def discounted(discount_factor, nb_steps, step_reward):
    """Return the discounted sum of ``nb_steps`` identical rewards.

    Computes step_reward * (1 + d + d**2 + ... + d**(nb_steps-1)) with the
    recurrence total = step_reward + d*total. ``nb_steps`` is truncated with
    ``int``; zero or negative values give 0.
    """
    discounted_reward = 0
    for _ in range(int(nb_steps)):
        discounted_reward = step_reward + discount_factor*discounted_reward
    return discounted_reward


def step_200_reward(my_goose, longuest_opponent):
    """Reward for being alive at the final step of an episode.

    Parameters are goose lengths. Returns ``winning_reward`` when my goose is
    strictly longer than the longest opponent, otherwise ``3*losing_reward``
    (a tie counts as a loss).
    """
    return winning_reward if my_goose > longuest_opponent else 3*losing_reward


def win_game_reward(step, my_goose, longuest_opponent, max_steps=None):
    """Reward for an episode that ends early while my goose is still alive.

    Returns ``winning_reward`` plus the discounted sum of the per-step rewards
    for the steps that were left until the horizon.

    ``my_goose`` and ``longuest_opponent`` are accepted for signature parity
    with ``step_200_reward`` but are not used. ``max_steps`` is the episode
    horizon; it defaults to the notebook-level ``steps_per_ep`` (200), which
    replaces the literal 200 that used to be duplicated here.
    """
    horizon = steps_per_ep if max_steps is None else max_steps
    return winning_reward + discounted(discount, horizon - step, step_reward)

In [11]:
def longest_opponent_length(geese, my_index):
    """Return the length of the longest goose other than ``my_index``.

    ``geese`` is a sequence of sequences (one per goose). Returns 0 when
    there is no opponent or every opponent is empty.
    """
    return max((len(goose) for i, goose in enumerate(geese) if i != my_index),
               default=0)


def play_episode(env, config, model, nb_opponents, max_steps, rule_based_prob=1.0):
    """Play one game and record the learning agent's transitions.

    The learning agent (a ``PolicyModelAgent`` wrapping ``model``) is always
    agent 0. Each opponent is a ``RuleBasedAgent`` with probability
    ``rule_based_prob`` and a ``GreedyAgent`` otherwise; with the default of
    1.0 every opponent is rule-based because ``np.random.rand()`` is always
    below 1.

    Rewards come from the notebook-level ``step_reward``, ``losing_reward``,
    ``step_200_reward`` and ``win_game_reward``.

    Returns
    -------
    (steps, won) : (list of dict, bool)
        ``steps`` holds one dict per environment step with the keys
        'cur_state', 'action', 'reward', 'new_state', 'status' and 'done'
        ('new_state' is always the placeholder ''). ``won`` is True when the
        episode ended early with my goose alive, or when my goose was
        strictly the longest at the final step.
    """
    my_agent = PolicyModelAgent(model)
    opponents = [RuleBasedAgent() if np.random.rand() < rule_based_prob else GreedyAgent()
                 for _ in range(nb_opponents)]
    agents = [my_agent] + opponents

    # Only the first entry of the environment state list is read throughout;
    # every agent receives a copy of its observation with 'index' overwritten.
    observation = env.reset(num_agents=nb_opponents + 1)[0]['observation']

    steps = []
    won = False
    for step in range(1, max_steps):
        actions = []
        for i, agent in enumerate(agents):
            obs = deepcopy(observation)
            obs['index'] = i
            actions.append(agent(obs, config))

        state_dict = env.step(actions)[0]
        observation = state_dict['observation']
        my_goose_ind = observation['index']
        my_goose_length = len(observation['geese'][my_goose_ind])
        longest_opponent = longest_opponent_length(observation['geese'], my_goose_ind)

        # Action and status as reported by the environment for entry 0.
        action = state_dict['action']
        status = state_dict['status']
        done = status != "ACTIVE"

        if my_goose_length == 0:
            # My goose died.
            done = True
            reward = losing_reward
        elif (step+1) == max_steps:
            # Survived to the last step: outcome depends on relative length.
            done = True
            reward = step_200_reward(my_goose_length, longest_opponent)
            won = my_goose_length > longest_opponent
        elif done:
            # Episode ended early while my goose is still alive.
            reward = win_game_reward(step, my_goose_length, longest_opponent)
            won = True
        else:
            reward = step_reward

        # my_agent.stateSpace is whatever the agent stored during its call
        # above - presumably the encoded state it acted on (TODO confirm).
        steps.append({'cur_state': my_agent.stateSpace,
                      'action': action,
                      'reward': reward,
                      'new_state': '',
                      'status': status,
                      'done': done})
        if done:
            break

    return steps, won


for it in range(nb_updates):
    print(f'starting update {it}')

    # --- collect experience with the current policy ---
    episodes = []
    nb_wins = 0
    for ep in range(num_episodes):
        print('episode number: ', ep)
        steps, won = play_episode(env, config, actor_critic, nb_opponents, steps_per_ep)
        nb_wins += won
        episodes.append(steps)
    print(f'won {100*float(nb_wins)/num_episodes}% games')

    # ``process`` is called for its effect on ``episodes`` (return value
    # unused) - presumably it fills in discounted returns / next states;
    # see helpers.
    process(discount, episodes)
    data = training_data(episodes)

    # --- build training targets from the frozen value network (v_model2) ---
    X = data['state']
    y = data['y']
    v = compute_target_v(data, v_model2)
    X_train, X_test, y_train, y_test, v_train, v_test = train_test_splitter(X, y, 0.05, v=v)

    advantage = compute_advantage(data, v_model2)

    # --- policy update: all samples, shuffled, advantage as an extra input ---
    nb_samples = y.shape[0]
    shuffled = np.random.choice(nb_samples, nb_samples, replace=False)
    new_X = [col[shuffled] for col in X+[advantage]]

    actor_critic.fit(new_X,
                     y[shuffled],
                     X_test=None,
                     y_test=None,
                     epoch=nb_epochs_reinforce,
                     batch_size=batch_size)

    actor_critic.save('actor_critic')
    actor_critic.save_weights('actor_critic')

    # --- value update: v_model1 is trained on the bootstrapped targets ---
    v_model1.fit(X_train,
                 v_train,
                 X_test,
                 v_test,
                 epoch=nb_epochs_v,
                 batch_size=batch_size)

    v_model1.save('v_model_temp_diff')

    # Sync the target network every ``reload_weights`` updates. The schedule
    # is derived from ``it`` so it is the same on every run of this cell
    # instead of depending on a counter that survives between runs.
    if (it + 1) % reload_weights == 0:
        v_model2.replace_weights(v_model1)

starting update 0
episode number:  0
Opposite action: (0, <Action.NORTH: 1>, <Action.SOUTH: 3>)
episode number:  1
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  2
Goose Collision: NORTH
Goose Collision: WEST
episode number:  3
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  4
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  5
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  6
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  7
Opposite action: (0, <Action.NORTH: 1>, <Action.SOUTH: 3>)
episode number:  8
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  9
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
won 0.0% games
Train on 174 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: /home/charles/PycharmProjects/HungryGeese/models/actor_critic/assets
Train on 165 samples
Epoch 1/5



Validation r2: -0.34129225422074305
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: /home/charles/PycharmProjects/HungryGeese/models/v_model_temp_diff/assets
starting update 1
episode number:  0
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  1
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  2
Goose Collision: WEST
Goose Collision: NORTH
episode number:  3
Body Hit: (3, <Action.NORTH: 1>, 21, [32, 31, 20, 21])
Goose Collision: SOUTH
episode number:  4
Goose Collision: EAST
Opposite action: (0, <Action.EAST: 2>, <Action.WEST: 4>)
episode number:  5
Opposite action: (0, <Action.WEST: 4>, <Action.EAST: 2>)
episode number:  6
Goose Collision: NORTH
Goose Collision: EAST
episode number:  7
Opposite action: (0, <Action.NORTH: 1>, <Action.SOUTH: 3>)
episode number:  8
Body Hit: (1, <Action.EAST: 2>, 7, [6, 17, 16, 15, 4, 5, 71, 72, 73, 7])
episode number:  9
Goose Collision: SOUTH
Goose Collision: WEST
won 10.0% ga

KeyboardInterrupt: 

In [None]:
# actor_critic.load_weights('actor_critic')
# env = make("hungry_geese", debug = True)
# my_agent = PolicyModelAgent(actor_critic, True)
# env.run([my_agent] + ["greedy" for i in range(3)])
# env.render(mode="ipython", width=600, height=650)