In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

from pathlib import Path
import os

from tf_agents.drivers import dynamic_episode_driver
from tf_agents.eval import metric_utils
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.policies import policy_saver
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import q_network

from tf_agents.metrics import tf_metrics

import gym
import gfootball

from MyGFootballEnv import MyFootballEnv
from MyGFootballEnv import data_dic

from GFHumanPolicy import MyFootbalHumanPyPolicy

# Switch TensorFlow to TF2 behavior through the compat API before anything else runs.
tf.compat.v1.enable_v2_behavior()
# Fix TensorFlow's global random seed.
tf.random.set_seed(12)
# Print the GPU devices TensorFlow reports for this session.
tf.print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [2]:
# Observation keys handed to MyFootballEnv and used to pick the matching
# preprocessing layers for the Q-network.
# Keys currently disabled: designated, left_team_active, right_team_active,
# left_team_tired_factor, left_team_roles, right_team_tired_factor,
# right_team_yellow_card, right_team_roles, left_team_yellow_card.
cfg = [
    'ball_owned_team',
    'steps_left',
    'ball_owned_player',
    'game_mode',
    'active',
    'left_team',
    'ball_direction',
    'ball',
    'left_team_direction',
    'score',
    'right_team',
    'right_team_direction',
    'ball_rotation',
    'sticky_actions',
]

In [3]:
# Build two independent TF-wrapped copies of the same scenario: the first is
# used for data collection/training, the second for evaluation.
train_env, tf_env_eval = (
    tf_py_environment.TFPyEnvironment(MyFootballEnv(cfg, scenario='1_vs_1_easy'))
    for _ in range(2)
)

In [4]:
# Show the action spec followed by each component of the time-step spec.
print('action_spec:', train_env.action_spec())
_time_step_spec = train_env.time_step_spec()
for _field in ('observation', 'step_type', 'discount', 'reward'):
    print('time_step_spec.{}:'.format(_field), getattr(_time_step_spec, _field))

action_spec: BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(18, dtype=int32))
time_step_spec.observation: {'ball_owned_team': BoundedTensorSpec(shape=(1,), dtype=tf.int32, name='ball_owned_team', minimum=array(0, dtype=int32), maximum=array(2, dtype=int32)), 'steps_left': BoundedTensorSpec(shape=(1,), dtype=tf.float32, name='steps_left', minimum=array(0., dtype=float32), maximum=array(3.4028235e+38, dtype=float32)), 'ball_owned_player': BoundedTensorSpec(shape=(1,), dtype=tf.int32, name='ball_owned_player', minimum=array(0, dtype=int32), maximum=array(11, dtype=int32)), 'game_mode': BoundedTensorSpec(shape=(1,), dtype=tf.int32, name='game_mode', minimum=array(0, dtype=int32), maximum=array(6, dtype=int32)), 'active': BoundedTensorSpec(shape=(1,), dtype=tf.int32, name='active', minimum=array(-1, dtype=int32), maximum=array(11, dtype=int32)), 'left_team': BoundedTensorSpec(shape=(11, 2), dtype=tf.float32, name='left_team', minimum=

In [5]:
import time
class learningHelper:
    """Training harness that ties a TF-Agents agent to its environments.

    The helper owns a uniform replay buffer, a checkpointer and a policy
    saver, and exposes methods to collect episodes, train on the collected
    data, evaluate the agent's policy and persist/restore progress.
    """

    def __init__(self, train_env, test_env, agent, global_step, chkpdir='./',
        num_iterations=20000, collect_episodes=100, collect_steps_per_iteration=2,
        replay_buffer_capacity=20000, batch_size=64, log_interval=500, 
        num_eval_episodes=10, eval_interval = 5000, IsAutoStoreCheckpoint = True, collect_policy = None):
        """Create the replay buffer, checkpointer and policy saver.

        Args:
            train_env: environment used by the collection driver.
            test_env: environment used by the evaluation driver.
            agent: agent whose `collect_data_spec`, `policy`,
                `collect_policy` and `train` are used by this helper.
            global_step: step counter tracked by the checkpointer and printed
                while training.
            chkpdir: base directory; `checkpoint` and `policy`
                sub-directories are created inside it.
            num_iterations: stored on the instance; not read by this class.
            collect_episodes: episodes gathered per `collect_training_data` call.
            collect_steps_per_iteration: stored on the instance; not read by
                this class.
            replay_buffer_capacity: `max_length` of the replay buffer.
            batch_size: sample batch size used by `train_step`.
            log_interval: stored on the instance; not read by this class.
            num_eval_episodes: stored on the instance; not read by this class.
            eval_interval: evaluation period (in iterations) used by
                `train_agent_with_avg_ret_condition`.
            IsAutoStoreCheckpoint: when truthy, a checkpoint is written after
                every training iteration.
            collect_policy: policy used for data collection; when None the
                agent's own `collect_policy` is used.
        """
        # The agent must be stored before the collect policy is resolved:
        # the default policy is read from the agent itself.
        self.agent = agent
        self.collect_policy = self._resolve_collect_policy(agent, collect_policy)

        tf.compat.v1.enable_v2_behavior()

        self.IsAutoStoreCheckpoint = IsAutoStoreCheckpoint
        self.num_iterations = num_iterations
        self.collect_episodes = collect_episodes
        self.collect_steps_per_iteration = collect_steps_per_iteration
        self.replay_buffer_capacity = replay_buffer_capacity

        self.batch_size = batch_size
        self.log_interval = log_interval

        self.num_eval_episodes = num_eval_episodes
        self.eval_interval = eval_interval

        self.train_env = train_env
        self.test_env = test_env

        self.global_step = global_step

        # Replay buffer that stores the collected trajectories.
        self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.replay_buffer_capacity)

        # Output directories for checkpoints and for the exported policy.
        self.checkpoint_dir = os.path.join(chkpdir, 'checkpoint')
        Path(self.checkpoint_dir).mkdir(parents=True, exist_ok=True)
        self.policy_dir = os.path.join(chkpdir, 'policy')
        Path(self.policy_dir).mkdir(parents=True, exist_ok=True)

        self.train_checkpointer = common.Checkpointer(
            ckpt_dir=self.checkpoint_dir,
            max_to_keep=1,
            agent=self.agent,
            policy=self.agent.policy,
            replay_buffer=self.replay_buffer,
            global_step=self.global_step
            )

        self.tf_policy_saver = policy_saver.PolicySaver(self.agent.policy)

        self.local_step_counter = 0

    @staticmethod
    def _resolve_collect_policy(agent, collect_policy):
        """Return the user supplied collect policy, or the agent's own when None."""
        if collect_policy is None:
            print('selected agent collect_policy')
            return agent.collect_policy
        print('selected USER collect_policy')
        return collect_policy

    def evaluate_agent(self, n_episodes=100):
        """Run the agent's policy on the test environment and report the mean return.

        Args:
            n_episodes: number of evaluation episodes to run.

        Returns:
            The average return over the evaluated episodes as a NumPy value.
        """
        num_episodes = tf_metrics.NumberOfEpisodes()
        env_steps = tf_metrics.EnvironmentSteps()
        # Size the metric buffer to the number of episodes so the reported
        # mean covers every evaluated episode, not only the most recent ones
        # that fit in the default buffer.
        average_return = tf_metrics.AverageReturnMetric(
            buffer_size=max(int(n_episodes), 1))

        observers = [num_episodes, env_steps, average_return]

        eval_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            self.test_env, self.agent.policy, observers, num_episodes=n_episodes)
        eval_driver.run()

        mean_return = average_return.result().numpy()
        print('eval episodes = {0}: Average Return = {1}'.format(num_episodes.result().numpy(), mean_return))
        return mean_return

    def collect_training_data(self, verbose=0):
        """Clear the replay buffer and refill it with freshly collected episodes.

        Args:
            verbose: when greater than 0, the number of collected steps and
                episodes is printed after collection.
        """
        observers = [self.replay_buffer.add_batch]
        if verbose > 0:
            # Extra metrics are only tracked when they are going to be printed.
            num_episodes = tf_metrics.NumberOfEpisodes()
            env_steps = tf_metrics.EnvironmentSteps()
            observers += [num_episodes, env_steps]

        # Every call starts from an empty buffer, so training only sees the
        # episodes collected in this call.
        self.replay_buffer.clear()

        driver = dynamic_episode_driver.DynamicEpisodeDriver(
            self.train_env, self.collect_policy, observers, num_episodes=self.collect_episodes)

        # driver.run resets the environment and initializes the policy.
        driver.run()

        if verbose > 0:
            print('Number of Steps: ', env_steps.result().numpy())
            print('Number of Episodes: ', num_episodes.result().numpy())

    def train_step(self, n_steps):
        """Train the agent on `n_steps` batches sampled from the replay buffer.

        Args:
            n_steps: number of sampled batches / `agent.train` calls. When it
                is zero or negative nothing is trained.
        """
        if n_steps <= 0:
            # Without this guard there would be no loss to report and the
            # final print would fail on `None.loss`.
            print('No training steps requested; skipping training.')
            return

        # Expose the replay buffer as a dataset of two-step trajectories.
        AUTOTUNE = tf.data.experimental.AUTOTUNE
        dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=AUTOTUNE, 
            sample_batch_size=self.batch_size, 
            num_steps=2).prefetch(AUTOTUNE)

        iterator = iter(dataset)

        train_loss = None
        for _ in range(n_steps):
            # Sample a batch of data from the buffer and update the agent's network.
            experience, unused_info = next(iterator)
            train_loss = self.agent.train(experience)

        print('Global steps {}: Traning Loss {}'.format(self.global_step.numpy(), train_loss.loss))

    def train_agent(self, n_epoch):
        """Run `n_epoch` rounds of collect -> train -> (optional) checkpoint.

        Each round trains for as many steps as there are frames in the
        replay buffer after collection and prints its wall-clock duration.
        """
        for epoch in range(1, n_epoch + 1):
            start_time = time.time()
            self.collect_training_data(verbose=0)
            self.train_step(self.replay_buffer.num_frames().numpy())

            if self.IsAutoStoreCheckpoint:
                self.store_check_point()
            epoch_train_time = time.time() - start_time
            print('Epoch: {}, epoch train time: {}'.format( epoch, epoch_train_time ))

    def train_agent_with_avg_ret_condition(self, max_steps, min_avg_return, n_eval_steps=100):
        """Train until the evaluated average return exceeds `min_avg_return`.

        Args:
            max_steps: maximum number of collect/train iterations.
            min_avg_return: training stops as soon as an evaluation returns a
                strictly larger average return.
            n_eval_steps: number of episodes passed to `evaluate_agent`.
        """
        for i in range(max_steps):
            self.collect_training_data()
            self.train_step(1000)
            if self.IsAutoStoreCheckpoint:
                self.store_check_point()

            # Evaluate every `eval_interval` iterations, skipping iteration 0.
            if i > 0 and i % self.eval_interval == 0:
                avg_ret = self.evaluate_agent(n_eval_steps)
                if avg_ret > min_avg_return:
                    return

    def get_agent(self):
        """Return the wrapped agent."""
        return self.agent

    def store_check_point(self):
        """Write a checkpoint tagged with the current global step."""
        self.train_checkpointer.save(self.global_step)

    def restore_check_point(self):
        """Restore the latest checkpoint if one exists, otherwise initialize."""
        self.train_checkpointer.initialize_or_restore()
        restored_step = tf.compat.v1.train.get_global_step()
        # Keep the step passed to the constructor when no v1 global step is
        # registered; replacing it with None would break later saves and logs.
        if restored_step is not None:
            self.global_step = restored_step

    def save_policy(self):
        """Export the agent's policy to `self.policy_dir`."""
        self.tf_policy_saver.save(self.policy_dir)

In [6]:
# Sizes of the fully connected layers that follow the combined observation.
fc_layer_params = (100, 50)

_keras_layers = tf.keras.layers
_CategoryEncoding = _keras_layers.experimental.preprocessing.CategoryEncoding

# One preprocessing layer per known observation key; only the keys listed in
# `cfg` are handed to the Q-network below.
preprocessing_layers_d = {
    'left_team_yellow_card': _keras_layers.Dense(11),
    'left_team_roles': _CategoryEncoding(max_tokens=11),
    'ball_direction': _keras_layers.Dense(3),
    'left_team_tired_factor': _keras_layers.Dense(11),
    'left_team_active': _keras_layers.Dense(11),
    'right_team_tired_factor': _keras_layers.Dense(11),
    'ball': _keras_layers.Dense(3),
    'ball_owned_player': _CategoryEncoding(max_tokens=12),
    'ball_rotation': _keras_layers.Dense(1),
    'right_team_active': _keras_layers.Dense(11),
    'game_mode': _CategoryEncoding(max_tokens=7),
    'steps_left': _keras_layers.Dense(1),
    'right_team': _keras_layers.Flatten(),
    'right_team_yellow_card': _keras_layers.Dense(11),
    'left_team': _keras_layers.Flatten(),
    'ball_owned_team': _CategoryEncoding(max_tokens=3),
    'score': _keras_layers.Dense(2),
    'right_team_roles': _CategoryEncoding(max_tokens=11),
    'right_team_direction': _keras_layers.Flatten(),
    'left_team_direction': _keras_layers.Flatten(),
    'designated': _keras_layers.Dense(1),
    'active': _CategoryEncoding(max_tokens=12),
    'sticky_actions': _keras_layers.Dense(10),
}

# Keep only the layers for the observation keys that are actually enabled.
preprocessing_layers = {key: preprocessing_layers_d[key] for key in cfg}

# The per-key outputs are joined along the last axis.
preprocessing_combiner = _keras_layers.Concatenate(axis=-1)

q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    preprocessing_layers=preprocessing_layers,
    preprocessing_combiner=preprocessing_combiner,
    fc_layer_params=fc_layer_params,
)

In [7]:
learning_rate = 1e-3

# Global step counter shared by the agent and the checkpointing code.
global_step = tf.compat.v1.train.get_or_create_global_step()

# Optimizer used to update the Q-network.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# DQN agent built on top of the Q-network.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=global_step,
    target_update_period=10,
)
agent.initialize()

# (Optional) Wrap the training step in a TF function so it runs as a graph.
agent.train = common.function(agent.train)


In [8]:
# Human-driven policy built from the training environment's specs.
t_spec = train_env.time_step_spec()
a_spec = train_env.action_spec()
mp = MyFootbalHumanPyPolicy(action_spec=a_spec, time_step_spec=t_spec)

In [9]:
# Training helper that collects data with the human policy `mp`.
magent = learningHelper(
    train_env=train_env,
    test_env=tf_env_eval,
    agent=agent,
    global_step=global_step,
    collect_policy=mp,
    collect_episodes=3,
    eval_interval=5,
    replay_buffer_capacity=3500,
    batch_size=500,
)
# Resume from the latest checkpoint when one exists.
magent.restore_check_point()

selected USER collect_policy


In [10]:
%%time
# Run one collect-then-train epoch, then export the agent's policy to disk.
magent.train_agent(1)
#magent.train_agent_with_avg_ret_condition(10, 10000, 100)
magent.save_policy()

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Global steps 313248: Traning Loss 38602.86328125
Epoch: 1, epoch train time: 26.95299220085144
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: ./policy/assets


INFO:tensorflow:Assets written to: ./policy/assets


CPU times: user 42.6 s, sys: 4.5 s, total: 47.1 s
Wall time: 27.8 s


In [11]:
# Evaluate the agent's policy for a single episode on the evaluation environment.
avg_ret = magent.evaluate_agent(1)
print(avg_ret)

eval episodes = 1: Average Return = 0.0
0.0


In [12]:
#!pip3 install kaggle_environments

In [13]:
%%writefile submission.py
from kaggle_environments.envs.football.helpers import *

import tensorflow as tf
import numpy as np
from tf_agents.trajectories import time_step as ts

from MyGFootballEnv import MyFootballEnv
from MyGFootballEnv import data_dic


def convert_obs_to_tf_env_obs(obs):
    """Turn a raw observation into a batched tensor dictionary for the policy.

    Args:
        obs: observation whose 'players_raw' entry is passed to
            MyFootballEnv.convert_observation_to_tf.

    Returns:
        The converted observation, with every value replaced by a tensor that
        has an extra leading axis of size 1.
    """
    # Observation keys to extract. Disabled keys: designated,
    # left_team_active, right_team_active, left_team_tired_factor,
    # left_team_roles, right_team_tired_factor, right_team_yellow_card,
    # right_team_roles, left_team_yellow_card.
    selected_keys = [
        'ball_owned_team',
        'steps_left',
        'ball_owned_player',
        'game_mode',
        'active',
        'left_team',
        'ball_direction',
        'ball',
        'left_team_direction',
        'score',
        'right_team',
        'right_team_direction',
        'ball_rotation',
        'sticky_actions',
    ]

    converted = MyFootballEnv.convert_observation_to_tf(
        obs['players_raw'], data_dic, selected_keys)

    # Overwrite each entry in place with its tensor form plus a leading axis.
    for name in list(converted.keys()):
        as_tensor = tf.convert_to_tensor(converted[name])
        converted[name] = tf.expand_dims(as_tensor, axis=0)

    return converted

# Directory containing the exported policy, relative to the working directory.
policy_dir = './policy/'
# Load the policy once at import time; agent() reads this module-level object.
saved_policy = tf.compat.v2.saved_model.load(policy_dir)



def agent(obs):
    """Pick an action for the given observation using the loaded policy.

    Args:
        obs: raw observation; it is converted with convert_obs_to_tf_env_obs.

    Returns:
        A one-element list holding the chosen action as a built-in int.
    """
    # Placeholder step type, reward and discount (all zero) with a batch axis
    # of size 1; only the observation varies between calls.
    step_type = tf.constant(np.array([0], dtype=np.int32))
    reward = tf.constant(np.array([0], dtype=np.float32))
    discount = tf.constant(np.array([0], dtype=np.float32))
    observation = convert_obs_to_tf_env_obs(obs)
    time_step = ts.TimeStep(step_type, reward, discount, observation)

    action = saved_policy.action(time_step).action.numpy()[0]

    # Hand back a built-in int: a NumPy integer scalar is not an `int`
    # instance and is not JSON-serializable, which can trip callers that
    # validate or serialize the returned action.
    return [int(action)]


Overwriting submission.py


In [14]:
# Set up the Environment.
from kaggle_environments import make
env = make(
    "football",
    debug=True,
    configuration={
        "save_video": True,
        "scenario_name": "1_vs_1_easy",
        "running_in_notebook": True,
        "episodeSteps": 200,
    },
)
# Play one episode of the built-in idle agent against the submission and keep
# only the final step of the run.
output = env.run(["do_nothing", "./submission.py"])[-1]
for _idx, _side in enumerate(("Left", "Right")):
    _result = output[_idx]
    print('%s player: reward = %s, status = %s, info = %s' % (
        _side, _result['reward'], _result['status'], _result['info']))
env.render(mode="human", width=800, height=600)

Staring a new environment b450b556-860a-4608-8792-8d39c1c609df: with scenario: 1_vs_1_easy
Resetting environment b450b556-860a-4608-8792-8d39c1c609df: with scenario: 1_vs_1_easy
Left player: reward = 0, status = DONE, info = {}
Right player: reward = 0, status = DONE, info = {}
Received video link.
