# Collect training data from episodes

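The next step beyond heuristics and careful mathematics is to train a machine learning model to choose which lever we should pull. This notebook shows how to extract some basic training features from episode replays: the replays are read from previously scraped JSON files (in the same step format that the `kaggle_environments` package produces for locally run matches), filtered, and converted into one training table.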


In [1]:
import pandas as pd
from kaggle_environments import make
import os
import json
from tqdm.notebook import tqdm
import numpy as np
import gc 
from itertools import groupby
import pickle
import math

def encode_list(s_list):
    """Return the length of the longest run of consecutive equal items.

    Args:
        s_list (iterable) - sequence of comparable items (e.g. the actions
                            one agent played, in order)

    Returns:
        int - size of the longest group of adjacent equal items, or 0 when
              ``s_list`` is empty (previously an empty input raised
              ``ValueError`` from ``max``).
    """
    # groupby only merges *adjacent* equal items, which is exactly a run.
    return max((sum(1 for _ in group) for _, group in groupby(s_list)), default=0)

def use_this_match(replay, max_ = 100, length_match = 2000):
    """Decide whether an episode replay should be used for training.

    A replay is kept only when it has exactly ``length_match`` steps and
    neither agent repeated the same action ``max_`` or more times in a row.

    Args:
        replay (list) - the episode steps; ``replay[x][i]['action']`` must be
                        the action of agent ``i`` (0 or 1) at step ``x``
        max_ (int) - exclusive upper bound on the longest run of identical
                     consecutive actions allowed for either agent
        length_match (int) - required number of steps in the replay

    Returns:
        bool - True when the replay passes both checks.
    """
    # The length check is cheap, so do it before scanning the actions.
    if len(replay) != length_match:
        return False

    # Read the actions from the ``replay`` argument itself; the previous
    # version read a notebook-level global (``data``) and ignored ``replay``.
    action_list_0 = [step[0]['action'] for step in replay]
    action_list_1 = [step[1]['action'] for step in replay]
    max_repeat = max(encode_list(action_list_0), encode_list(action_list_1))

    return max_repeat < max_
    
# Directory holding the scraped episode replays (JSON files) together with
# the pickled winners lookup.
agent_dir = '../input/agent-scrape-everything-step-2'

# Keep only the replay files: names containing '.json', never the winners pickle.
record = [
    entry
    for entry in os.listdir(agent_dir)
    if entry != 'winner_dic.pkl' and '.json' in entry
]

# NOTE(review): unpickling can execute arbitrary code; these inputs are
# assumed to be trusted datasets -- confirm before pointing at other sources.
with open(os.path.join(agent_dir, 'winner_dic.pkl'), 'rb') as file:
    winner_dic = pickle.load(file)

# Lookup tables used later to map an (episode, agent index) pair to a
# submission id and a submission id to its updated score.
episode_agents = pd.read_pickle(
    '../input/mapping-sub-score/episode_agents.pkl'
)
last_submission_update = pd.read_pickle(
    '../input/mapping-sub-score/last_submission_update.pkl'
)

Loading environment football failed: No module named 'gfootball'


# Generate example data

In [2]:
def log_training(result, winner, n_machines = 100, decay_rate = .97):
    """Records training data for the machines pulled in each round of an episode.

    Generates a training dataset to support prediction of the current
    payout ratio for a given machine. Every round after the initial step
    contributes two rows, one per entry of ``lastActions`` (agent 0's machine
    first, then agent 1's). All "self"/"opp" features are computed from the
    point of view of the agent selected by ``winner``.

    Args:
       result ([[dict]]) - output from all rounds provided as output of
                           env.run([agent1, agent2]); ``result[0]`` is the
                           initial step and is skipped
       winner (int) - index (0 or 1) of the agent whose perspective is logged;
                      converted with ``int()``
       n_machines (int) - number of machines
       decay_rate (float) - per-pull discount used by the discounted features
                            and by est_2 / est_3 (previously this argument was
                            silently overwritten with .97)

    Returns:
       training_data (pd.DataFrame) - one row per (round, pulled machine), with:
           "round_num"      : 1-based round index
           "n_pulls_self"   : number of pulls on this machine so far by the agent
           "n_success_self" : number of rewards from this machine by the agent
           "n_failure_self" : number of failures from this machine by the agent
           "discounted_cumulative_success": sum over the agent's pulls of
                              reward * decay_rate ** n_pulls_tot (at pull time)
           "discounted_total_success": n_success_self * decay_rate ** n_pulls_tot,
                              refreshed only when the agent pulls the machine
           "n_pulls_opp"    : number of pulls on this machine by the other player
           "n_pulls_tot"    : total number of pulls on this machine
           "ratio_self"     : n_success_self/n_pulls_self
           "ratio_opp"      : n_pulls_opp/(round_num + 1)
           "est_1"          : (win - loss + opp - 1.5 * (opp > 0))/n_pulls_tot
           "est_2"          : (win - loss + opp - 1.5 * (opp > 0) + repeat_opp)
                              /n_pulls_tot * decay_rate ** n_pulls_tot
           "est_3"          : ratio_self * decay_rate ** n_pulls_tot
           "repeat_opp"     : consecutive repeats of this machine by the
                              opponent (reset when the opponent switches)
           "n_pulls_self_last_10": times the agent chose this machine in its
                              last 10 actions
           "n_pulls_opp_last_10": times the opponent chose this machine in its
                              last 10 actions
           "ratio_self_last_10_selected": reward ratio over the agent's last 10
                              pulls of this machine (0 if never pulled)
           "payout"         : threshold reported for this machine in the
                              round's observation

    """
    agent_num = int(winner)
    opp_num = 1 - agent_num
    num_total_row = (len(result) - 1) * 2

    # Column order and storage type of the returned frame.
    feature_dtypes = {
        'round_num': np.int16,
        'n_pulls_self': np.int16,
        'n_success_self': np.int16,
        'n_failure_self': np.int16,
        'discounted_cumulative_success': np.float32,
        'discounted_total_success': np.float32,
        'n_pulls_opp': np.int16,
        'n_pulls_tot': np.int16,
        'ratio_self': np.float32,
        'ratio_opp': np.float32,
        'est_1': np.float32,
        'est_2': np.float32,
        'est_3': np.float32,
        'repeat_opp': np.int16,
        'n_pulls_self_last_10': np.int8,
        'n_pulls_opp_last_10': np.int8,
        'ratio_self_last_10_selected': np.float32,
        'payout': np.float32,
    }

    # Output buffers, one typed array per column. Filling plain arrays and
    # building the frame once avoids thousands of slow DataFrame.at writes.
    rows = {
        name: np.zeros(num_total_row, dtype = dtype)
        for name, dtype in feature_dtypes.items()
    }

    # Running per-machine value of every feature except the two that are not
    # machine state (round number and payout).
    state = {
        name: np.zeros(n_machines, dtype = dtype)
        for name, dtype in feature_dtypes.items()
        if name not in ('round_num', 'payout')
    }
    n_pulls_self = state['n_pulls_self']
    n_success_self = state['n_success_self']
    n_failure_self = state['n_failure_self']
    n_pulls_opp = state['n_pulls_opp']
    n_pulls_tot = state['n_pulls_tot']
    ratio_self = state['ratio_self']
    repeat_opp = state['repeat_opp']

    # Track winnings: rewards in the replay are cumulative.
    last_reward_count = 0

    # Machine the opponent pulled in the previous round (-1 = none yet).
    last_opp_action = -1

    # used for n_pulls_self_last_10, n_pulls_opp_last_10
    self_memory, opp_memory = [], []

    # used for ratio_self_last_10_selected
    self_memory_per_action = [[] for _ in range(n_machines)]

    row_ii = 0
    for round_num, res in enumerate(result[1:], start = 1):
        curr_total_reward = res[agent_num]['reward']
        observation = res[0]['observation']
        last_m_indices = observation['lastActions']
        thresholds = np.array(observation['thresholds'], dtype = np.float32)

        last_reward = curr_total_reward - last_reward_count
        last_reward_count = curr_total_reward

        m_index = last_m_indices[agent_num]
        opp_index = last_m_indices[opp_num]

        # update memory information
        self_memory_per_action[m_index].append(last_reward)
        self_memory.append(m_index)
        opp_memory.append(opp_index)

        # update pulls (a machine chosen by both agents is counted twice in tot)
        n_pulls_self[m_index] += 1
        n_pulls_opp[opp_index] += 1
        n_pulls_tot[m_index] += 1
        n_pulls_tot[opp_index] += 1

        # update successes / failures
        n_success_self[m_index] += last_reward
        n_failure_self[m_index] += (1 - last_reward)

        # discounted features
        state['discounted_cumulative_success'][m_index] += (
            last_reward * decay_rate ** n_pulls_tot[m_index]
        )
        state['discounted_total_success'][m_index] = (
            n_success_self[m_index] * decay_rate ** n_pulls_tot[m_index]
        )

        # ratio feature
        ratio_self[m_index] = n_success_self[m_index]/n_pulls_self[m_index]

        # opponent repeat feature
        if opp_index == last_opp_action:
            repeat_opp[opp_index] += 1
        else:
            # Skip the reset while there is no previous action: indexing with
            # the -1 sentinel would touch the last machine instead.
            if last_opp_action >= 0:
                repeat_opp[last_opp_action] = 0
            last_opp_action = opp_index

        for index in last_m_indices:
            state['ratio_opp'][index] = n_pulls_opp[index]/(1 + round_num)

            # est features
            base = (
                n_success_self[index] - n_failure_self[index]
                + n_pulls_opp[index] - 1.5 * (n_pulls_opp[index]>0)
            )
            discount = math.pow(decay_rate, n_pulls_tot[index])

            state['est_1'][index] = base/n_pulls_tot[index]
            state['est_2'][index] = (
                (base + repeat_opp[index])/n_pulls_tot[index]
            ) * discount
            state['est_3'][index] = ratio_self[index] * discount

            # number of self/opp pulls in last 10 actions
            state['n_pulls_self_last_10'][index] = self_memory[-10:].count(index)
            state['n_pulls_opp_last_10'][index] = opp_memory[-10:].count(index)

            # ratio of reward in the last 10 times the agent selected the machine;
            # 0 if it was never selected
            replay_self_last_10 = self_memory_per_action[index][-10:]
            state['ratio_self_last_10_selected'][index] = (
                0 if len(replay_self_last_10) == 0
                else sum(replay_self_last_10)/len(replay_self_last_10)
            )

            # snapshot the machine's current state into the output row
            for name, per_machine in state.items():
                rows[name][row_ii] = per_machine[index]
            rows['round_num'][row_ii] = round_num
            rows['payout'][row_ii] = thresholds[index]

            row_ii += 1

    return pd.DataFrame(rows)


In [3]:
# Ids of episodes already converted, and the per-(episode, winner) feature frames.
set_game = []
training_data = []

print('Beginnin generating data...')

for rec in tqdm(record):
    # Load the replay and its winners; skip the file if anything is missing.
    try:
        game_id = np.int32(rec.replace('.json', ''))

        with open(os.path.join(agent_dir, rec)) as file:
            data = json.load(file)

        team_info = data['info']
        winner_list = winner_dic[rec]

    except Exception as err:
        # Report the file name: game_id may be unset (or left over from the
        # previous file) when parsing the name is what failed.
        print(f'Cannot read data, {rec}: {err!r}\n')
        continue

    if data['id'] in set_game:
        print(f'Duplicated game, {game_id}, {team_info}\n')
        continue

    if not use_this_match(data['steps']):
        print(f'Too many repeat or not enough round, {game_id}, {team_info}\n')
        continue

    for winner in winner_list:
        try:
            # .item() requires exactly one matching row (raises otherwise),
            # like the deprecated int(Series) call it replaces.
            sub_id = int(
                episode_agents
                .query(f'EpisodeId == {game_id} and Index == {winner}')['SubmissionId']
                .item()
            )
            score_sub = int(
                last_submission_update
                .query(f'SubmissionId == {sub_id}')['UpdatedScore']
                .item()
            )

            history = log_training(data['steps'], winner)
            history['game_id'] = game_id
            history['score_agent'] = score_sub
            training_data.append(history)

            set_game += [data['id']]

        except Exception as err:
            # Best effort: keep processing the other winners/episodes, but
            # say why this one was dropped.
            print(f'Game error, {game_id}, {team_info}: {err!r}\n')

Beginnin generating data...


HBox(children=(FloatProgress(value=0.0, max=3627.0), HTML(value='')))

Too many repeat or not enough round, 9719597, {'EpisodeId': 9719597, 'LiveVideoPath': None, 'TeamNames': ['nagiss', 'Ilya Plemian']}

Too many repeat or not enough round, 9060576, {'EpisodeId': 9060576, 'LiveVideoPath': None, 'TeamNames': ['nagiss', 'Ilya Plemian']}

Too many repeat or not enough round, 11112265, {'EpisodeId': 11112265, 'LiveVideoPath': None, 'TeamNames': ['Ehsan ☃️', 'toshi_k']}

Too many repeat or not enough round, 9074574, {'EpisodeId': 9074574, 'LiveVideoPath': None, 'TeamNames': ['Ilya Plemian', 'nagiss']}

Too many repeat or not enough round, 7958346, {'EpisodeId': 7958346, 'LiveVideoPath': None, 'TeamNames': ['nagiss', 'Ilya Plemian']}

Too many repeat or not enough round, 8697712, {'EpisodeId': 8697712, 'LiveVideoPath': None, 'TeamNames': ['Ilya Plemian', 'nagiss']}

Too many repeat or not enough round, 8881515, {'EpisodeId': 8881515, 'LiveVideoPath': None, 'TeamNames': ['nagiss', 'Ilya Plemian']}

Too many repeat or not enough round, 8703824, {'EpisodeId': 870

In [4]:
# The submission lookup tables are not used by the remaining cells shown
# here; drop them and run a collection pass to release memory.
del episode_agents
del last_submission_update
gc.collect()

23

In [5]:
# Stack the per-episode frames into one table with a fresh 0..n-1 index.
training_data = pd.concat(training_data, axis=0, ignore_index = True)

# Cast every column to its compact storage type in a single pass.
training_data = training_data.astype({
    'round_num': np.int16,
    'n_pulls_self': np.int16,
    'n_success_self': np.int16,
    'n_failure_self': np.int16,
    'discounted_cumulative_success': np.float32,
    'discounted_total_success': np.float32,
    'n_pulls_opp': np.int16,
    'n_pulls_tot': np.int16,
    'ratio_self': np.float32,
    'ratio_opp': np.float32,
    'est_1': np.float32,
    'est_2': np.float32,
    'est_3': np.float32,
    'repeat_opp': np.int16,
    'n_pulls_self_last_10': np.int8,
    'n_pulls_opp_last_10': np.int8,
    'ratio_self_last_10_selected': np.float32,
    'payout': np.float32,
    'game_id': np.int32,
    'score_agent': np.int16,
})

In [6]:
# Persist the combined training table to the working directory as a pickle.
training_data.to_pickle('training_data.pkl')