In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session. No category filter is given,
# so deprecation notices raised by the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

import socceraction

from statsbombpy import sb

In [2]:
from socceraction.data.statsbomb import StatsBombLoader

# Raw string: the Windows path contains backslashes followed by letters (\F, \D, \E, \S, \d),
# which are invalid escape sequences in a normal string literal. The resulting value is unchanged.
path = r'D:\Football Data Science\Data\Event\Statsbomb\data'

# Read the StatsBomb data from the local copy rooted at `path`.
api = StatsBombLoader(getter="local", root=path)

df_games = api.games(competition_id=11, season_id=27) # La Liga, 2015/2016 season

df_games.head(3)

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee
0,3825848,27,11,Regular Season,5,2015-09-23 20:00:00,221,322,2,2,Estadio Ciudad de Valencia,
1,3825895,27,11,Regular Season,5,2015-09-23 22:00:00,208,213,2,0,Estadio de Gran Canaria,Carlos del Cerro Grande
2,3825894,27,11,Regular Season,36,2016-05-01 18:15:00,219,216,0,2,Estadio Abanca-Riazor,Carlos Clos Gómez


In [3]:
def _spread_label_backwards(same_team_labels, other_team_labels, team_ids, pos, value, window):
    """Copy `value` onto the unlabelled actions that directly precede position `pos`.

    At most `window - 1` preceding actions are visited. The walk stops early at the
    start of the game or at the first action that already carries a label (>= 1e-8)
    in either array. Actions by the same team as the action at `pos` are written to
    `same_team_labels`, all others to `other_team_labels`. Both arrays are modified in place.
    """
    label_eps = 1e-8  # anything below this counts as "no label yet"
    anchor_team = team_ids[pos]
    for prev in range(pos - 1, max(pos - window, -1), -1):
        if same_team_labels[prev] >= label_eps or other_team_labels[prev] >= label_eps:
            break  # this position already has a label
        if team_ids[prev] == anchor_team:
            same_team_labels[prev] = value
        else:
            other_team_labels[prev] = value


def new_label(df_events, df_atomic_actions, k=10, m=10):
    """Build xG-based `scores` / `concedes` labels for the atomic actions of one game.

    Labelling scheme:
      * Scoring: the typical atomic-SPADL sequence is shot (label = xG) -> goal (label = 1).
        The goal itself needs no back-propagation because the action before it is the shot.
      * Conceding: the typical sequence is bad_touch (label = 0) -> owngoal (label = 1).
      * `goal` rows get scores = 1 and `owngoal` rows get concedes = 1 first.
      * Each shot with a known xG then gets scores = xG, and the xG is propagated to the
        up-to-(k-1) preceding actions: scores = xG for actions of the shooting team,
        concedes = xG for actions of the other team.
      * Each own goal is propagated to the up-to-(m-1) preceding actions: concedes = 1 for
        actions of the team that put the ball in its own net, scores = 1 for the other team.
      * Propagation stops at the first action that already has a label (value >= 1e-8).

    Parameters
    ----------
    df_events : pandas.DataFrame
        Raw events with columns `type_name`, `event_id` and `extra`; for rows whose
        `type_name` is 'Shot', `extra['shot']['statsbomb_xg']` is read as the xG.
    df_atomic_actions : pandas.DataFrame
        Atomic actions in playing order with columns `team_id`, `type_name` and
        `original_event_id`. Rows are addressed by position, so any index is accepted.
    k : int
        Window for shots: the shot plus at most k-1 preceding actions are labelled.
    m : int
        Window for own goals: the own goal plus at most m-1 preceding actions are labelled.

    Returns
    -------
    pandas.DataFrame
        Columns `scores` and `concedes`, one row per atomic action, default integer index.
    """
    shot_action_types = {'shot', 'shot_penalty', 'shot_freekick'}

    # xG per StatsBomb shot event, built once and shared by all three shot action types.
    shot_events = df_events[df_events['type_name'] == 'Shot']
    xg_by_event = {
        event_id: extra['shot'].get('statsbomb_xg', np.nan)
        for event_id, extra in zip(shot_events['event_id'], shot_events['extra'])
    }

    # Plain arrays: positional access, independent of the frame's index labels.
    team_ids = df_atomic_actions['team_id'].to_numpy()
    type_names = df_atomic_actions['type_name'].to_numpy()
    event_ids = df_atomic_actions['original_event_id'].to_numpy()
    n_actions = len(df_atomic_actions)

    scores, concedes = np.zeros(n_actions), np.zeros(n_actions)
    scores[(df_atomic_actions['type_name'] == 'goal').to_numpy()] = 1
    owngoal_positions = np.flatnonzero((df_atomic_actions['type_name'] == 'owngoal').to_numpy())
    concedes[owngoal_positions] = 1

    # Shots are handled in playing order; labels set by an earlier shot block later ones.
    for pos in range(n_actions):
        if type_names[pos] not in shot_action_types:
            continue
        xg = xg_by_event.get(event_ids[pos], np.nan)
        if pd.isna(xg):
            continue  # shot action without a matching StatsBomb xG
        scores[pos] = xg
        _spread_label_backwards(scores, concedes, team_ids, pos, xg, k)

    for pos in owngoal_positions:
        if concedes[pos] != 1:
            # Sanity check: the own goal's own label should not have been overwritten.
            print('error')
        else:
            # Same team as the own-goal scorer -> concedes, other team -> scores.
            _spread_label_backwards(concedes, scores, team_ids, pos, 1, m)

    Y = pd.DataFrame({'scores': scores, 'concedes': concedes})
    return Y

In [4]:
import socceraction.spadl as spadl
import socceraction.atomic.spadl as atomicspadl
from atomic_adjusted.spadl.base import convert_to_atomic as convert_to_atomic_adjusted

import socceraction.atomic.vaep.features as fs
import socceraction.atomic.vaep.labels as lab
import socceraction.atomic.vaep.formula as vaepformula
import xgboost

# Feature generators applied to every game (all built-in atomic-VAEP features).
# Action types are one-hot encoded at this stage; per the original note, new action
# types such as `receival` would otherwise turn into NaN.
xfns = [fs.actiontype_onehot, fs.bodypart_onehot, fs.time,
        fs.team, fs.time_delta, fs.location, fs.polar, fs.movement_polar, fs.direction, fs.goalscore]

i = 0
Xs, Ys, infos = [], [], []

# Iterate game id and home team together instead of filtering df_games per game and
# calling int() on a one-element Series (deprecated in recent pandas).
for i, (game_id, home_team_id) in enumerate(zip(df_games['game_id'], df_games['home_team_id']), start=1):
    home_team_id = int(home_team_id)
    df_events = api.events(game_id)

    df_actions = spadl.statsbomb.convert_to_actions(df_events, home_team_id)
    df_atomic_actions = convert_to_atomic_adjusted(df_actions) # if results are poor, consider re-simplifying
    df_atomic_actions = atomicspadl.add_names(df_atomic_actions)
    # NOTE(review): type_id 10 is remapped to 24 after the names were added; the reason
    # is not documented here - confirm against the adjusted atomic converter.
    df_atomic_actions['type_id'] = df_atomic_actions['type_id'].replace(10, 24)

    infos.append(df_atomic_actions)

    # 1. convert actions to game states (three consecutive actions are used as features)
    gamestates = fs.gamestates(df_atomic_actions, 3)
    gamestates = fs.play_left_to_right(gamestates, home_team_id)
    # gamestates[0], gamestates[1], gamestates[2] are: df_actions, df_actions(lag1), df_actions(lag2)

    # 2. compute features
    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
    Xs.append(X)

    # 3. compute labels
    Y = new_label(df_events, df_atomic_actions)
    Ys.append(Y)

    # progress: print the running game count every 10 games
    if i % 10 == 0: print(i, end=' ')

X = pd.concat(Xs, ignore_index=True)
Y = pd.concat(Ys, ignore_index=True)
info = pd.concat(infos, ignore_index=True)

X.shape, Y.shape, info.shape

10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 

((1126800, 148), (1126800, 2), (1126800, 15))

In [5]:
X

Unnamed: 0,actiontype_pass_a0,actiontype_cross_a0,actiontype_throw_in_a0,actiontype_freekick_crossed_a0,actiontype_freekick_short_a0,actiontype_corner_crossed_a0,actiontype_corner_short_a0,actiontype_take_on_a0,actiontype_foul_a0,actiontype_tackle_a0,...,mov_angle_a2,dx_a0,dy_a0,dx_a1,dy_a1,dx_a2,dy_a2,goalscore_team,goalscore_opponent,goalscore_diff
0,True,False,False,False,False,False,False,False,False,False,...,-1.414803,0.155362,-0.987858,0.155362,-0.987858,0.155362,-0.987858,0,0,0
1,False,False,False,False,False,False,False,False,False,False,...,-1.414803,-0.000000,-0.000000,0.155362,-0.987858,0.155362,-0.987858,0,0,0
2,False,False,False,False,False,False,False,False,False,False,...,-1.414803,-0.152603,0.988288,-0.000000,-0.000000,0.155362,-0.987858,0,0,0
3,True,False,False,False,False,False,False,False,False,False,...,0.000000,-0.547972,0.836496,-0.152603,0.988288,-0.000000,-0.000000,0,0,0
4,False,False,False,False,False,False,False,False,False,False,...,1.723998,-0.000000,-0.000000,-0.547972,0.836496,-0.152603,0.988288,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126795,False,False,False,False,False,False,False,False,False,False,...,0.000000,0.000000,0.000000,-0.999918,-0.012781,0.000000,0.000000,1,2,-1
1126796,True,False,False,False,False,False,False,False,False,False,...,-3.128811,-0.162184,0.986761,0.000000,0.000000,-0.999918,-0.012781,1,2,-1
1126797,False,False,False,False,False,False,False,False,False,False,...,0.000000,0.000000,0.000000,-0.162184,0.986761,0.000000,0.000000,1,2,-1
1126798,False,False,False,False,False,False,False,False,False,False,...,1.733700,1.000000,0.000000,0.000000,0.000000,-0.162184,0.986761,1,2,-1


### 对于actiontype的处理

In [6]:
# One-hot columns describing the type of the current action (a0).
# The label slice relies on the column order produced by the feature step.
actiontype_onehot_a0 = X.loc[:, 'actiontype_pass_a0':'actiontype_freekick_a0']

# Check that the one-hot encoding is unique: exactly one type per action
if (actiontype_onehot_a0.sum(axis=1) != 1).sum() > 0: print('error')

# Count how often each action type occurs
actions_cnt = actiontype_onehot_a0.sum()

# Types that are expected never to occur; they are checked and left out of the summary
to_delete_type = ['actiontype_keeper_pick_up_a0', 'actiontype_non_action_a0', 'actiontype_corner_a0', 'actiontype_freekick_a0']
if actions_cnt[to_delete_type].sum() > 0: print('error')
actions_cnt = actions_cnt[~actions_cnt.index.isin(to_delete_type)]

# Cut the fixed prefix/suffix by length. str.lstrip/str.rstrip take a *set of characters*,
# not a prefix/suffix, and would eat into any type name that starts or ends with those characters.
actiontype_prefix, a0_suffix = 'actiontype_', '_a0'
index_mapping_dict = {col: col[len(actiontype_prefix):-len(a0_suffix)] for col in actions_cnt.index.tolist()}
actions_cnt.index = actions_cnt.index.map(index_mapping_dict)
actions_cnt = actions_cnt.sort_values(ascending=False)

actions_cnt

pass                314978
dribble             299905
receival            285457
interception         77160
throw_in             17389
clearance            16217
out                  15473
tackle               15052
take_on              14550
foul                 12136
bad_touch            10451
cross                 8677
shot                  8662
goalkick              5974
freekick_short        5790
freekick_crossed      5698
corner_crossed        2897
keeper_save           2195
keeper_claim          1741
offside               1705
yellow_card           1695
goal                  1014
corner_short           872
keeper_punch           550
shot_freekick          409
shot_penalty            97
owngoal                 29
red_card                27
dtype: int64

In [7]:
# Drop the action types that never occur, for the current action and both lags
never_seen_types = ['keeper_pick_up', 'non_action', 'corner', 'freekick']
state_suffixes = ['a0', 'a1', 'a2']
to_delete_type = ['actiontype_' + type_name + '_' + suffix
                  for suffix in state_suffixes for type_name in never_seen_types]
X = X.drop(columns=to_delete_type)

# Collapse each one-hot group back into a single categorical column.
# The name is cut out by length rather than with str.lstrip/str.rstrip, which strip
# character sets and only worked here because underscores delimit the name.
for suffix in state_suffixes:
    first_col = 'actiontype_pass_' + suffix
    last_col = 'actiontype_red_card_' + suffix
    hot_column = X.loc[:, first_col:last_col].idxmax(axis=1)
    X['actiontype_' + suffix] = hot_column.str[len('actiontype_'):-len('_' + suffix)]

# Keep everything from the body-part columns onwards, i.e. discard the action-type one-hot columns
X = X.loc[:, 'bodypart_foot_a0':]

X

Unnamed: 0,bodypart_foot_a0,bodypart_head_a0,bodypart_other_a0,bodypart_head/other_a0,bodypart_foot_a1,bodypart_head_a1,bodypart_other_a1,bodypart_head/other_a1,bodypart_foot_a2,bodypart_head_a2,...,dx_a1,dy_a1,dx_a2,dy_a2,goalscore_team,goalscore_opponent,goalscore_diff,actiontype_a0,actiontype_a1,actiontype_a2
0,True,False,False,False,True,False,False,False,True,False,...,0.155362,-0.987858,0.155362,-0.987858,0,0,0,pass,pass,pass
1,True,False,False,False,True,False,False,False,True,False,...,0.155362,-0.987858,0.155362,-0.987858,0,0,0,receival,pass,pass
2,True,False,False,False,True,False,False,False,True,False,...,-0.000000,-0.000000,0.155362,-0.987858,0,0,0,dribble,receival,pass
3,True,False,False,False,True,False,False,False,True,False,...,-0.152603,0.988288,-0.000000,-0.000000,0,0,0,pass,dribble,receival
4,True,False,False,False,True,False,False,False,True,False,...,-0.547972,0.836496,-0.152603,0.988288,0,0,0,receival,pass,dribble
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126795,True,False,False,False,True,False,False,False,True,False,...,-0.999918,-0.012781,0.000000,0.000000,1,2,-1,interception,goalkick,out
1126796,True,False,False,False,True,False,False,False,True,False,...,0.000000,0.000000,-0.999918,-0.012781,1,2,-1,pass,interception,goalkick
1126797,True,False,False,False,True,False,False,False,True,False,...,-0.162184,0.986761,0.000000,0.000000,1,2,-1,receival,pass,interception
1126798,True,False,False,False,True,False,False,False,True,False,...,0.000000,0.000000,-0.162184,0.986761,1,2,-1,dribble,receival,pass


### 对于bodypart的处理

In [8]:
# Check the body-part one-hot encoding: head/other must equal head + other (checked for a0)
if (X['bodypart_head_a0'] + X['bodypart_other_a0'] != X['bodypart_head/other_a0']).sum() > 0: print('error')
X = X.drop(columns=['bodypart_head/other_a0', 'bodypart_head/other_a1', 'bodypart_head/other_a2']) # drop the redundant head/other category

# After the drop exactly one body part must be set per action
if (X.loc[:, 'bodypart_foot_a0':'bodypart_other_a0'].sum(axis=1) != 1).sum() > 0: print('error')

# Collapse each one-hot group back into a single categorical column.
# The name is cut out by length rather than with str.lstrip/str.rstrip, which strip
# character sets and only worked here because underscores delimit the name.
for suffix in ['a0', 'a1', 'a2']:
    first_col = 'bodypart_foot_' + suffix
    last_col = 'bodypart_other_' + suffix
    hot_column = X.loc[:, first_col:last_col].idxmax(axis=1)
    X['bodypart_' + suffix] = hot_column.str[len('bodypart_'):-len('_' + suffix)]

# Keep everything from the period columns onwards, i.e. discard the body-part one-hot columns
X = X.loc[:, 'period_id_a0':]

X

Unnamed: 0,period_id_a0,time_seconds_a0,time_seconds_overall_a0,period_id_a1,time_seconds_a1,time_seconds_overall_a1,period_id_a2,time_seconds_a2,time_seconds_overall_a2,team_1,...,dy_a2,goalscore_team,goalscore_opponent,goalscore_diff,actiontype_a0,actiontype_a1,actiontype_a2,bodypart_a0,bodypart_a1,bodypart_a2
0,1,0.5000,0.5000,1.0,0.5000,0.5000,1.0,0.5000,0.5000,True,...,-0.987858,0,0,0,pass,pass,pass,foot,foot,foot
1,1,0.6215,0.6215,1.0,0.5000,0.5000,1.0,0.5000,0.5000,True,...,-0.987858,0,0,0,receival,pass,pass,foot,foot,foot
2,1,0.7430,0.7430,1.0,0.6215,0.6215,1.0,0.5000,0.5000,True,...,-0.987858,0,0,0,dribble,receival,pass,foot,foot,foot
3,1,0.9860,0.9860,1.0,0.7430,0.7430,1.0,0.6215,0.6215,True,...,-0.000000,0,0,0,pass,dribble,receival,foot,foot,foot
4,1,1.7590,1.7590,1.0,0.9860,0.9860,1.0,0.7430,0.7430,True,...,0.988288,0,0,0,receival,pass,dribble,foot,foot,foot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126795,2,2817.0560,5517.0560,2.0,2815.7490,5515.7490,2.0,2796.5470,5496.5470,False,...,0.000000,1,2,-1,interception,goalkick,out,foot,foot,foot
1126796,2,2818.3630,5518.3630,2.0,2817.0560,5517.0560,2.0,2815.7490,5515.7490,True,...,-0.012781,1,2,-1,pass,interception,goalkick,foot,foot,foot
1126797,2,2819.0760,5519.0760,2.0,2818.3630,5518.3630,2.0,2817.0560,5517.0560,True,...,0.000000,1,2,-1,receival,pass,interception,foot,foot,foot
1126798,2,2819.7890,5519.7890,2.0,2819.0760,5519.0760,2.0,2818.3630,5518.3630,True,...,0.986761,1,2,-1,dribble,receival,pass,foot,foot,foot


### 对于period的额外处理

In [9]:
# Cast the period id of the current action and of both lagged actions to int
for period_col in ('period_id_a0', 'period_id_a1', 'period_id_a2'):
    X[period_col] = X[period_col].astype(int)

X

Unnamed: 0,period_id_a0,time_seconds_a0,time_seconds_overall_a0,period_id_a1,time_seconds_a1,time_seconds_overall_a1,period_id_a2,time_seconds_a2,time_seconds_overall_a2,team_1,...,dy_a2,goalscore_team,goalscore_opponent,goalscore_diff,actiontype_a0,actiontype_a1,actiontype_a2,bodypart_a0,bodypart_a1,bodypart_a2
0,1,0.5000,0.5000,1,0.5000,0.5000,1,0.5000,0.5000,True,...,-0.987858,0,0,0,pass,pass,pass,foot,foot,foot
1,1,0.6215,0.6215,1,0.5000,0.5000,1,0.5000,0.5000,True,...,-0.987858,0,0,0,receival,pass,pass,foot,foot,foot
2,1,0.7430,0.7430,1,0.6215,0.6215,1,0.5000,0.5000,True,...,-0.987858,0,0,0,dribble,receival,pass,foot,foot,foot
3,1,0.9860,0.9860,1,0.7430,0.7430,1,0.6215,0.6215,True,...,-0.000000,0,0,0,pass,dribble,receival,foot,foot,foot
4,1,1.7590,1.7590,1,0.9860,0.9860,1,0.7430,0.7430,True,...,0.988288,0,0,0,receival,pass,dribble,foot,foot,foot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126795,2,2817.0560,5517.0560,2,2815.7490,5515.7490,2,2796.5470,5496.5470,False,...,0.000000,1,2,-1,interception,goalkick,out,foot,foot,foot
1126796,2,2818.3630,5518.3630,2,2817.0560,5517.0560,2,2815.7490,5515.7490,True,...,-0.012781,1,2,-1,pass,interception,goalkick,foot,foot,foot
1126797,2,2819.0760,5519.0760,2,2818.3630,5518.3630,2,2817.0560,5517.0560,True,...,0.000000,1,2,-1,receival,pass,interception,foot,foot,foot
1126798,2,2819.7890,5519.7890,2,2819.0760,5519.0760,2,2818.3630,5518.3630,True,...,0.986761,1,2,-1,dribble,receival,pass,foot,foot,foot


### 检查所有变量

In [10]:
# Show the names at the hard-coded positions 0, 3 and 6 (expected: the period id columns).
# The positions depend on the column order built by the cells above.
X.columns[[0,3,6]] # category (int:1/2)

Index(['period_id_a0', 'period_id_a1', 'period_id_a2'], dtype='object')

In [11]:
# Show the names at the hard-coded positions 9 and 10 (expected: the two team flag columns).
X.columns[[9,10]] # category (bool)

Index(['team_1', 'team_2'], dtype='object')

In [12]:
# Show the names of the columns treated as floats: the time columns at the listed
# positions plus the contiguous range 11..36.
X.columns[[1,2,4,5,7,8]], X.columns[11:37] # float

(Index(['time_seconds_a0', 'time_seconds_overall_a0', 'time_seconds_a1',
        'time_seconds_overall_a1', 'time_seconds_a2',
        'time_seconds_overall_a2'],
       dtype='object'),
 Index(['time_delta_1', 'time_delta_2', 'x_a0', 'y_a0', 'x_a1', 'y_a1', 'x_a2',
        'y_a2', 'dist_to_goal_a0', 'angle_to_goal_a0', 'dist_to_goal_a1',
        'angle_to_goal_a1', 'dist_to_goal_a2', 'angle_to_goal_a2', 'mov_d_a0',
        'mov_angle_a0', 'mov_d_a1', 'mov_angle_a1', 'mov_d_a2', 'mov_angle_a2',
        'dx_a0', 'dy_a0', 'dx_a1', 'dy_a1', 'dx_a2', 'dy_a2'],
       dtype='object'))

In [13]:
# Show the names at positions 37..39 (expected: the goal-score columns).
X.columns[37:40] # int

Index(['goalscore_team', 'goalscore_opponent', 'goalscore_diff'], dtype='object')

In [14]:
# Show the names from position 40 onwards: the categorical action-type and body-part
# columns appended by the cells above.
X.columns[40:] # category

Index(['actiontype_a0', 'actiontype_a1', 'actiontype_a2', 'bodypart_a0',
       'bodypart_a1', 'bodypart_a2'],
      dtype='object')

In [15]:
# Write features, labels and the per-action info frame to CSV (the index is written too)
csv_exports = [
    (X, 'data/X_1516_LaLiga.csv'),
    (Y, 'data/Y_1516_LaLiga.csv'),
    (info, 'data/info_1516_LaLiga.csv'),
]
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path)