# This is the fourth step in project.

Motivaton: Feature engineering from events data for improvised model training.

Final output: one pickle file representing final dataframe used for improvised model.

In [1]:
# import required libraries
import os
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

In [2]:
# file paths
ROOT = os.path.join(os.getcwd(), 'Statsbomb_data\open-data-master')
DATA = os.path.join(ROOT, 'data')
FINAL_DF = os.path.join(ROOT,'final_dataframes')

In [3]:
''' Importing events data in a dataframe.

Variables:-
FINAL_DF - string - file path to all the pickle files
'''
events_df = pd.read_pickle(
    filepath_or_buffer=os.path.join(FINAL_DF,'events.pkl')
)

# Feature Engineering: Strong Foot

In [4]:
''' creating another dataframe from events data. 

Variables:-
events_df - DataFrame - events data
'''
player_df = events_df[['id','player','shot']]

In [5]:
''' Creating additional columns in events data.

Variables:-
events_df - DataFrame - events data
'''
events_df['player_id'] = events_df['player'].str['id']
events_df['player_name'] = events_df['player'].str['name']
events_df['outcome'] = events_df['shot'].str['outcome'].str['name']
events_df['body_part'] = events_df['shot'].str['body_part'].str['name']

In [6]:
''' creating a seperate dataframe including the new feature to keep events dataframe intact.
    In this new dataframe, I am including the shots that were converted to a goal only.

Variables:-
events_df - DataFrame - events data
'''
only_goal_df = events_df.loc[events_df['outcome']=='Goal']

In [7]:
''' Grouping the dataframe by player names. These groups will help me find out the strong foot for each player.

Variables:-
only_goal_df - DataFrame - events having the goals only
'''
player_groups = only_goal_df.groupby(
    by='player_name'
)

In [8]:
''' Finding the mode of body_part column to find out preferred part for each player.

Variables:-
player_groups - Pandas Groupby object - grouped dataframe by player
'''
strong_foot_mapping = player_groups.agg({'body_part':pd.Series.mode})

In [9]:
''' Some players have same goals scored from more than 1 body part. We need to remove those players to remove misleading information. 

Variables:-
strong_foot_mapping - Series - mapping of player and its strong foot
'''
filtered_players = strong_foot_mapping.loc[strong_foot_mapping.body_part.str[0].str.len()==1]

In [10]:
''' Now I have the strong foot mapping. I will create a dataframe joining the strong foot column with other columns of event data. 

Variables:-
events_df - DataFrame - events data
filtered_players - Series - strong foot column
'''

strong_foot_df=events_df.join(
    other=filtered_players,
    on='player_name',
    how='left',
    rsuffix='_strong'
)

In [11]:
''' The strong_foot column has null values for the players having ambiguous data.

Variables:-
strong_foot_df - DataFrame - dataframe with strong foot feature
'''
strong_foot_df.body_part_strong.unique()

array([nan, 'Left Foot', 'Right Foot', 'Head', 'Other'], dtype=object)

# Feature Engineering: Within Penalty Area (WPA)

A shot taken withing penalty area gives high xG value as it is easy to score the goal from a place closer to the goal post.

In [12]:
''' Creating a numpy array from events data, filtered on the location of shot.
    If the shot is taken withing the penalty box, the value is True otherwise False.
    The coordinate information is provided by statsbomb and mentioned in the Appendix II of the report.
    
    Variables:-
    strong_foot_df - DataFrame - dataframe with strong foot feature    
'''
wpa = np.where(
    (strong_foot_df['location'].str[0]>=114)&(strong_foot_df['location'].str[0]<=120) &
    (strong_foot_df['location'].str[1]>=30)&(strong_foot_df['location'].str[1]<=50),
    True,False
)

In [13]:
''' creating a dataframe with within_penalty_area feature and all events columns

Variables:-
strong_foot_df - DataFrame - dataframe with strong foot feature
'''
wpa_df = strong_foot_df
wpa_df['wpa'] = wpa
del strong_foot_df

In [14]:
wpa_df.head()

Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,...,match_id,under_pressure,out,off_camera,player_id,player_name,outcome,body_part,body_part_strong,wpa
0,682270cc-4bc4-4952-8f91-d3c5a704a691,153,1,00:02:33.865,2,33,"{'id': 16, 'name': 'Shot'}",7,"{'id': 24, 'name': 'Liverpool'}","{'id': 4, 'name': 'From Throw In'}",...,18245,,,,3567,Georginio Wijnaldum,Blocked,Right Foot,,False
1,9f5aa3eb-3bed-4bc0-97a5-bb8444b235b9,245,1,00:05:14.065,5,14,"{'id': 16, 'name': 'Shot'}",13,"{'id': 24, 'name': 'Liverpool'}","{'id': 3, 'name': 'From Free Kick'}",...,18245,True,,,3531,Mohamed Salah,Blocked,Left Foot,Left Foot,False
2,399ac143-5f7b-4080-8c0b-3c18435d7fc1,398,1,00:07:56.816,7,56,"{'id': 16, 'name': 'Shot'}",19,"{'id': 24, 'name': 'Liverpool'}","{'id': 1, 'name': 'Regular Play'}",...,18245,,,,3535,Roberto Firmino Barbosa de Oliveira,Off T,Head,,False
3,660d9d98-46b6-4b5e-9c9a-435d63142c93,467,1,00:10:02.932,10,2,"{'id': 16, 'name': 'Shot'}",24,"{'id': 220, 'name': 'Real Madrid'}","{'id': 3, 'name': 'From Free Kick'}",...,18245,,,,5552,Marcelo Vieira da Silva Júnior,Off T,Left Foot,Left Foot,False
4,fe6c7f60-2ff0-4077-882e-b045c8abc7c3,628,1,00:13:02.999,13,2,"{'id': 16, 'name': 'Shot'}",28,"{'id': 24, 'name': 'Liverpool'}","{'id': 3, 'name': 'From Free Kick'}",...,18245,,,,3473,James Philip Milner,Blocked,Left Foot,,False


# Feature Engineering: Distance from Goal (DFG)

The distance of shot-taking player from the goal is a straightforward piece of information for describing a good chance. Long distance is difficult to get converted to a goal.

In [15]:
''' Calculating distance from goal.
    The distance is calculated by fetching shot's location and goal post coordinates provided by statsbomb.
    The locations are in the form of (x,y) coordinates
    The formula used for distance is underroot(squre(x1-x2) + square(y1-y2))
'''
shot_location = wpa_df['location'].to_numpy() # Fetch location
shot_location = np.vstack(shot_location) # Create a 2d numpy array of x and y coordinates
distance_from_goal = np.sqrt(np.square(shot_location[:,0]-120)+np.square(shot_location[:,1]-40)) # calculate distance of all coordinates

In [16]:
''' Normalizing the distances between 0 and 1 using min-max normalization.

Variables:-
distance_from_goal - Numpy array - distance from goal values of all shot events
'''
normalized_dfg = (distance_from_goal-np.min(distance_from_goal))/(np.max(distance_from_goal)-np.min(distance_from_goal))

In [22]:
''' Storing distance values in a dataframe with other events columns

Variables:-
wpa_df - DataFrame - within penalty area dataframe
normalized_dfg - Numpy array - normalized distances
'''
dfg_df = wpa_df
dfg_df['dfg'] = normalized_dfg
del wpa_df


In [23]:
dfg_df.head(1)

Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,...,under_pressure,out,off_camera,player_id,player_name,outcome,body_part,body_part_strong,wpa,dfg
0,682270cc-4bc4-4952-8f91-d3c5a704a691,153,1,00:02:33.865,2,33,"{'id': 16, 'name': 'Shot'}",7,"{'id': 24, 'name': 'Liverpool'}","{'id': 4, 'name': 'From Throw In'}",...,,,,3567,Georginio Wijnaldum,Blocked,Right Foot,,False,0.303792


# Feature Engineering: Shot Angle

Shot angles determine the complexity of the shot taker to score the goal. Tight angles require clever shooting technique and hence provides a lower xG value.

In [20]:
''' Storing goal post coordinates. '''
first_post = [120, 36]
second_post = [120, 44]

In [24]:
''' Calculating angle between two lines.
    The first line connects shot location with first goal post.
    The second line connects shot location with second goal post.
    The angle is calculated using vector dot product method. Referred from https://stackoverflow.com/questions/58953047/issue-with-finding-angle-between-3-points-in-python
    
    Variables:-
    first_post - list - coordinates of first goal post
    second_post - list - coordinates of second goal post
'''
# Calculating components of vector dot product calculation
v21_0 = first_post[0]-shot_location[:,0]
v21_1 = first_post[1]-shot_location[:,1]
v23_0 = second_post[0]-shot_location[:,0]
v23_1 = second_post[1]-shot_location[:,1]

dot = (v21_0*v23_0) + (v21_1*v23_1)
det = (v21_0*v23_1) - (v21_1*v23_0)

# arc2tan numpy functions is used to calculate the angle in radians, followed by a conversion to degrees by numpy's rad2deg
angle = np.rad2deg(np.arctan2(det, dot))
# normalizing degree angles between 0 and 1
normalized_angle = (angle-np.min(angle))/(np.max(angle)-np.min(angle))

In [27]:
''' Storing angle column with other events columns.

Variables:-
dfg_df - DataFrame - distance from goal dataframe
normalized_angle - numpy array - normalized angles
'''
angle_df = dfg_df
angle_df['angle'] = normalized_angle
del dfg_df

In [28]:
angle_df.head(1)

Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,...,out,off_camera,player_id,player_name,outcome,body_part,body_part_strong,wpa,dfg,angle
0,682270cc-4bc4-4952-8f91-d3c5a704a691,153,1,00:02:33.865,2,33,"{'id': 16, 'name': 'Shot'}",7,"{'id': 24, 'name': 'Liverpool'}","{'id': 4, 'name': 'From Throw In'}",...,,,3567,Georginio Wijnaldum,Blocked,Right Foot,,False,0.303792,0.088568


# Feature Engineering: GK Distance from Center of the Goal (GK_Distance)

Goal keeper's position during a shot-taking event can say a lot about how easy or difficult it will be to save the shot, indirectly indicating the chance of goal.

In [30]:
angle_df.columns

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'player',
       'position', 'location', 'duration', 'related_events', 'shot',
       'match_id', 'under_pressure', 'out', 'off_camera', 'player_id',
       'player_name', 'outcome', 'body_part', 'body_part_strong', 'wpa', 'dfg',
       'angle'],
      dtype='object')

In [32]:
''' Creating a freeze frame dataframe.
    Freeze frame is a dictionary of players involved while taking the shot and their locations in (x,y) coordinates.
    The players can be shot-taker, teammates, opposition players and goalkeeper.
    
    Variables:-
    angle_df - DataFrame - dataframe with angle and other columns
'''
angle_df['freeze_frame'] = angle_df['shot'].str['freeze_frame']
freeze_frame_df = angle_df[['id','freeze_frame']].explode(
    column='freeze_frame'
) # explode function explodes the dataframe using a nested stored in any column. 
# Here, the freeze frame has many dictionary rows. Each dictionary row will then be put in a seperate dataframe row.

# id of an involved players position. It depends on attacker, defender, midfielder or goalkeeper
freeze_frame_df['position_id'] = freeze_frame_df['freeze_frame'].str['position'].str['id'] 
# position name of an involved player. Whether the player is an LB or winger, etc.
freeze_frame_df['position_name'] = freeze_frame_df['freeze_frame'].str['position'].str['name'] 
# The location of involved player at the time of shot. Given in (x,y) coordinates
freeze_frame_df['position_location'] = freeze_frame_df['freeze_frame'].str['location'] 
# Filtering the dataframe to see the position of goalkeeper only, since we are calculating a goalkeeper's location from goal post.
freeze_frame_df = freeze_frame_df.loc[freeze_frame_df['position_id']==1] 

In [33]:
freeze_frame_df.head()

Unnamed: 0,id,freeze_frame,position_id,position_name,position_location
0,682270cc-4bc4-4952-8f91-d3c5a704a691,"{'location': [115.7, 42.7], 'player': {'id': 5...",1.0,Goalkeeper,"[115.7, 42.7]"
1,9f5aa3eb-3bed-4bc0-97a5-bb8444b235b9,"{'location': [117.5, 41.6], 'player': {'id': 5...",1.0,Goalkeeper,"[117.5, 41.6]"
2,399ac143-5f7b-4080-8c0b-3c18435d7fc1,"{'location': [118.3, 37.0], 'player': {'id': 5...",1.0,Goalkeeper,"[118.3, 37.0]"
3,660d9d98-46b6-4b5e-9c9a-435d63142c93,"{'location': [117.7, 39.1], 'player': {'id': 3...",1.0,Goalkeeper,"[117.7, 39.1]"
4,fe6c7f60-2ff0-4077-882e-b045c8abc7c3,"{'location': [118.4, 37.3], 'player': {'id': 5...",1.0,Goalkeeper,"[118.4, 37.3]"


In [34]:
''' Calculating the goal keeper distance using the same methodology as of distance from goal 

Variables:-
freeze_frame_df - DataFrame - dataframe of freeze frames
'''
gk_location = freeze_frame_df['position_location'].to_numpy()
gk_location = np.vstack(gk_location)
gk_distance_from_goal = np.sqrt(np.square(gk_location[:,0]-120)+np.square(gk_location[:,1]-40))
normalized_gk_distance_from_goal = (gk_distance_from_goal-np.min(gk_distance_from_goal))/(np.max(gk_distance_from_goal)-np.min(gk_distance_from_goal))

In [36]:
print("Sample distance values:")
freeze_frame_df['gk_distance'] = normalized_gk_distance_from_goal
freeze_frame_df.head()

Sample distance values:


Unnamed: 0,id,freeze_frame,position_id,position_name,position_location,gk_distance
0,682270cc-4bc4-4952-8f91-d3c5a704a691,"{'location': [115.7, 42.7], 'player': {'id': 5...",1.0,Goalkeeper,"[115.7, 42.7]",0.054478
1,9f5aa3eb-3bed-4bc0-97a5-bb8444b235b9,"{'location': [117.5, 41.6], 'player': {'id': 5...",1.0,Goalkeeper,"[117.5, 41.6]",0.031847
2,399ac143-5f7b-4080-8c0b-3c18435d7fc1,"{'location': [118.3, 37.0], 'player': {'id': 5...",1.0,Goalkeeper,"[118.3, 37.0]",0.036997
3,660d9d98-46b6-4b5e-9c9a-435d63142c93,"{'location': [117.7, 39.1], 'player': {'id': 3...",1.0,Goalkeeper,"[117.7, 39.1]",0.0265
4,fe6c7f60-2ff0-4077-882e-b045c8abc7c3,"{'location': [118.4, 37.3], 'player': {'id': 5...",1.0,Goalkeeper,"[118.4, 37.3]",0.033674


In [39]:
''' joining angle df and goalkeeper distance to combine all features. 

Variables:-
angle_df - DataFrame - angle and other features
freeze_frame_df - DataFrame - dataframe of freeze frames
'''
gk_distance_df = angle_df.set_index('id').join(
    other=freeze_frame_df[['id','gk_distance']].set_index('id'),
    on='id',
    how='left'
)
del angle_df


In [40]:
gk_distance_df.head(1)

Unnamed: 0_level_0,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,team,...,player_id,player_name,outcome,body_part,body_part_strong,wpa,dfg,angle,freeze_frame,gk_distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
682270cc-4bc4-4952-8f91-d3c5a704a691,153,1,00:02:33.865,2,33,"{'id': 16, 'name': 'Shot'}",7,"{'id': 24, 'name': 'Liverpool'}","{'id': 4, 'name': 'From Throw In'}","{'id': 24, 'name': 'Liverpool'}",...,3567,Georginio Wijnaldum,Blocked,Right Foot,,False,0.303792,0.088568,"[{'location': [98.0, 48.4], 'player': {'id': 5...",0.054478


# Feature Engineering: Player positional sub-group

xG depends heavily on the position in which a player usually plays. Attackers get more chance to score the goal than midfielder and defenders.

In [41]:
''' Assiging all player positions to one of the three values:
    
    1. Attack: Players including strikers and wingers
    2. Midfield: Players including left, right, mid, attack and central midfielders
    3. Defense: Players including left, right, center backs
'''
position = {
'Left Midfield': 'Midfield',
'Left Center Forward': 'Attack',
'Left Back': 'Defense',
'Right Back': 'Defense',
'Right Center Forward': 'Attack',
'Left Center Back': 'Defense',
'Right Center Back': 'Defense',
'Right Midfield': 'Midfield',
'Right Defensive Midfield': 'Midfield',
'Left Center Midfield': 'Midfield',
'Right Center Midfield': 'Midfield',
'Left Wing': 'Attack',
'Center Forward': 'Attack',
'Center Attacking Midfield': 'Midfield',
'Center Defensive Midfield': 'Midfield',
'Right Wing': 'Attack',
'Right Wing Back': 'Defense',
'Left Defensive Midfield': 'Midfield',
'Left Wing Back': 'Defense',
'Center Back': 'Defense',
'Secondary Striker': 'Attack',
'Right Attacking Midfield': 'Midfield',
'Left Attacking Midfield': 'Midfield',
'Center Midfield': 'Midfield',
'Goalkeeper': 'Goalkeeper'
}

In [42]:
''' Creating the specific_position column from the mapping created above

Variables:-
gk_distance_df - DataFrame - dataframe of goalkeeper distance and other features
'''
gk_distance_df['specific_position'] = gk_distance_df['position'].str['name'].map(position)

In [44]:
print("All newly created features")
gk_distance_df[['body_part_strong','wpa','dfg','angle','gk_distance','specific_position']].head()

All newly created features


Unnamed: 0_level_0,body_part_strong,wpa,dfg,angle,gk_distance,specific_position
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
682270cc-4bc4-4952-8f91-d3c5a704a691,,False,0.303792,0.088568,0.054478,Midfield
9f5aa3eb-3bed-4bc0-97a5-bb8444b235b9,Left Foot,False,0.316782,0.087038,0.031847,Attack
399ac143-5f7b-4080-8c0b-3c18435d7fc1,,False,0.146574,0.139148,0.036997,Attack
660d9d98-46b6-4b5e-9c9a-435d63142c93,Left Foot,False,0.348801,0.07846,0.0265,Defense
fe6c7f60-2ff0-4077-882e-b045c8abc7c3,,False,0.254419,0.107632,0.033674,Midfield


# Saving final dataframe to pickle

In [45]:
pd.to_pickle(
    obj=gk_distance_df,
    filepath_or_buffer=os.path.join(FINAL_DF,'engineered_features.pkl')
)