# This is the second step in the project.

Motivation: Prepare the data for baseline model training.

Final output: one pickle file containing the final dataframe used for the baseline model.

In [6]:
# import required libraries
import os
import pandas as pd
import numpy as np

In [7]:
# File paths.
# Every directory is passed to os.path.join as its own component so that the
# platform's separator is used. A single 'Statsbomb_data\open-data-master'
# literal only resolves on Windows and depends on '\o' being an unrecognised
# escape sequence, which newer Python versions warn about.
ROOT = os.path.join(os.getcwd(), 'Statsbomb_data', 'open-data-master')
DATA = os.path.join(ROOT, 'data')  # 'data' sub-folder of ROOT
FINAL_DF = os.path.join(ROOT, 'final_dataframes')  # folder containing the pickle files

# Import Events dataframe

I have selected features from the events data to train the baseline model.

In [8]:
''' Read the pickled events data into a dataframe and preview its first rows.

Variables:-
FINAL_DF - string - path to the folder containing all pickle files
events_df - DataFrame - events dataframe read from events.pkl
'''
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
events_df = pd.read_pickle(os.path.join(FINAL_DF, 'events.pkl'))
events_df.head()

Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,...,player,position,location,duration,related_events,shot,match_id,under_pressure,out,off_camera
0,682270cc-4bc4-4952-8f91-d3c5a704a691,153,1,00:02:33.865,2,33,"{'id': 16, 'name': 'Shot'}",7,"{'id': 24, 'name': 'Liverpool'}","{'id': 4, 'name': 'From Throw In'}",...,"{'id': 3567, 'name': 'Georginio Wijnaldum'}","{'id': 13, 'name': 'Right Center Midfield'}","[98.1, 52.1]",0.134619,"[5df2374f-5956-4ae5-a855-769c3d9bd8e9, 91c0ba0...","{'statsbomb_xg': 0.03639871, 'end_location': [...",18245,,,
1,9f5aa3eb-3bed-4bc0-97a5-bb8444b235b9,245,1,00:05:14.065,5,14,"{'id': 16, 'name': 'Shot'}",13,"{'id': 24, 'name': 'Liverpool'}","{'id': 3, 'name': 'From Free Kick'}",...,"{'id': 3531, 'name': 'Mohamed Salah'}","{'id': 17, 'name': 'Right Wing'}","[96.6, 51.5]",0.092997,"[0801d693-6cd2-4565-b9f7-8e57215326dc, 4cc5f82...","{'statsbomb_xg': 0.027975515, 'end_location': ...",18245,True,,
2,399ac143-5f7b-4080-8c0b-3c18435d7fc1,398,1,00:07:56.816,7,56,"{'id': 16, 'name': 'Shot'}",19,"{'id': 24, 'name': 'Liverpool'}","{'id': 1, 'name': 'Regular Play'}",...,"{'id': 3535, 'name': 'Roberto Firmino Barbosa ...","{'id': 23, 'name': 'Center Forward'}","[112.1, 30.6]",1.006248,[0ed95d27-7799-4809-94be-14b2af0e8199],"{'statsbomb_xg': 0.07536108, 'end_location': [...",18245,,,
3,660d9d98-46b6-4b5e-9c9a-435d63142c93,467,1,00:10:02.932,10,2,"{'id': 16, 'name': 'Shot'}",24,"{'id': 220, 'name': 'Real Madrid'}","{'id': 3, 'name': 'From Free Kick'}",...,"{'id': 5552, 'name': 'Marcelo Vieira da Silva ...","{'id': 6, 'name': 'Left Back'}","[94.5, 26.9]",1.301567,[9eee50d1-2956-4bd5-9417-4faa3cdcfec6],"{'statsbomb_xg': 0.023375953, 'end_location': ...",18245,,,
4,fe6c7f60-2ff0-4077-882e-b045c8abc7c3,628,1,00:13:02.999,13,2,"{'id': 16, 'name': 'Shot'}",28,"{'id': 24, 'name': 'Liverpool'}","{'id': 3, 'name': 'From Free Kick'}",...,"{'id': 3473, 'name': 'James Philip Milner'}","{'id': 15, 'name': 'Left Center Midfield'}","[101.2, 30.6]",0.054247,"[62488dda-14cf-4155-90c2-a4231a54de32, 70c57f0...","{'statsbomb_xg': 0.0437719, 'end_location': [1...",18245,,,


In [9]:
''' List every column of the events dataframe.

Variables:-
events_df - DataFrame - events dataframe
'''
# Bare last expression so the notebook displays the column Index.
events_df.columns

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'player',
       'position', 'location', 'duration', 'related_events', 'shot',
       'match_id', 'under_pressure', 'out', 'off_camera'],
      dtype='object')

In [11]:
''' Initialise the final dataframe for the baseline model with the shot
    identifier and the name of the player who took the shot.

Variables:-
events_df - DataFrame - events dataframe
final_df - DataFrame - final dataframe for baseline model
'''

final_df = pd.DataFrame({
    'event_id': events_df['id'],
    # each 'player' entry is a dictionary; keep only its 'name' value
    'player': events_df['player'].str['name'],
})

In [12]:
# Distinct 'type' names stored inside the shot dictionaries.
print("UNIQUE SHOT TYPES:\n")
list(events_df['shot'].str['type'].str['name'].unique())

UNIQUE SHOT TYPES:



['Open Play', 'Free Kick', 'Penalty', 'Corner']

In [13]:
# Distinct 'technique' names stored inside the shot dictionaries.
print("UNIQUE SHOT TECHNIQUES:\n")
list(events_df['shot'].str['technique'].str['name'].unique())

UNIQUE SHOT TECHNIQUES:



['Normal',
 'Volley',
 'Half Volley',
 'Overhead Kick',
 'Diving Header',
 'Backheel',
 'Lob']

In [14]:
# Distinct values of the 'first_time' entry of the shot dictionaries.
print("UNIQUE FIRST TIME:\n")
list(events_df['shot'].str['first_time'].unique())

UNIQUE FIRST TIME:



[None, True]

In [15]:
''' Creating 6 additional features.

first_touch - Boolean - whether the shot was taken with the first touch (shot 'first_time' entry).
open_goal - Boolean - value of the shot's 'open_goal' entry.
under_pressure - Boolean - whether the shot taker was put under pressure by opponents.
shot_technique - category - technique of shooting
shot_body_part - category - body part used for taking the shot
statsbomb_xg - continuous - xG value provided by statsbomb.

Variables:-
events_df - DataFrame - events dataframe
final_df - DataFrame - final dataframe for baseline model
'''
# Missing flags are treated as False. The explicit astype(bool) guarantees a
# boolean column instead of relying on fillna() silently downcasting the
# object-dtype result, which newer pandas versions deprecate.
final_df['first_touch'] = events_df['shot'].str['first_time'].fillna(False).astype(bool)
final_df['open_goal'] = events_df['shot'].str['open_goal'].fillna(False).astype(bool)
final_df['under_pressure'] = events_df['under_pressure'].fillna(False).astype(bool)

# Nested dictionaries: keep only the human-readable 'name' of each attribute.
final_df['shot_technique'] = events_df['shot'].str['technique'].str['name']
final_df['shot_body_part'] = events_df['shot'].str['body_part'].str['name']
final_df['statsbomb_xg'] = events_df['shot'].str['statsbomb_xg']

In [16]:
# Distinct 'outcome' names stored inside the shot dictionaries (displayed as an array).
print("UNIQUE SHOT OUTCOMES:\n")
events_df['shot'].str['outcome'].str.get('name').unique()

UNIQUE SHOT OUTCOMES:



array(['Blocked', 'Off T', 'Saved', 'Post', 'Goal', 'Wayward',
       'Saved to Post', 'Saved Off Target'], dtype=object)

In [17]:
''' Target feature: goal. True when the shot outcome is 'Goal', False for every other outcome.

Variables:-
events_df - DataFrame - events dataframe
final_df - DataFrame - final dataframe for baseline model
goal_mapping - dict - maps each outcome name to the boolean target value
'''

# One entry per distinct outcome name; only 'Goal' maps to True.
goal_mapping = {
    outcome: outcome == 'Goal'
    for outcome in events_df['shot'].str['outcome'].str['name'].unique()
}

final_df['goal'] = events_df['shot'].str['outcome'].str['name'].map(goal_mapping)

In [18]:
''' Feature Engineering: Aerial Shot.
    A shot is considered an aerial shot if its end location is above the ground.
    This is checked with the z coordinate (third entry) of the shot's end_location,
    which gives the distance from the ground. If z is greater than 0, the ball was in the air.
    Shots whose end_location has no z entry are treated as not aerial.

    Variables:-
    events_df - DataFrame - events dataframe
    final_df - DataFrame - final dataframe for baseline model
'''

# Read index 2 (z) explicitly. Indexing with -1 returns the y coordinate whenever
# end_location has no z entry, which would wrongly flag those shots as aerial.
# .str[2] yields NaN for such rows, and NaN > 0 evaluates to False.
final_df['aerial_shot'] = pd.to_numeric(
    events_df['shot'].str['end_location'].str[2],
    errors='coerce',
).gt(0).to_numpy()

In [19]:
# Distinct play pattern names found in the events dataframe.
print("UNIQUE PLAY PATTERNS:\n")
list(events_df['play_pattern'].str['name'].unique())

UNIQUE PLAY PATTERNS:



['From Throw In',
 'From Free Kick',
 'Regular Play',
 'From Corner',
 'From Goal Kick',
 'From Counter',
 'From Kick Off',
 'From Keeper',
 'Other']

In [20]:
''' Feature Engineering: Shot From Play.
    A feature telling us which type of play was going on right before the shot event.
    Its values are the play pattern names with the first 'From ' removed
    (e.g. 'From Throw In' becomes 'Throw In'; 'Regular Play' is unchanged).

    Variables:-
    from_play - DataFrame - text before and after the 'From ' separator of each play pattern name
    events_df - DataFrame - events dataframe
    final_df - DataFrame - final dataframe for baseline model
'''

# partition() yields columns 0 (before), 1 (separator) and 2 (after); keep 0 and 2.
from_play = events_df['play_pattern'].str['name'].str.partition('From ')[[0, 2]]
# Gluing both parts back together drops only the separator.
final_df['from_play'] = from_play[0] + from_play[2]

In [21]:
''' Feature Engineering: 1v1.
    The shot column in the events dataframe is a dictionary containing attributes of the shot.
    One such attribute is one_on_one; it is stored in the final dataframe as one_v_one.
    Shots without that attribute are treated as False.

    Variables:-
    events_df - DataFrame - events dataframe
    final_df - DataFrame - final dataframe for baseline model
'''

# The explicit astype(bool) guarantees a boolean column instead of relying on
# fillna() silently downcasting the object-dtype result.
final_df['one_v_one'] = events_df['shot'].str['one_on_one'].fillna(False).astype(bool)

In [22]:
# Preview the first rows of the assembled final dataframe.
final_df.head()

Unnamed: 0,event_id,player,first_touch,open_goal,under_pressure,shot_technique,shot_body_part,statsbomb_xg,goal,aerial_shot,from_play,one_v_one
0,682270cc-4bc4-4952-8f91-d3c5a704a691,Georginio Wijnaldum,False,False,False,Normal,Right Foot,0.036399,False,True,Throw In,False
1,9f5aa3eb-3bed-4bc0-97a5-bb8444b235b9,Mohamed Salah,True,False,True,Normal,Left Foot,0.027976,False,True,Free Kick,False
2,399ac143-5f7b-4080-8c0b-3c18435d7fc1,Roberto Firmino Barbosa de Oliveira,False,False,False,Normal,Head,0.075361,False,True,Regular Play,False
3,660d9d98-46b6-4b5e-9c9a-435d63142c93,Marcelo Vieira da Silva Júnior,False,False,False,Normal,Left Foot,0.023376,False,True,Free Kick,False
4,fe6c7f60-2ff0-4077-882e-b045c8abc7c3,James Philip Milner,True,False,False,Normal,Left Foot,0.043772,False,True,Free Kick,False


In [23]:
# (number of rows, number of columns) of the final dataframe.
final_df.shape

(21777, 12)

### Final data for baseline model

The final dataframe contains 12 columns. Each row describes one shot event. There are 21,777 shots in total in the dataset.

# Export final DataFrame to Pickle file

In [24]:
''' Write the final dataframe to final_df.pkl so that the baseline_model file
    can import it to train a model from this data.

Variables:-
final_df - DataFrame - final dataframe for baseline model
FINAL_DF - string - path to the folder containing all pickle files
'''

final_df.to_pickle(os.path.join(FINAL_DF, 'final_df.pkl'))