# Data Modeling 

In [43]:
#libraries required
import pandas as pd 
import numpy as np
import os
import sys

In [51]:
#globally set column display options
#None removes the limit, so every column and every row is shown when a DataFrame is displayed
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [45]:
#load the processed shot events into a DataFrame
file_path = "../data/processed/shot_events_processed.csv"
df = pd.read_csv(file_path)

#report the size of the loaded dataset
n_rows, n_cols = df.shape
print(f"Dataset has {n_rows} rows and {n_cols} columns")

Dataset has 1494 rows and 23 columns


In [46]:
#confirm no missing values: percentage of missing entries per column
missing_pct = df.isna().mean() * 100
print(missing_pct)

#confirm data types: dtype of every column
column_dtypes = df.dtypes
print(f"\n Data types in dataset \n{column_dtypes}")

location                 0.0
type_event               0.0
play_pattern             0.0
shot_xg                  0.0
shot_end_location        0.0
shot_type                0.0
shot_outcome             0.0
shot_technique           0.0
body_part                0.0
first_time_shot          0.0
1v1_shot                 0.0
under_pressure           0.0
aerial_shot_won          0.0
open_goal_shot           0.0
deflected_shot           0.0
shot_saved_to_post       0.0
shot_follows_dribble     0.0
shot_saved_off_target    0.0
shot_redirected          0.0
x_coordinate             0.0
y_coordinate             0.0
shot_distance            0.0
shot_angle               0.0
dtype: float64

 Data types in dataset 
location                  object
type_event                object
play_pattern              object
shot_xg                  float64
shot_end_location         object
shot_type                 object
shot_outcome              object
shot_technique            object
body_part                 obj

In [47]:
#list the column names of the loaded dataset
df.columns

Index(['location', 'type_event', 'play_pattern', 'shot_xg',
       'shot_end_location', 'shot_type', 'shot_outcome', 'shot_technique',
       'body_part', 'first_time_shot', '1v1_shot', 'under_pressure',
       'aerial_shot_won', 'open_goal_shot', 'deflected_shot',
       'shot_saved_to_post', 'shot_follows_dribble', 'shot_saved_off_target',
       'shot_redirected', 'x_coordinate', 'y_coordinate', 'shot_distance',
       'shot_angle'],
      dtype='object')

In [48]:
#change categorical features to type category: 

def typeToCategory(df,features):
    """Convert the given columns of a DataFrame to the ``category`` dtype.

    The conversion happens in place: on success the very same DataFrame
    object is returned with the listed columns converted. Every column is
    converted before any of them is written back, so a failure (for example
    an unknown column name) leaves ``df`` exactly as it was instead of
    partially converted.

    Args:
        df: pandas DataFrame holding the columns to convert.
        features: list of column names to convert.

    Returns:
        ``df`` itself on success, or ``None`` when a column could not be
        processed (the error message is printed).

    Raises:
        ValueError: if ``df`` is not a DataFrame or ``features`` is not a list.
    """
    if not isinstance(df, pd.DataFrame) or not isinstance(features, list):
        raise ValueError("Invalid input types. Expects Dataframe and list")

    try:
        # Phase 1: build every converted column without touching df, so an
        # error on any feature cannot leave df half-converted.
        converted = {name: df[name].astype('category') for name in features}

        # Phase 2: all conversions succeeded, write them back in place.
        for name, column in converted.items():
            df[name] = column

        #validate changes
        print(f"\nData types for categorical features : \n{df[features].dtypes}")
        return df
    except Exception as e:
        # Kept from the original contract: report the problem and return None
        # rather than raising, so existing callers behave the same way.
        print(f"Error processing features:  {str(e)}")
        return None
        


In [49]:
#columns to be treated as categorical
cat_features = [
    'shot_type',
    'body_part',
    'first_time_shot',
    '1v1_shot',
    'shot_outcome',
    'under_pressure',
    'aerial_shot_won',
    'open_goal_shot',
    'deflected_shot',
    'shot_saved_to_post',
    'shot_follows_dribble',
    'shot_saved_off_target',
    'shot_redirected',
    'play_pattern',
    'shot_technique',
]

#convert them to the category dtype
#note: typeToCategory converts in place and returns the frame it was given,
#so on success df_xg and df refer to the same DataFrame object
df_xg = typeToCategory(df, cat_features)


Data types for categorical features : 
shot_type                category
body_part                category
first_time_shot          category
1v1_shot                 category
shot_outcome             category
under_pressure           category
aerial_shot_won          category
open_goal_shot           category
deflected_shot           category
shot_saved_to_post       category
shot_follows_dribble     category
shot_saved_off_target    category
shot_redirected          category
play_pattern             category
shot_technique           category
dtype: object


In [52]:
#Feature selection
#preview the first rows of df_xg to help decide which columns to keep as predictors
df_xg.head()

Unnamed: 0,location,type_event,play_pattern,shot_xg,shot_end_location,shot_type,shot_outcome,shot_technique,body_part,first_time_shot,1v1_shot,under_pressure,aerial_shot_won,open_goal_shot,deflected_shot,shot_saved_to_post,shot_follows_dribble,shot_saved_off_target,shot_redirected,x_coordinate,y_coordinate,shot_distance,shot_angle
0,"[92.6, 52.0]",Shot,From Goal Kick,0.02382,"[101.1, 47.5]",Open Play,Blocked,Normal,Right Foot,0,0,0,0,0,0,0,0,0,0,92.6,52.0,29.912539,0.223755
1,"[114.0, 54.8]",Shot,Regular Play,0.01406,"[114.6, 53.3]",Open Play,Blocked,Normal,Left Foot,0,0,0,0,0,0,0,0,0,0,114.0,54.8,15.969972,0.179792
2,"[93.4, 44.5]",Shot,From Throw In,0.033115,"[98.1, 43.9]",Open Play,Blocked,Normal,Right Foot,0,0,0,0,0,0,0,0,0,0,93.4,44.5,26.977954,0.266085
3,"[114.7, 29.6]",Shot,From Corner,0.043661,"[120.0, 43.4, 3.8]",Open Play,Off T,Normal,Head,0,0,0,0,0,0,0,0,0,0,114.7,29.6,11.672618,0.305878
4,"[115.3, 32.5]",Shot,From Corner,0.124033,"[116.0, 33.8]",Open Play,Blocked,Volley,Right Foot,1,0,0,0,0,0,0,0,0,0,115.3,32.5,8.850989,0.48716


The following features will not be included as predictors:
1.  location - not useful as is for a logistic regression model
2.  type_event - no variance in its values
3.  shot_xg - will be used to compare against the predicted values
4.  x_coordinate
5.  y_coordinate
6.  shot_end_location - not useful as is; it could be engineered to derive where the shot ended, but that would cause data leakage in training
7.  shot_outcome - this is the target value to be predicted (will be converted to binary: 1 = goal, 0 = no goal)

The list of predictors may be narrowed further.


In [53]:
#list the columns currently present in df_xg before selecting the modeling subset
df_xg.columns

Index(['location', 'type_event', 'play_pattern', 'shot_xg',
       'shot_end_location', 'shot_type', 'shot_outcome', 'shot_technique',
       'body_part', 'first_time_shot', '1v1_shot', 'under_pressure',
       'aerial_shot_won', 'open_goal_shot', 'deflected_shot',
       'shot_saved_to_post', 'shot_follows_dribble', 'shot_saved_off_target',
       'shot_redirected', 'x_coordinate', 'y_coordinate', 'shot_distance',
       'shot_angle'],
      dtype='object')

In [56]:
#data type of every column currently in df_xg
#NOTE(review): the saved output of this cell lists fewer columns than the df_xg.columns
#cell before it, which suggests the cells were not last run top to bottom - confirm with
#Restart Kernel -> Run All
df_xg.dtypes

play_pattern             category
shot_xg                   float64
shot_type                category
shot_outcome             category
shot_technique           category
body_part                category
first_time_shot          category
1v1_shot                 category
under_pressure           category
aerial_shot_won          category
open_goal_shot           category
deflected_shot           category
shot_saved_to_post       category
shot_follows_dribble     category
shot_saved_off_target    category
shot_redirected          category
shot_distance             float64
shot_angle                float64
dtype: object

In [58]:
#keep only the target (shot_outcome) and the candidate predictor columns
#.copy() makes df_xg an independent frame, so later column assignments on it
#(e.g. recoding shot_outcome) do not trigger SettingWithCopyWarning
df_xg= df_xg[['play_pattern','shot_type','shot_outcome', 'shot_technique',
       'body_part', 'first_time_shot', '1v1_shot', 'under_pressure',
       'aerial_shot_won', 'open_goal_shot', 'deflected_shot',
       'shot_saved_to_post', 'shot_follows_dribble', 'shot_saved_off_target',
       'shot_redirected','shot_distance',
       'shot_angle']].copy()

In [59]:
# Numeric columns: those stored as int64 or float64
numeric_columns = df_xg.select_dtypes(include=['int64', 'float64']).columns
numeric_features = list(numeric_columns)
print("Numeric features:", numeric_features)

# Target column to be predicted
target = df_xg['shot_outcome']

# Category-typed columns
# NOTE(review): shot_outcome is category-typed, so this list also contains the
# target - confirm it is excluded before the list is used to encode predictors
category_columns = df_xg.select_dtypes(include=['category']).columns
categorical_features = list(category_columns)
print("\nCategorical features:", categorical_features)

# Every column except the target, in their original order
features = df_xg.columns[df_xg.columns != 'shot_outcome'].tolist()
print(f"\n All features for modeling:", features)


Numeric features: ['shot_distance', 'shot_angle']

Categorical features: ['play_pattern', 'shot_type', 'shot_outcome', 'shot_technique', 'body_part', 'first_time_shot', '1v1_shot', 'under_pressure', 'aerial_shot_won', 'open_goal_shot', 'deflected_shot', 'shot_saved_to_post', 'shot_follows_dribble', 'shot_saved_off_target', 'shot_redirected']

 All features for modeling: ['play_pattern', 'shot_type', 'shot_technique', 'body_part', 'first_time_shot', '1v1_shot', 'under_pressure', 'aerial_shot_won', 'open_goal_shot', 'deflected_shot', 'shot_saved_to_post', 'shot_follows_dribble', 'shot_saved_off_target', 'shot_redirected', 'shot_distance', 'shot_angle']
