In [None]:
# Import the opta loader functions from the kloppy package
# Import pandas under the alias "pd" - it is used later on to work with dataframes
# "import pandas as pd" is a widely used convention in Python

from kloppy import opta
import pandas as pd

# Step 1: let kloppy parse the two Opta feeds (F7 and F24), so there is no
# need to work with the troublesome XML files by hand.
# coordinates="opta" is passed through to kloppy to select the coordinate system.

event_data = opta.load(
    f7_data="C:\\Users\\YOUR_USER\\Desktop\\Github\\DFDA\\Data\\Event Data\\F7.xml",
    f24_data="C:\\Users\\YOUR_USER\\Desktop\\Github\\DFDA\\Data\\Event Data\\F24.xml",
    coordinates="opta",
)

# Step 2: convert the events to a pandas dataframe.
# additional_columns maps a new column name to a function that is given each event;
# here the player and team of every event are added as text columns.

dataset = event_data.to_pandas(
    additional_columns={
        "player": lambda event: str(event.player),
        "team": lambda event: str(event.team),
    }
)

# Show the first rows as the cell output

dataset.head()

In [None]:
# Focusing on shots - a dataframe can be subset with a boolean mask:
# DATASET_NAME.loc[DATASET_NAME[COLUMN_NAME] == 'TEXT_VALUE']
# Here the mask is stored in its own variable so it can be reused below.

is_shot = dataset['event_type'] == 'SHOT'
shots = dataset.loc[is_shot]

print("Total shots:", len(shots))

# To focus on a single team, combine a second mask with the first one using &
# (when written inline, each condition must be wrapped in parentheses)

is_vejle = dataset['team'] == 'Vejle BK'
shots_vejle = dataset.loc[is_shot & is_vejle]

print("Vejle BK shots:", len(shots_vejle))

In [None]:
# Alternatively, a group by statement counts the shots of every team in one go:
# size() returns the number of rows in each 'team' group

shots_per_team = shots.groupby('team').size()
print(shots_per_team)

In [None]:
# Turning to passes, groupby quickly shows which players have the most passes.
# The whole pipeline is written as one method chain, read from top to bottom:
#   1. keep only the PASS events
#   2. count the rows per player on each team
#   3. turn the unnamed counts into a dataframe with a "Passes" column
#   4. sort so the players with the most passes come first

passes = (
    dataset.loc[dataset['event_type'] == 'PASS']
    .groupby(['player', 'team'])
    .size()
    .to_frame("Passes")
    .sort_values("Passes", ascending=False)
)

# head() shows the top 5 (the frame is sorted descending); tail() would show the bottom 5

passes.head()

In [None]:
# Another metric which can be derived from passes is passes INSIDE the last 1/3 of the pitch.
# Opta coordinates go from X=0,Y=0 (right corner of own backline)
# to X=100,Y=100 (left corner of the opponent's backline),
# so an event is inside the last 1/3 when its X value is above 100*2/3.

last_third_x = 100 * 2 / 3

# Same chain as for the pass counts, but a pass is only kept when both its
# start (coordinates_x) and its end (end_coordinates_x) are inside the last 1/3.
# The counts per player and team are stored in a named column and sorted descending.

passes_last_third = (
    dataset.loc[
        (dataset['event_type'] == 'PASS')
        & (dataset['coordinates_x'] > last_third_x)
        & (dataset['end_coordinates_x'] > last_third_x)
    ]
    .groupby(['player', 'team'])
    .size()
    .to_frame("Passes Inside Last 1/3")
    .sort_values("Passes Inside Last 1/3", ascending=False)
)

# head(10) shows the top 10

passes_last_third.head(10)