In [305]:
import pandas as pd
import numpy as np
import os
import plotly.express as px

In [306]:
# Lift pandas' display limits so DataFrames are rendered without truncating
# columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [307]:
# When the kernel was started inside a directory named "notebooks", move the
# working directory one level up.
if os.getcwd().rsplit(os.sep, 1)[-1] == "notebooks":
    os.chdir("../")

In [308]:
# Locations of the input parquet files inside the "data" directory of the
# current working directory. Each path component is passed separately so that
# os.path.join inserts the platform's separator instead of a hard-coded "//".
fname_events = os.path.join(os.getcwd(), "data", "events_germany.parquet")
fname_players = os.path.join(os.getcwd(), "data", "players.parquet")
fname_teams = os.path.join(os.getcwd(), "data", "teams.parquet")

In [309]:
# Read the event table from its parquet file.
df_events = pd.read_parquet(path=fname_events)


In [310]:
# Read the team table from its parquet file.
df_teams = pd.read_parquet(path=fname_teams)

Add the name, favourite foot and position for each player

In [311]:
# Load the players and keep only the attributes attached to the events,
# exposing "role.code2" under the name "position".
df_players = (
    pd.read_parquet(fname_players)
    .loc[:, ["playerId", "shortName", "foot", "role.code2"]]
    .rename(columns={"role.code2": "position"})
)

# Left join: every event is kept. No key is given, so pandas joins on all
# column names the two frames have in common.
df_events = df_events.merge(df_players, how="left")

In [312]:
# For every match, record the smallest and the largest teamId found in its
# events as team1Id / team2Id.
# NOTE(review): this identifies both sides only if each match contains
# events of exactly two teams - confirm.
df_teams_per_match = (
    df_events
    .groupby("matchId")["teamId"]
    .agg(team1Id="min", team2Id="max")
    .reset_index()
)

# Attach both ids to each event (left join on the shared column names).
df_events = df_events.merge(df_teams_per_match, how="left")

Add the team that is in possession of the ball for each event

In [313]:
def compute_possession(row):
    """Return the id of the team considered in possession for one event.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "eventName" and "teamId"; for "Duel" events it must also
        provide "Accurate", "team1Id" and "team2Id".

    Returns
    -------
    The event's own ``teamId`` for on-the-ball events and for duels with
    ``Accurate == 1``; the other team of the match for duels with
    ``Accurate == 0``; ``0`` for fouls, interruptions and offsides; and
    ``np.nan`` for every other event (including duels whose ``Accurate``
    value is neither 0 nor 1).
    """
    event_name = row["eventName"]

    # Events performed by the team that has the ball.
    if event_name in ["Pass", "Free Kick", "Others on the ball", "Shot", "Save attempt", "Goalkeeper leaving line"]:
        return row["teamId"]

    if event_name == "Duel":
        if row["Accurate"] == 1:
            return row["teamId"]
        if row["Accurate"] == 0:
            # The acting team did not succeed: credit the other team of the match.
            if row["teamId"] == row["team1Id"]:
                return row["team2Id"]
            return row["team1Id"]

    # These events are mapped to 0 rather than to a team id.
    if event_name in ["Foul", "Interruption", "Offside"]:
        return 0

    # The original evaluated `np.nan` without returning it, so this case
    # silently produced None instead of NaN.
    return np.nan

In [314]:
# Evaluate compute_possession once per event row and store the result.
df_events["teamPossession"] = df_events.apply(compute_possession, axis=1)

Passes per game

In [335]:
# Independent copy of the "Pass" events, so columns can be added to it without
# touching df_events.
df_pass_data = df_events.loc[df_events["eventName"].eq("Pass")].copy()

In [336]:
# Per-team pass statistics: number of passes, number of distinct matches,
# sum of the "Accurate" flag, plus the derived passes-per-game and accuracy
# ratios. Teams are ordered by passes per game (highest first) and then given
# their names from df_teams.
df_passes = (
    df_pass_data
    .groupby("teamId")
    .agg(
        totalPasses=("teamId", "count"),
        matches=("matchId", "nunique"),
        accuratePasses=("Accurate", "sum"),
    )
    .reset_index()
    .assign(
        passesPerGame=lambda stats: stats["totalPasses"] / stats["matches"],
        passAccuracy=lambda stats: stats["accuratePasses"] / stats["totalPasses"],
    )
    .sort_values("passesPerGame", ascending=False)
    .merge(df_teams[["teamId", "teamName"]], how="left")
)

In [337]:
# One bar per team: height is passes per game, colour is pass accuracy.
fig = px.bar(
    data_frame=df_passes,
    x="teamName",
    y="passesPerGame",
    color="passAccuracy",
    labels=dict(
        passesPerGame="Passes Per Game",
        teamName="Team",
        passAccuracy="Pass Accuracy",
    ),
)
fig.show()

In [338]:
# Euclidean distance between the start and end position of each pass. The X
# difference is scaled by 105/100 and the Y difference by 68/100 first.
# NOTE(review): presumably positions are on a 0-100 scale and 105 x 68 are the
# pitch dimensions in metres - confirm.
df_pass_data["passLength"] = np.sqrt(
    np.square((df_pass_data["posBeforeX"] - df_pass_data["posAfterX"]) * 105 / 100)
    + np.square((df_pass_data["posBeforeY"] - df_pass_data["posAfterY"]) * 68 / 100)
)

In [356]:
# Assign each pass to a bin of width 5 (the bin label is the lower edge,
# obtained by integer truncation) and cap the label at 60 so that all longer
# passes share the last bin.
df_pass_data["passLengthBin"] = (
    df_pass_data["passLength"]
    .map(lambda length: int(length / 5) * 5)
    .clip(upper=60)
)

In [364]:
# Report the mean pass length over all passes.
mean_pass_length = df_pass_data["passLength"].mean()
print(f"Average length of pass: {mean_pass_length}")

Average length of pass: 20.630867891994786


In [360]:
# Per length bin: number of passes, sum of the "Accurate" flag, the bin's
# share of all passes and the share of accurate passes within the bin
# (both shares in percent).
df_pass_length = (
    df_pass_data
    .groupby("passLengthBin")
    .agg(
        totalPasses=("passLengthBin", "count"),
        accuratePasses=("Accurate", "sum"),
    )
    .reset_index()
    .assign(
        sharePasses=lambda bins: bins["totalPasses"] / len(df_pass_data) * 100,
        shareAccurate=lambda bins: bins["accuratePasses"] / bins["totalPasses"] * 100,
    )
)

In [362]:
# One bar per length bin: height is the bin's share of all passes, colour is
# the share of accurate passes within the bin.
fig = px.bar(
    data_frame=df_pass_length,
    x="passLengthBin",
    y="sharePasses",
    color="shareAccurate",
    labels=dict(
        passLengthBin="Pass length (in m)",
        sharePasses="Share passes (in %)",
        shareAccurate="Pass Accuracy (in %)",
    ),
)
fig.show()

In [365]:
# Mean and median pass length for every team.
df_length_team = (
    df_pass_data
    .groupby("teamId")
    .agg(
        meanPassLength=pd.NamedAgg(column="passLength", aggfunc="mean"),
        medianPassLength=pd.NamedAgg(column="passLength", aggfunc="median"),
    )
    .reset_index()
)

In [367]:
# Add each team's name (left join on the column names shared with df_teams).
df_length_team = df_length_team.merge(df_teams[["teamId", "teamName"]], how="left")

In [369]:
# Order the teams from the shortest to the longest mean pass length.
df_length_team = df_length_team.sort_values("meanPassLength")

In [372]:
# One bar per team showing its mean pass length.
fig = px.bar(
    data_frame=df_length_team,
    x="teamName",
    y="meanPassLength",
    labels=dict(meanPassLength="Avg.pass length (in m)", teamName="Team"),
)
fig.show()