In [435]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [306]:
# Lift pandas' display limits so frames are shown with all columns and all rows.
# NOTE(review): with max_rows=None, displaying a large frame prints every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [307]:
# If the kernel runs inside a folder named "notebooks", step one level up;
# the data paths built further down are relative to the working directory.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [308]:
# Input files live in the "data" folder below the working directory.
# Each path component is passed separately so os.path.join supplies the
# separator itself instead of relying on a hard-coded "//" inside a component.
fname_events = os.path.join(os.getcwd(), "data", "events_germany.parquet")
fname_players = os.path.join(os.getcwd(), "data", "players.parquet")
fname_teams = os.path.join(os.getcwd(), "data", "teams.parquet")

In [309]:
# Load the event table from the events parquet file.
df_events = pd.read_parquet(fname_events)


In [310]:
# Load the team table; it is used later to attach teamName to per-team results.
df_teams = pd.read_parquet(fname_teams)

Add the name, favourite foot and position for each player

In [311]:
# Keep only the player attributes needed downstream and expose the role code
# under the shorter name "position".
df_players = (
    pd.read_parquet(fname_players)[["playerId", "shortName", "foot", "role.code2"]]
    .rename(columns={"role.code2": "position"})
)
# No explicit key is given, so pandas joins on every column the two frames share.
df_events = pd.merge(df_events, df_players, how="left")

In [312]:
# For every match, record the smallest and the largest teamId found in its
# events as team1Id / team2Id, then attach both ids to each event row.
df_teams_per_match = (
    df_events.groupby("matchId")["teamId"]
    .agg(team1Id="min", team2Id="max")
    .reset_index()
)
df_events = df_events.merge(df_teams_per_match, how="left")

Add the team that is in possession of the ball for each event

In [313]:
def compute_possession(row):
    """Return the id of the team in possession of the ball for one event row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "eventName", "teamId", "team1Id" and "team2Id";
        "Accurate" is read only for "Duel" events.

    Returns
    -------
    The acting team's id for on-the-ball events and for duels flagged
    ``Accurate == 1``; the other team's id for duels flagged ``Accurate == 0``;
    ``0`` for "Foul", "Interruption" and "Offside"; ``np.nan`` for everything
    else (including duels whose "Accurate" flag is neither 0 nor 1).
    """
    event = row["eventName"]

    if event in ["Pass", "Free Kick", "Others on the ball", "Shot", "Save attempt", "Goalkeeper leaving line"]:
        return row["teamId"]

    if event == "Duel":
        if row["Accurate"] == 1:
            return row["teamId"]
        if row["Accurate"] == 0:
            # The duel was lost: possession goes to the other team of the match.
            if row["teamId"] == row["team1Id"]:
                return row["team2Id"]
            return row["team1Id"]

    if event in ["Foul", "Interruption", "Offside"]:
        return 0

    # Explicitly return NaN; a bare `np.nan` expression would make the
    # function fall through and return None instead.
    return np.nan

In [314]:
# Evaluate compute_possession once per event row (axis=1 passes rows, not columns).
df_events["teamPossession"] = df_events.apply(compute_possession, axis=1)

## Build a first model for expected goals

In [416]:
# Work on an independent copy of the shot events so that columns added later
# do not touch df_events.
df_shots = df_events.loc[df_events["eventName"] == "Shot"].copy()

In [417]:
# Preview the first rows of the shot events.
df_shots.head()

Unnamed: 0,eventId,subEventName,playerId,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,posBeforeX,posBeforeY,posAfterX,posAfterY,Goal,Own goal,Assist,Key pass,Counter attack,Left foot,Right foot,Head/body,Direct,Indirect,Dangerous ball lost,Blocked,High,Low,Interception,Clearance,Opportunity,Feint,Missed ball,Free space right,Free space left,Take on left,Take on right,Sliding tackle,Anticipated,Anticipation,Red card,Yellow card,Second yellow card,Position: Goal low center,Position: Goal low right,Position: Goal center,Position: Goal center left,Position: Goal low left,Position: Goal center right,Position: Goal high center,Position: Goal high left,Position: Goal high right,Position: Out low right,Position: Out center left,Position: Out low left,Position: Out center right,Position: Out high center,Position: Out high left,Position: Out high right,Position: Post low right,Position: Post center left,Position: Post low left,Position: Post center right,Position: Post high center,Position: Post high left,Position: Post high right,Through,Fairplay,Lost,Neutral,Won,Accurate,Not accurate,shortName,foot,position,team1Id,team2Id,teamPossession
104,10,Shot,209091,2516739,Shot,2444,1H,247.703507,100.0,179896573,83,66,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,C. Tolisso,right,MD,2444,2446,2444
178,10,Shot,134383,2516739,Shot,2444,1H,529.393731,100.0,179896639,95,59,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,N. Süle,right,DF,2444,2446,2444
216,10,Shot,105619,2516739,Shot,2446,1H,668.23434,100.0,179896684,91,66,100.0,100.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,A. Mehmedi,right,FW,2444,2446,2446
220,10,Shot,14786,2516739,Shot,2446,1H,672.92592,100.0,179896693,88,49,100.0,100.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,K. Bellarabi,right,MD,2444,2446,2446
313,10,Shot,20475,2516739,Shot,2444,1H,949.131592,100.0,179896798,74,42,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,A. Vidal,right,MD,2444,2446,2444


In [418]:
# Baseline figures: number of shots and the mean of the Goal column.
# NOTE(review): Goal looks like a 0/1 flag in the preview, which would make
# the mean the share of shots that ended in a goal — confirm.
print(f"Total shots: {len(df_shots)}")
print(f"Expected goal when shooting: {df_shots['Goal'].mean()}")

Total shots: 6898
Expected goal when shooting: 0.1082922586256886


#### Head vs. foot

In [511]:
def from_dummy(df, cols, comb_col):
    """Collapse several indicator columns into a single label column.

    For every row, ``comb_col`` receives the name of the column in ``cols``
    whose value equals 1. If several columns are 1, the one listed last in
    ``cols`` wins. Rows in which none of the columns is 1 get "Unknown".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the indicator columns. It is modified in place.
    cols : list of str
        Names of the indicator columns.
    comb_col : str
        Name of the label column to create (or overwrite).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now containing ``comb_col``.
    """
    # Start from an object array pre-filled with the fallback label. Mixing a
    # float NaN placeholder with string labels in np.where would coerce the
    # placeholder to text, and a later fillna would never see a missing value.
    labels = np.full(len(df), "Unknown", dtype=object)
    for col in cols:
        labels = np.where((df[col] == 1).to_numpy(), col, labels)
    df[comb_col] = labels
    return df

In [562]:
def create_variable_graph(df, col, binned_cols=False):
    """Plot the distribution of ``col`` together with the goal rate per value.

    ``df`` is grouped by ``col``. A bar trace shows each group's share of all
    rows in ``df`` (in %); a line trace on a secondary y-axis shows the sum of
    ``Goal`` divided by the group's row count (in %).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and a ``Goal`` column.
    col : str
        Column to group by.
    binned_cols : bool, default False
        If True, the sorted group keys are shown as range labels on the
        x-axis: ``<=first``, ``a - b`` for inner keys (``b`` being the next
        key) and ``>=last``.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with the bar trace first and the line trace second.
    """
    df_group = df.groupby(col).agg(total_count=(col, "count"),
                                   total_goal=("Goal", "sum")).reset_index()
    df_group["share"] = df_group["total_count"] / len(df) * 100
    df_group["share_goal"] = df_group["total_goal"] / df_group["total_count"] * 100

    if binned_cols:
        # Derive the labels from the grouped keys so that they always line up
        # one-to-one with the plotted values (groupby sorts the keys and
        # leaves out missing ones).
        bin_keys = df_group[col].tolist()
        last_pos = len(bin_keys) - 1
        x_values = []
        for pos, key in enumerate(bin_keys):
            if pos == 0:
                x_values.append(f"<={key}")
            elif pos == last_pos:
                x_values.append(f">={key}")
            else:
                x_values.append(f"{key} - {bin_keys[pos + 1]}")
    else:
        x_values = df_group[col]

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Bar(
            x=x_values,
            y=df_group["share"],
            name="Share of shots"
        ))

    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=df_group["share_goal"],
            name="Prob. to score"
        ), secondary_y=True,)

    # Axis title fonts are set through title.font; the older `titlefont`
    # shortcut is not accepted by newer Plotly releases.
    fig.update_layout(
        title=f'Distribution for {col}',
        yaxis=dict(
            title=dict(text='Share shots (in %)', font=dict(size=16)),
            tickfont=dict(size=14),
        ),
        yaxis2=dict(
            title=dict(text='Goal (in %)', font=dict(size=16)),
            tickfont=dict(size=14),
        ),
    )

    return fig

In [563]:
# Indicator columns that describe the body part used for the shot, and the
# name of the single label column they are collapsed into.
cols = ["Left foot", "Right foot", "Head/body"]
comb_col = "shotFoot"

In [564]:
# Add the combined shotFoot label column to the shot table.
df_shots = from_dummy(df_shots, cols, comb_col)

In [565]:
# Share of shots and goal rate per shotFoot value, over all shots.
graph_shot_foot = create_variable_graph(df_shots, comb_col)
graph_shot_foot.show()

In [566]:
# Same chart restricted to shots whose posBeforeX exceeds 100 - 16*100/105 (about 84.8).
# NOTE(review): presumably "within 16 m of the goal line" for x scaled 0-100
# on a 105 m long pitch — confirm the intended distance.
df_shots_close = df_shots[df_shots["posBeforeX"] > 100 - 16*100/105].copy()
graph_shot_close = create_variable_graph(df_shots_close, comb_col)

In [567]:
# Render the chart for the close-range subset.
graph_shot_close.show()

In [568]:
def build_buckets(df, col, step_size, min_val=None, max_val=None):
    """Return ``df[col]`` mapped onto the lower edges of fixed-width buckets.

    The values are optionally capped at ``max_val`` and then raised to at
    least ``min_val``. Each value ``x`` is then replaced by
    ``int(x / step_size) * step_size``. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Column to bucket.
    step_size : number
        Bucket width.
    min_val, max_val : number, optional
        Bounds applied before bucketing; ``None`` disables the bound.

    Returns
    -------
    pandas.Series
        The bucketed values, indexed like ``df``.
    """
    values = df[col]

    # Upper bound first, then lower bound.
    if max_val is not None:
        values = values.clip(upper=max_val)
    if min_val is not None:
        values = values.clip(lower=min_val)

    def _bucket_start(value):
        # int() truncates towards zero before scaling back to the bucket edge.
        return int(value / step_size) * step_size

    return values.map(_bucket_start)


In [569]:
# Bucket posBeforeX in steps of 3 after limiting the values to the range 70..97.
df_shots["posXBinned"] = build_buckets(df_shots, "posBeforeX", 3, 70, 97)

In [570]:
# Share of shots and goal rate per posXBinned bucket, with range labels on the x-axis.
fig = create_variable_graph(df_shots, "posXBinned", binned_cols=True)
fig.show()

## Passes per game

In [335]:
# Independent copy of all pass events; pass-specific columns are added to it below.
df_pass_data = df_events[df_events["eventName"] == "Pass"].copy()

In [336]:
# Per team: number of passes, number of distinct matches and sum of the
# Accurate flag, from which passes per game and pass accuracy are derived.
# Teams are ordered by passes per game (highest first) before the team name
# is attached.
df_passes = (
    df_pass_data.groupby("teamId")
    .agg(
        totalPasses=("teamId", "count"),
        matches=("matchId", "nunique"),
        accuratePasses=("Accurate", "sum"),
    )
    .reset_index()
    .assign(
        passesPerGame=lambda stats: stats["totalPasses"] / stats["matches"],
        passAccuracy=lambda stats: stats["accuratePasses"] / stats["totalPasses"],
    )
    .sort_values("passesPerGame", ascending=False)
    .merge(df_teams[["teamId", "teamName"]], how="left")
)

In [337]:
# One bar per team: height is passes per game, colour encodes pass accuracy.
fig = px.bar(df_passes, x="teamName", y="passesPerGame", 
             color="passAccuracy", 
             labels={'passesPerGame':"Passes Per Game", "teamName": "Team", "passAccuracy": "Pass Accuracy"})
fig.show()

In [338]:
# Pass length as the Euclidean distance between start and end position, with
# the y offset scaled by 68/100 and the x offset scaled by 105/100.
# NOTE(review): presumably converts 0-100 pitch coordinates to metres on a
# 105 x 68 pitch — confirm the coordinate system.
df_pass_data["passLength"] = np.sqrt(np.square((df_pass_data["posBeforeY"] - df_pass_data["posAfterY"])*68/100) + np.square((df_pass_data["posBeforeX"] - df_pass_data["posAfterX"])*105/100))

In [356]:
# Bucket the pass length in steps of 5 (lower bucket edge) and gather every
# bucket above 60 in the 60 bucket.
df_pass_data["passLengthBin"] = (
    df_pass_data["passLength"]
    .map(lambda length: int(length / 5) * 5)
    .clip(upper=60)
)

In [364]:
# Mean of passLength over all passes.
print(f"Average length of pass: {df_pass_data['passLength'].mean()}")

Average length of pass: 20.630867891994786


In [360]:
# Per length bucket: number of passes and sum of the Accurate flag, plus the
# bucket's share of all passes and its accuracy, both in percent.
df_pass_length = (
    df_pass_data.groupby("passLengthBin")
    .agg(
        totalPasses=("passLengthBin", "count"),
        accuratePasses=("Accurate", "sum"),
    )
    .reset_index()
    .assign(
        sharePasses=lambda stats: stats["totalPasses"] / len(df_pass_data) * 100,
        shareAccurate=lambda stats: stats["accuratePasses"] / stats["totalPasses"] * 100,
    )
)

In [362]:
# One bar per length bucket: height is the share of passes, colour encodes the accuracy.
fig = px.bar(df_pass_length, x="passLengthBin", y="sharePasses", 
             color="shareAccurate", 
             labels={"passLengthBin": "Pass length (in m)", 
                     "sharePasses": "Share passes (in %)", 
                     "shareAccurate": "Pass Accuracy (in %)"})
fig.show()

In [365]:
# Mean and median pass length per team.
df_length_team = df_pass_data.groupby("teamId").agg(meanPassLength=("passLength", "mean"),
                                                    medianPassLength=("passLength", "median")).reset_index()

In [367]:
# Attach the team name; with no explicit key the join uses the columns both frames share.
df_length_team = df_length_team.merge(df_teams[["teamId", "teamName"]], how="left")

In [369]:
# Order the teams from shortest to longest mean pass length.
df_length_team = df_length_team.sort_values("meanPassLength")

In [372]:
# One bar per team showing its mean pass length, in the sorted order from above.
fig = px.bar(df_length_team, x="teamName", y="meanPassLength", 
             labels={'meanPassLength':"Avg.pass length (in m)", "teamName": "Team"})
fig.show()