In [1]:
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from kloppy.domain.models import Point
from typing import Callable, Sequence, Union
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from soccer_analytics.data.statsbomb import load_competition_seasons
from soccer_analytics.data_split import split_by_time
from soccer_analytics.preprocessing import match_list_to_df, get_block_score, uniform_block_score

In [2]:

# Load the 2022 FIFA World Cup matches, restricted to shot events.
matches = load_competition_seasons("FIFA World Cup", ["2022"], event_types=["shot"])

# Keep only the first part returned by split_by_time; the second part is
# discarded here. NOTE(review): presumably the second part is the most recent
# 20% of matches held out for final evaluation -- confirm in
# soccer_analytics.data_split.
train_test_matches, _ = split_by_time(matches, test_frac=0.2)

# Seeded random split of the remaining matches so the partition is reproducible.
train_matches, test_matches = train_test_split(
    train_test_matches,
    test_size=0.2,
    random_state=235,
)

# Show how many matches ended up in the training set.
len(train_matches)

40

In [3]:
# Convert the training matches into a single DataFrame
# (presumably one row per shot event -- confirm in match_list_to_df).
df = match_list_to_df(train_matches)

# Preview the first five entries of the first row's freeze frame.
first_freeze_frame = df["freeze_frame"].iloc[0]
first_freeze_frame[:5]

[{'location': [105.7, 38.8],
  'player': {'id': 33572, 'name': 'Nathaniel Atkinson'},
  'position': {'id': 2, 'name': 'Right Back'},
  'teammate': False},
 {'location': [107.8, 32.9],
  'player': {'id': 5481, 'name': 'Mathew Leckie'},
  'position': {'id': 12, 'name': 'Right Midfield'},
  'teammate': False},
 {'location': [107.1, 31.2],
  'player': {'id': 33477, 'name': 'Riley McGree'},
  'position': {'id': 15, 'name': 'Left Center Midfield'},
  'teammate': False},
 {'location': [98.2, 46.6],
  'player': {'id': 8346, 'name': 'Craig Goodwin'},
  'position': {'id': 16, 'name': 'Left Midfield'},
  'teammate': False},
 {'location': [113.4, 36.0],
  'player': {'id': 5490, 'name': 'Jackson Irvine'},
  'position': {'id': 13, 'name': 'Right Center Midfield'},
  'teammate': False}]

In [9]:
# Scoring function handed to get_block_score for every shot.
block_function = uniform_block_score(max_distance=0.5)


def _row_block_score(row):
    """Return the block score for one row of ``df``.

    Rows without a freeze frame (``None``) get a score of 0. Otherwise the
    score comes from ``get_block_score`` evaluated at the shot location with
    the "compound" overlap strategy.
    """
    if row.freeze_frame is None:
        return 0
    shot_origin = Point(row.coordinates_x, row.coordinates_y)
    return get_block_score(
        shot_origin,
        row.freeze_frame,
        block_function,
        overlap_strategy="compound",
    )


# One score per row, in the same order as df.
block_score = df.apply(_row_block_score, axis=1)

In [10]:
# Walk every event of every training match once, recording a 0/1 "blocked"
# label and the raw provider event side by side so both lists stay aligned.
is_blocked, raw_events = [], []
for match in train_matches:
    for event in match:
        is_blocked.append(int(event.result.value == "BLOCKED"))
        raw_events.append(event.raw_event)

In [11]:
# Combine labels, scores and raw events into one frame, then bucket the scores
# into (up to) ten quantile bins. duplicates="drop" merges bins whose edges
# coincide, so fewer than ten bins can result.
block_info = pd.DataFrame(
    {
        "is_blocked": is_blocked,
        "block_score": block_score.values,
        "raw_event": raw_events,
    }
).assign(
    score_bins=lambda frame: pd.qcut(
        frame["block_score"], q=10, labels=False, duplicates="drop"
    )
)
block_info.describe()

Unnamed: 0,is_blocked,block_score,score_bins
count,865.0,865.0,865.0
mean,0.247399,0.358831,2.100578
std,0.43175,0.337299,2.168335
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.3231,1.0
75%,0.0,0.6498,4.0
max,1.0,0.97317,6.0


In [12]:
# Per-bin summary: how many shots fall in each score bin, what fraction of them
# were blocked, and the mean / min / max block score within the bin.
lift_aggregations = {
    "num_shots": pd.NamedAgg(column="is_blocked", aggfunc="size"),
    "fraction_blocked": pd.NamedAgg(column="is_blocked", aggfunc="mean"),
    "average_block_score": pd.NamedAgg(column="block_score", aggfunc="mean"),
    "min_block_score": pd.NamedAgg(column="block_score", aggfunc="min"),
    "max_block_score": pd.NamedAgg(column="block_score", aggfunc="max"),
}
shots_by_bin = block_info.groupby("score_bins")
block_lift_df = shots_by_bin.agg(**lift_aggregations)
block_lift_df

Unnamed: 0_level_0,num_shots,fraction_blocked,average_block_score,min_block_score,max_block_score
score_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,346,0.040462,0.011895,0.0,0.1269
1,87,0.16092,0.241946,0.1278,0.3231
2,86,0.197674,0.395932,0.3258,0.4707
3,86,0.372093,0.530274,0.4707,0.59562
4,87,0.425287,0.650937,0.5958,0.7065
5,86,0.546512,0.806112,0.70821,0.89262
6,87,0.609195,0.915095,0.8937,0.97317


In [13]:
# ROC AUC of the block score used as a ranking of shots by the blocked label.
roc_auc_score(
    y_true=block_info["is_blocked"],
    y_score=block_info["block_score"],
)

0.818715276282355

In [None]:
# The 15 highest-scoring shots that were NOT blocked.
not_blocked = block_info.loc[block_info["is_blocked"].eq(0)]
not_blocked.sort_values(by="block_score", ascending=False).head(15)

In [None]:
# Positional index (iloc) of the shot to inspect; `i` and `row` are reused by
# later cells.
i = 235
row = df.iloc[i]

# A missing freeze frame is treated as "no players recorded", mirroring the
# scoring cell, which assigns such rows a block score of 0.
freeze_frame = row.freeze_frame if row.freeze_frame is not None else []
shot_origin = Point(row.coordinates_x, row.coordinates_y)

# Recompute the score with the same overlap strategy used for the scores stored
# in block_info, so the value printed here matches the one that flagged the shot.
if row.freeze_frame is None:
    score = 0
else:
    score = get_block_score(
        shot_origin,
        freeze_frame,
        block_function,
        overlap_strategy="compound",
    )
print(score)


fig, ax = plt.subplots(1, 1, figsize=(10, 7))

# Shooter in blue.
ax.plot([row.coordinates_x], [row.coordinates_y], color="blue", marker="o", ms=4)

# Other players: goalkeeper in orange, non-teammates in red, teammates in green.
for player_data in freeze_frame:
    color = "green"
    if player_data["position"]["name"] == "Goalkeeper":
        color = "orange"
    elif player_data["teammate"] is False:
        color = "red"
    ax.plot([player_data["location"][0]], [player_data["location"][1]], color=color, marker="o", ms=5)

# Zoom in on the plotted region; the y-limits are given high-to-low, which
# flips the vertical axis.
ax.set_xlim(80, 120)
ax.set_ylim(62, 18)

# Make the figure self-explanatory when the notebook is skimmed.
ax.set_xlabel("x coordinate")
ax.set_ylabel("y coordinate")
ax.set_title(
    f"Shot at iloc {i} (block score: {score:.3f})\n"
    "blue = shooter, orange = goalkeeper, red = opponent, green = teammate"
);

In [None]:
# Raw provider event for the shot currently being inspected (row i).
block_info["raw_event"].iloc[i]

## Unusual shots 
_(Note: minute markers give the number of full minutes already elapsed, which differs from how time is normally reported in soccer. Here, a block that occurred at 29 minutes and 30 seconds is listed in the 29th minute, whereas it would normally be recorded as occurring in the 30th minute.)_

### Blocked shots that shouldn't have been

* iloc 245/722: A slower shot from a highly oblique angle is blocked by a player who is not in a blocking position when the shot is made but is in a blocking position by the time the ball gets to the goal. (US versus Netherlands, 74th minute; South Korea vs Ghana, 75th minute)
* iloc 747: Similar to the above, this one is a chip around the goalkeeper that moves slowly enough for a defender to get back in time to clear (Ghana vs Uruguay, 22nd minute)

For these shots, there is probably nothing that can be improved based on these data. If player speed/direction data were available, it might be possible to skew the blockable-area calculation to account for that, but even then it would be very difficult to also account for whether or not the shot will be a slow one.

* iloc 9: A defender is very close to the shooter and is able to block the shot, even though the position data indicates that they are out of position (Australia vs France, 36th minute)
* iloc 682: There are several defenders right on the edge of the shot envelope, and the shot is from relatively far away. The shot ends up being basically right at one of the defenders who heads it down (Spain vs Japan, 79th minute)
* iloc 536: A shot from distance, likely trying to be curled into the top corner, is blocked by a player covering the right side of the shot paths. Basically the shot may not have been quite on goal when it was fired but would have potentially curled in (Wales vs England, 23rd minute)
* iloc 850: A shot from distance that was probably not going to be on target is blocked by a player not really standing in front of goal (Costa Rica vs Japan, 34th minute)
* iloc 737: Similar to the above (Wales vs US, 60th minute)

There may be ways to finesse the block score for these cases, e.g. raise the offset for players very close to the shooter, or bump up the block score when a shot is from far away and a defender is in front of the side of the net (i.e. covering an area that the long-range shooter must target in order to beat the goalkeeper from that distance). Another option is to extend the allowed shot angles when the shot is far from goal, to account for the ball being curled or the shot being off-target.

* iloc 149: Similar to 682 with a few potential defenders in the way. In this case, however, the shot is not so much blocked as it is deflected and actually ends up **in the goal**. 

This seems like a data error.

### Unblocked shots that should have been blocked
* iloc 33: Free kick, shot over the net (Brazil vs Cameroon, 33rd minute)
* iloc 765: Free kick, shot saved (Ghana vs Uruguay, 99th minute)

Need to make sure that the shot type (e.g. free kick) is taken into account.

* iloc 482: Long shot with a bunch of defenders in the way, substantially off-target (Canada vs Croatia, 76th minute)

Probably nothing that can be done here, the shot's likely to miss anyway

* iloc 373: Jumping header off a corner gets over a bunch of nearby defenders

May need to account for headed shots (body part == head), or use the 'aerial_won' parameters.