In [1]:
%matplotlib inline


# Radar plots

In this lesson, we go step-by-step through the process of making player radars
for a striker. We calculate the following metrics directly from
a count of actions in the Wyscout event data,

- Non-penalty goals
- Assists
- Key passes
- Smart passes
- Aerial duels won
- Ground attacking duels won

We add to these our own calculations of

- non-penalty expected goals
- passes ending in final third
- receptions in final third


In [2]:
import pandas as pd
import numpy as np
import json
# plotting
import matplotlib.pyplot as plt
# statistical fitting of models
import statsmodels.api as sm
import statsmodels.formula.api as smf
#opening data
import os
import pathlib
import warnings 
#used for plots
from scipy import stats
from mplsoccer import PyPizza, FontManager

# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None
# Hide every Python warning, regardless of category or origin.
# NOTE(review): this also hides deprecation and numerical warnings raised by later cells.
warnings.filterwarnings('ignore')

In [3]:
# Size of the target coordinate system that the event positions are rescaled to.
# NOTE(review): presumably metres (105 x 68 pitch, 7.32 goal) — confirm.
PITCH_MAX_X = 105
PITCH_MAX_Y = 68
# Not referenced anywhere in this part of the notebook.
GOAL_WIDTH = 7.32

# Upper bound of the raw Wyscout coordinates on each axis; used as the divisor
# when the scale factors TRANSFORM_X / TRANSFORM_Y are derived.
WYSCOUT_PITCH_MAX_X = 100
WYSCOUT_PITCH_MAX_Y = 100
# Box limits expressed in raw Wyscout coordinates.
# NOTE(review): presumably the edges of the attacking penalty area — confirm.
WYSCOUT_BOX_X = 84
WYSCOUT_BOX_MIN_Y = 19
WYSCOUT_BOX_MAX_Y = 81

In [4]:
# Scale factors that convert a raw Wyscout coordinate into the target pitch
# coordinate system (target extent divided by Wyscout extent, per axis).
TRANSFORM_X = PITCH_MAX_X / WYSCOUT_PITCH_MAX_X
TRANSFORM_Y = PITCH_MAX_Y / WYSCOUT_PITCH_MAX_Y

# Box limits rescaled with the same factors. Only a plain scaling is applied
# here (no axis flip such as the one performed by transform_y).
BOX_X = WYSCOUT_BOX_X * TRANSFORM_X
BOX_MIN_Y = WYSCOUT_BOX_MIN_Y * TRANSFORM_Y
BOX_MAX_Y = WYSCOUT_BOX_MAX_Y * TRANSFORM_Y

## Opening data
For this task we will use Wyscout data. We open it and save it in the dataframe
*events_df*. To avoid potential errors, we keep
only the events for which both the beginning and the end of an action were registered.

In [5]:
# Wyscout data folder: "<second parent of the resolved working directory>/data/Wyscout".
# The location therefore depends on where the notebook kernel was started.
_project_root = pathlib.Path().resolve().parents[1]
DATA_PATH = str(_project_root / 'data' / 'Wyscout')

In [6]:
# Competition to analyse; the same value selects every league-specific file name.
league = 'England'
events_file_name = f'events_{league}.json'
events_path = os.path.join(DATA_PATH, 'events', events_file_name)

# The file is decoded with the 'unicode-escape' codec.
# NOTE(review): presumably needed because names are stored as escaped unicode — confirm.
events_df = pd.read_json(events_path, encoding='unicode-escape')

# potential data collection error handling: keep only the events that carry
# exactly two positions (start and end of the action). The check runs on the
# 'positions' column alone instead of building a Series for every row.
has_start_and_end = events_df['positions'].apply(len) == 2
events_df = events_df.loc[has_start_and_end]

In [7]:
def transform_x(row, i):
    """Rescale the x coordinate of one recorded position.

    Parameters
    ----------
    row : sequence of dict
        Positions of a single event; every entry must provide an ``'x'`` key.
    i : int
        Index of the position to read (the notebook uses 0 and 1).

    Returns
    -------
    ``row[i]['x']`` multiplied by the module-level ``TRANSFORM_X`` factor.
    """
    position = row[i]
    return position['x'] * TRANSFORM_X

def transform_y(row, i):
    """Flip and rescale the y coordinate of one recorded position.

    Parameters
    ----------
    row : sequence of dict
        Positions of a single event; every entry must provide a ``'y'`` key.
    i : int
        Index of the position to read (the notebook uses 0 and 1).

    Returns
    -------
    ``WYSCOUT_PITCH_MAX_Y - row[i]['y']`` (the axis is mirrored) multiplied by
    the module-level ``TRANSFORM_Y`` factor.
    """
    mirrored_y = WYSCOUT_PITCH_MAX_Y - row[i]['y']
    return mirrored_y * TRANSFORM_Y

# Derive rescaled coordinates from the raw 'positions' column:
# positions[0] feeds x / y and positions[1] feeds end_x / end_y.
# Each entry is (new column, transform function, index into positions).
coordinate_columns = (
    ("x", transform_x, 0),
    ("y", transform_y, 0),
    ("end_x", transform_x, 1),
    ("end_y", transform_y, 1),
)
for column_name, transform, position_index in coordinate_columns:
    events_df[column_name] = events_df.positions.apply(transform, args=(position_index,))
# Display the enriched frame as the cell output.
events_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,x,y,end_x,end_y
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,51.45,34.68,32.55,14.96
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172,32.55,14.96,53.55,17.00
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,53.55,17.00,36.75,19.72
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,36.75,19.72,43.05,3.40
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,43.05,3.40,75.60,8.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643145,5,Ball out of the field,[],0,"[{'y': 32, 'x': 0}, {'y': 100, 'x': 100}]",2500098,Interruption,1623,2H,2796.732525,50,251596409,0.00,46.24,105.00,0.00
643146,3,Corner,"[{'id': 302}, {'id': 801}, {'id': 1801}]",70965,"[{'y': 100, 'x': 100}, {'y': 47, 'x': 88}]",2500098,Free Kick,1633,2H,2829.821084,30,251596232,105.00,0.00,92.40,36.04
643147,1,Air duel,"[{'id': 701}, {'id': 1802}]",7919,"[{'y': 53, 'x': 12}, {'y': 50, 'x': 14}]",2500098,Duel,1623,2H,2831.211419,10,251596410,12.60,31.96,14.70,34.00
643148,1,Air duel,"[{'id': 703}, {'id': 1801}]",8005,"[{'y': 47, 'x': 88}, {'y': 50, 'x': 86}]",2500098,Duel,1633,2H,2832.434399,10,251596234,92.40,36.04,90.30,34.00


## Minutes played
All data on our plot will be per 90 minutes played. Therefore, we need information on the number of minutes played
throughout the season. To do so, we will use a prepared file that is based on an idea developed by students
taking part in the course in 2021. Files with minutes per game for players in the top 5 leagues can be found
[here](https://github.com/soccermatics/Soccermatics/tree/main/course/lessons/minutes_played). After downloading the data and saving
it in our directory, we open it and store it in a dataframe. Then we calculate the sum of minutes played in a season for each player.



In [8]:
# Minutes-per-game file of the selected league, located in DATA_PATH/minutes_played.
minutes_file_name = f'minutes_played_per_game_{league}.json'
minutes_path = os.path.join(DATA_PATH, 'minutes_played', minutes_file_name)
# Decoded with the same 'unicode-escape' codec as the events file.
minutes_per_game = pd.read_json(minutes_path, encoding='unicode-escape')
# Preview the first rows as the cell output.
minutes_per_game.head()

Unnamed: 0,playerId,shortName,matchId,teamId,teamName,player_in_min,player_out_min,minutesPlayed,red_card
0,9206,C. Wood,2500089,1646,Burnley FC,0,61,61,0
1,9127,S. Vokes,2500089,1646,Burnley FC,61,95,34,0
2,93,J. Guðmunds­son,2500089,1646,Burnley FC,0,80,80,0
3,9179,N. Wells,2500089,1646,Burnley FC,80,95,15,0
4,10108,K. Long,2500089,1646,Burnley FC,0,95,95,0


## Calculating possession
As the next step we would like to adjust our plot by the player's team ball possession while they 
were on the pitch. To do so, for each row of our dataframe with minutes per player per game, 
we take all the events that occurred in that game while the player was on the pitch.

We will also use duels, but
we don't include lost air duels and lost ground defending duels (the code below also drops lost ground loose ball duels). Why? Possession is calculated as the number of touches by a team divided
by the number of all touches. If a player lost a ground defending duel, it means that he may have been dribbled past, so he did not
touch the ball. If he lost an air duel, he lost a header. Therefore, we claim that those were mostly events where the player may not have
touched the ball (or, if he did, the team did not take control of it). We sum 
the team's passes and these duels, as well as all passes and these duels by both teams, in this period. We store these values in a 
dictionary. Then, summing them for each player separately and calculating their ratio, we get 
the possession of the ball by the player's team while he was on the pitch. As the last step we merge it with our summary dataframe.

In [9]:
def _touch_events_by_match(events):
    """Prepare, once per match, the events that count as ball touches.

    Parameters
    ----------
    events : pandas.DataFrame
        Event data with the columns ``matchId``, ``matchPeriod``, ``eventSec``,
        ``eventName``, ``subEventName``, ``tags`` and ``teamId``.

    Returns
    -------
    dict
        ``{matchId: DataFrame}`` where each frame has the columns ``eventSec``
        and ``teamId`` and contains only "Pass" and "Duel" events, with

        * ``eventSec`` of "2H" events shifted by the ``eventSec`` of the last
          "1H" event of the match, so both halves share one running clock, and
        * air, ground defending and ground loose ball duels tagged
          ``{'id': 701}`` removed (lost duels, treated as "no touch").
    """
    needed_columns = ["matchPeriod", "eventSec", "eventName", "subEventName", "tags", "teamId"]
    no_touch_duels = ["Air duel", "Ground defending duel", "Ground loose ball duel"]
    lost_tag = {'id': 701}

    touches_by_match = {}
    for match_id, match_events in events.groupby("matchId"):
        # work on a copy so the clock shift never leaks into the caller's frame
        match_df = match_events[needed_columns].copy()
        first_half = match_df["matchPeriod"] == "1H"
        second_half = match_df["matchPeriod"] == "2H"

        # add to 2H the last sec value of 1H; without any 1H event there is
        # nothing to shift by, so the 2H clock is left untouched
        half_time_sec = match_df.loc[first_half, "eventSec"].iloc[-1] if first_half.any() else 0
        match_df.loc[second_half, "eventSec"] = match_df.loc[second_half, "eventSec"] + half_time_sec

        # passes and duels are the candidate touches ...
        touches = match_df.loc[match_df["eventName"].isin(["Pass", "Duel"])]
        # ... minus the lost duels in which the player presumably did not touch the ball
        lost = touches["tags"].apply(lambda tags: lost_tag in tags).astype(bool)
        no_contact = touches["subEventName"].isin(no_touch_duels) & lost
        touches_by_match[match_id] = touches.loc[~no_contact, ["eventSec", "teamId"]]
    return touches_by_match


# Everything that depends only on the match is computed once here instead of
# once per player appearance inside the loop below.
touches_by_match = _touch_events_by_match(events_df)

# {(player id, team id): {'team_passes': touches by the player's team,
#                         'all_passes': touches by both teams}},
# both counted only while the player was on the pitch
possession_dict = {}
# for every row in the dataframe
for i, row in minutes_per_game.iterrows():
    # take player id, team id and match id
    player_id, team_id, match_id = row["playerId"], row["teamId"], row["matchId"]

    # create a key in the dictionary if the player is encountered for the first time
    player_key = (player_id, team_id)
    if player_key not in possession_dict:
        possession_dict[player_key] = {'team_passes': 0, 'all_passes': 0}

    # a match without any event contributes nothing to either counter
    touches = touches_by_match.get(match_id)
    if touches is None:
        continue

    # minute in and minute out of the player, converted to seconds
    min_in = row["player_in_min"] * 60
    min_out = row["player_out_min"] * 60

    # touches made while the player was on the pitch (an empty selection simply counts as 0)
    on_pitch = (touches["eventSec"] > min_in) & (touches["eventSec"] <= min_out)
    by_team = on_pitch & (touches["teamId"] == team_id)

    # int() keeps the counters plain Python integers
    possession_dict[player_key]["team_passes"] += int(by_team.sum())
    possession_dict[player_key]["all_passes"] += int(on_pitch.sum())

possession_dict

{(9206, 1646): {'team_passes': 8653, 'all_passes': 19358},
 (9127, 1646): {'team_passes': 5097, 'all_passes': 11610},
 (93, 1646): {'team_passes': 14957, 'all_passes': 34126},
 (9179, 1646): {'team_passes': 378, 'all_passes': 802},
 (10108, 1646): {'team_passes': 7420, 'all_passes': 17123},
 (8433, 1646): {'team_passes': 13085, 'all_passes': 29551},
 (8125, 1646): {'team_passes': 18037, 'all_passes': 41079},
 (9433, 1646): {'team_passes': 14990, 'all_passes': 33575},
 (8980, 1646): {'team_passes': 12531, 'all_passes': 28448},
 (8643, 1646): {'team_passes': 12072, 'all_passes': 27557},
 (12242, 1646): {'team_passes': 16477, 'all_passes': 37717},
 (8925, 1646): {'team_passes': 6097, 'all_passes': 13405},
 (8284, 1646): {'team_passes': 5989, 'all_passes': 13253},
 (532949, 1646): {'team_passes': 2, 'all_passes': 2},
 (259531, 1659): {'team_passes': 361, 'all_passes': 717},
 (7989, 1659): {'team_passes': 11513, 'all_passes': 25268},
 (245813, 1659): {'team_passes': 3784, 'all_passes': 7733

In [12]:
# Create a list of dictionaries to store the data: one record per (player, team)
# with the share of touches made by the player's team while he was on the pitch.
# The value is a ratio between 0 and 1, and 0 when no touch was counted at all.
data_list = [
    {
        "playerId": player_id,
        "teamId": team_id,
        "possession": counts["team_passes"] / counts["all_passes"] if counts["all_passes"] > 0 else 0,
    }
    for (player_id, team_id), counts in possession_dict.items()
]

# Create a DataFrame from the list of dictionaries; the columns are named
# explicitly so that an empty input still yields the expected schema
percentage_df = pd.DataFrame(data_list, columns=["playerId", "teamId", "possession"])

# to_json does not create missing folders, so make sure the target one exists
possession_dir = os.path.join(DATA_PATH, 'player_possession')
os.makedirs(possession_dir, exist_ok=True)
percentage_df.to_json(os.path.join(possession_dir, f'player_possession_{league}.json'))
percentage_df.head()

Unnamed: 0,playerId,teamId,possession
0,9206,1646,0.446999
1,9127,1646,0.439018
2,93,1646,0.438288
3,9179,1646,0.471322
4,10108,1646,0.433335
