In [1]:
import pandas as pd
from datetime import timedelta
import numpy as np
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Read the ISL 21-22 event file straight from its public GitHub location
df = pd.read_csv('https://raw.githubusercontent.com/AsianFootballAnalysisZone/MainZone/main/ISL%2021-22.csv')

# Parse the clock strings into datetimes so gaps between events can be measured in seconds
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%H:%M:%S.%f")

# Flip the sign of every pass angle (the author's stated aim: make back passes easier to pick out later)
df['pass_angle'] = -df['pass_angle']

# StatsBomb marks pressure on / tight marking of a receiver with a Pressure event that sits
# between the Pass and the Ball Receipt. Copy the previous row's details onto each row so a
# receipt row carries the type, player, position and time of the event just before it.
previous_row_fields = {
    'last_event': 'type_name',
    'pressing_player': 'player_id',  # IDs rather than names, to avoid same-name clashes (e.g. Manvir Singh)
    'pressing_player_position': 'position_name',
    'last_timestamp': 'timestamp',
}
for new_column, source_column in previous_row_fields.items():
    df[new_column] = df[source_column].shift(1)

# The three events that follow a receipt contain everything this study needs. A "successful"
# receipt is often followed by a carry, and a player who keeps marking/pressing can still
# dispossess the receiver a few tenths of a second later. A typical run looks like
# pass-pressure-receipt-carry-dispossessed-duel; the 'dispossessed' or 'duel' is what we look for.
# The theory: a "successful" pressure holds the receiver up, and even a few seconds is beneficial.
#
# For each source column, copy the values found 1, 2 and 3 rows ahead. The resulting names are
# next_event / next_event_2 / next_event_3 plus a per-field suffix.
lookahead_fields = [
    ('type_name', ''),                   # event type
    ('player_id', '_player'),            # player performing the event
    ('timestamp', '_timestamp'),         # time of the event
    ('pass_angle', '_pass_angle'),       # (sign-flipped) pass angle
    ('outcome_name', '_outcome_name'),   # outcome of the event
]
lookahead_prefixes = {1: 'next_event', 2: 'next_event_2', 3: 'next_event_3'}
for source_column, suffix in lookahead_fields:
    for steps_ahead, prefix in lookahead_prefixes.items():
        df[prefix + suffix] = df[source_column].shift(-steps_ahead)


## The frame we analyse: every Ball Receipt whose immediately preceding event is a Pressure.
## This covers receipts where the receiver was being pressed/tightly marked (under_pressure == 1)
## and receipts where under_pressure is missing but a presser was still pressing.
## The two narrower variants are kept here for reference:
# rec_under_press_df = df[(df.under_pressure == 1) & (df.type_name == 'Ball Receipt') & (df.last_event == 'Pressure')].reset_index(drop=True)
# rec_under_press_df = df[(df.under_pressure.isna()) & (df.type_name == 'Ball Receipt') & (df.last_event == 'Pressure')].reset_index(drop=True)
rec_under_press_df = df[(df.type_name == 'Ball Receipt') & (df.last_event == 'Pressure')].reset_index(drop=True)

# Event types that indicate some sort of successful pressure / holding up of the receiver
possible_events = ['Dispossessed', 'Miscontrol', 'Block', 'Duel']

# Column prefixes of the three follow-up events and the matching "same player" flag columns
follow_up_prefixes = ['next_event', 'next_event_2', 'next_event_3']
same_player_flags = ['next_player_same_as_pressing', 'next_2_player_same_as_pressing', 'next_3_player_same_as_pressing']

# Code each follow-up event: 2 = Pass, 1 = one of possible_events, 0 = anything else.
# The comparison with 'Pass' is an exact match, so a missing event type (NaN) is coded 0
# instead of raising, and no other string can be mistaken for a pass.
for prefix in follow_up_prefixes:
    event_type = rec_under_press_df[prefix]
    rec_under_press_df[prefix + '_code'] = np.select(
        [event_type.eq('Pass'), event_type.isin(possible_events)],
        [2, 1],
        default=0,
    )

# Seconds from the pressure event (the row just before the receipt) to each follow-up event.
# A missing timestamp yields NaN.
# NOTE(review): the shifted columns are built over the whole file without grouping, so a
# follow-up event may belong to a later period or match and give a negative gap - confirm
# whether such rows should be excluded.
for prefix in follow_up_prefixes:
    elapsed = rec_under_press_df[prefix + '_timestamp'] - rec_under_press_df['last_timestamp']
    rec_under_press_df['timechange_' + prefix] = elapsed.dt.total_seconds()

# Flag (1/0) whether each follow-up event was performed by the pressing player.
# Missing player IDs never compare equal, so they are flagged 0.
for prefix, flag_column in zip(follow_up_prefixes, same_player_flags):
    performed_by_presser = rec_under_press_df['pressing_player'].eq(rec_under_press_df[prefix + '_player'])
    rec_under_press_df[flag_column] = performed_by_presser.astype(int)

def _pressure_counts(events, keys, count_name, ranked=False):
    """Count non-null 'id' values per group of `keys` and name the count column `count_name`.

    When `ranked` is true the rows are ordered from the largest count to the smallest;
    otherwise they stay in group-key order.
    """
    counts = events.groupby(keys)['id'].count().reset_index()
    if ranked:
        counts = counts.sort_values(by=['id'], ascending=False)
    return counts.rename(columns={'id': count_name})


# Split on whether the receipt has an outcome recorded. A receipt WITH an outcome is treated
# as an unsuccessful receipt, i.e. an initially successful pressure; the rest are receipts
# that were completed despite the pressure.
receipt_has_outcome = rec_under_press_df['outcome_name'].notna()
suc_initial_press_df = rec_under_press_df[receipt_has_outcome]
unsuc_initial_press_df = rec_under_press_df[~receipt_has_outcome]

# Pressures that failed initially but paid off within the next 3 events: at least one follow-up
# event is a "possible event" (code 1) AND at least one follow-up event is by the pressing player
disruptive_follow_up = unsuc_initial_press_df[['next_event_code', 'next_event_2_code', 'next_event_3_code']].eq(1).any(axis=1)
presser_follow_up = unsuc_initial_press_df[['next_player_same_as_pressing', 'next_2_player_same_as_pressing', 'next_3_player_same_as_pressing']].eq(1).any(axis=1)
suc_press_df = unsuc_initial_press_df[disruptive_follow_up & presser_follow_up]

# Keep only those where the third follow-up event happens within 2 seconds. This could be adjusted
# of course. Almost 90% of all 3-event pressures happen within 2 seconds though
suc_press_df = suc_press_df[suc_press_df['timechange_next_event_3'] <= 2] ### THIS IS WHERE TO CHANGE SECONDS

# Totals per pressing player, regardless of the position they were in at the time
player_key = ['pressing_player']
initial_press_players = _pressure_counts(suc_initial_press_df, player_key, 'successful_initial_pressure', ranked=True)
secondary_press_players = _pressure_counts(suc_press_df, player_key, 'successful_2second_pressure', ranked=True)
total_press_by_player = _pressure_counts(rec_under_press_df, player_key, 'total_pressures', ranked=True)

# The same totals split by position, so a striker is not compared with a CB for instance
player_position_key = ['pressing_player', 'pressing_player_position']
initial_press_players_position = _pressure_counts(suc_initial_press_df, player_position_key, 'successful_initial_pressure')
secondary_press_players_position = _pressure_counts(suc_press_df, player_position_key, 'successful_2second_pressure')
total_press_by_player_position = _pressure_counts(rec_under_press_df, player_position_key, 'total_pressures')

# Remove every pressure already counted as successful (initial or 2 second) from the main frame
rec_under_press_df = rec_under_press_df.drop(suc_initial_press_df.index).drop(suc_press_df.index)

# Time limit (seconds) for the receiver's pass; the same 2 second limit used for pressures above
PASS_WINDOW_SECONDS = 2

# Remaining receipts where at least one of the next 3 events is a pass (code 2)
pass_code_columns = ['next_event_code', 'next_event_2_code', 'next_event_3_code']
pass_df = rec_under_press_df[rec_under_press_df[pass_code_columns].eq(2).any(axis=1)].reset_index(drop=True)

# We only need the info for the first pass. In the case of multiple passes in the next 3 events,
# we can't use the second! Every row kept above has a pass in at least one slot, so start from the
# third slot's value and overwrite it with the second, then the first, wherever those are passes.
first_is_pass = pass_df['next_event_code'].eq(2)
second_is_pass = pass_df['next_event_2_code'].eq(2)


def _value_at_first_pass(frame, first_column, second_column, third_column):
    """Return, per row, the value from the earliest follow-up slot that is a pass."""
    return (
        frame[third_column]
        .mask(second_is_pass, frame[second_column])
        .mask(first_is_pass, frame[first_column])
    )


pass_df['next_pass_outcome_name'] = _value_at_first_pass(
    pass_df, 'next_event_outcome_name', 'next_event_2_outcome_name', 'next_event_3_outcome_name')
pass_df['next_pass_angle'] = _value_at_first_pass(
    pass_df, 'next_event_pass_angle', 'next_event_2_pass_angle', 'next_event_3_pass_angle')
pass_df['timechange_next_pass'] = _value_at_first_pass(
    pass_df, 'timechange_next_event', 'timechange_next_event_2', 'timechange_next_event_3')

# Keep only passes made within the time limit. Take a copy so the column update below
# writes to this frame rather than to a view of the unfiltered one.
pass_df = pass_df[pass_df['timechange_next_pass'] <= PASS_WINDOW_SECONDS].copy()

# A pass with no recorded outcome is labelled 'Successful'
pass_df['next_pass_outcome_name'] = pass_df['next_pass_outcome_name'].fillna('Successful')

# We will call a successful pressure event one that forces the opponent into an incomplete pass
# (outcome containing 'Incomplete' or 'Out') or a backward pass, as that stops progression too.
# NOTE(review): if pass_angle follows the StatsBomb convention (0 = straight ahead, +/-pi = straight
# back), the +/-pi/4 window below selects FORWARD passes, and flipping the sign earlier does not
# change that. Confirm the intended angle range before relying on this metric.
forced_bad_outcome = pass_df['next_pass_outcome_name'].str.contains('Incomplete|Out')
angle_in_window = pass_df['next_pass_angle'].between((-np.pi/4), np.pi/4)
suc_press_pass_df = pass_df[forced_bad_outcome | angle_in_window]

# Totals per pressing player, regardless of the position they were in at the time
pass_press_players = (
    suc_press_pass_df.groupby(['pressing_player'])['id'].count().reset_index()
    .sort_values(by=['id'], ascending=False)
    .rename(columns={'id': 'successful_pass_pressure'})
)

# The same totals split by position, so a striker is not compared with a CB for instance
pass_press_players_position = (
    suc_press_pass_df.groupby(['pressing_player', 'pressing_player_position'])['id'].count().reset_index()
    .sort_values(by=['id'], ascending=False)
    .rename(columns={'id': 'successful_pass_pressure'})
)



In [3]:
def _summarise_pressures(count_frames, keys):
    """Stack the per-group count frames, total them per `keys` and add success-rate columns.

    Each frame in `count_frames` carries the `keys` columns plus one count column. A count that is
    absent for a group contributes 0 to the sum. Rates are each success count divided by
    'total_pressures'; '%_successful_total' uses the sum of the three success counts.
    """
    count_columns = ['total_pressures', 'successful_initial_pressure', 'successful_2second_pressure', 'successful_pass_pressure']
    # pd.concat replaces the chained DataFrame.append calls, and the columns are selected with a
    # list: both append and tuple-style selection on a groupby are gone from pandas 2.x
    summary = pd.concat(count_frames).groupby(keys)[count_columns].sum().reset_index()
    total = summary['total_pressures']
    summary['%_successful_initial'] = summary['successful_initial_pressure'] / total
    summary['%_successful_2second'] = summary['successful_2second_pressure'] / total
    summary['%_successful_pass'] = summary['successful_pass_pressure'] / total
    summary['%_successful_total'] = (
        summary['successful_initial_pressure'] + summary['successful_2second_pressure'] + summary['successful_pass_pressure']
    ) / total
    return summary


def _player_labels(events):
    """Map each integer player ID to 'player_name - team_name' taken from that player's first row."""
    first_rows = events.dropna(subset=['player_id']).drop_duplicates(subset='player_id')
    labels = first_rows['player_name'] + ' - ' + first_rows['team_name']
    return pd.Series(labels.to_numpy(), index=first_rows['player_id'].astype(int).to_numpy())


# One lookup table for both summaries instead of filtering the whole event frame per row.
# The label uses the first row found for the player, i.e. in case of a player moving during the
# season, the team on their first event in the file.
player_labels = _player_labels(df)

# Calculate all the pressures & success rates by player
pressures_analysis_df = _summarise_pressures(
    [total_press_by_player, secondary_press_players, initial_press_players, pass_press_players],
    ['pressing_player'],
)

# Replace player ID with name - team
pressures_analysis_df['pressing_player'] = pressures_analysis_df['pressing_player'].astype(int).map(player_labels)

# Download the csv in case you want to play with it outside python
pressures_analysis_df.to_csv('ISL 21-22 Pressing Receiver.csv', encoding='utf-8-sig')

# Calculate all the pressures & success rates by player & position
pressures_analysis_position_df = _summarise_pressures(
    [total_press_by_player_position, secondary_press_players_position, initial_press_players_position, pass_press_players_position],
    ['pressing_player', 'pressing_player_position'],
)

# Replace player ID with name - team
pressures_analysis_position_df['pressing_player'] = pressures_analysis_position_df['pressing_player'].astype(int).map(player_labels)

# Download the csv in case you want to play with it outside python
pressures_analysis_position_df.to_csv('ISL 21-22 Pressing Receiver Positions.csv', encoding='utf-8-sig')



In [4]:
# View the top 10 players with at least the median number of pressures, sorted by total success rate

# Compute the median from the data instead of hard-coding it (it was 28 when this was first run),
# so the cut-off stays correct if the input file changes
median_total_pressures = pressures_analysis_df['total_pressures'].median()

pressures_analysis_df[pressures_analysis_df['total_pressures'] >= median_total_pressures].sort_values(by=['%_successful_total'], ascending=False).head(10)


Unnamed: 0,pressing_player,total_pressures,successful_initial_pressure,successful_2second_pressure,successful_pass_pressure,%_successful_initial,%_successful_2second,%_successful_pass,%_successful_total
65,Bipin Singh Thounajam - Mumbai City,49.0,18.0,6.0,15.0,0.367347,0.122449,0.306122,0.795918
45,Manvir Singh - ATK Mohun Bagan,40.0,19.0,4.0,8.0,0.475,0.1,0.2,0.775
25,Jorge Rolando Pereyra Díaz - Kerala Blasters,44.0,15.0,5.0,14.0,0.340909,0.113636,0.318182,0.772727
102,Devendra Dhaku Murgaokar - Goa,41.0,19.0,5.0,7.0,0.463415,0.121951,0.170732,0.756098
128,Thongkhosiem Haokip - East Bengal,35.0,14.0,1.0,10.0,0.4,0.028571,0.285714,0.714286
125,Antonio Perošević - East Bengal,37.0,21.0,2.0,3.0,0.567568,0.054054,0.081081,0.702703
14,Álvaro Vázquez García - Kerala Blasters,36.0,12.0,2.0,10.0,0.333333,0.055556,0.277778,0.666667
160,Suhair Vadakkepeedika - NorthEast United,107.0,34.0,7.0,27.0,0.317757,0.065421,0.252336,0.635514
77,Mirlan Murzaev - Chennaiyin,38.0,5.0,4.0,15.0,0.131579,0.105263,0.394737,0.631579
97,Seriton Fernandes - Goa,38.0,11.0,2.0,11.0,0.289474,0.052632,0.289474,0.631579


In [5]:
### Now let's get into some positional breakdowns

# Show all StatsBomb positions. Rows without a position (non-player events like tactical changes)
# are NaN; drop them explicitly rather than assuming NaN is the first unique value
sorted(df.position_name.dropna().unique().tolist())

['Center Attacking Midfield',
 'Center Back',
 'Center Defensive Midfield',
 'Center Forward',
 'Goalkeeper',
 'Left Attacking Midfield',
 'Left Back',
 'Left Center Back',
 'Left Center Forward',
 'Left Center Midfield',
 'Left Defensive Midfield',
 'Left Midfield',
 'Left Wing',
 'Left Wing Back',
 'Right Attacking Midfield',
 'Right Back',
 'Right Center Back',
 'Right Center Forward',
 'Right Center Midfield',
 'Right Defensive Midfield',
 'Right Midfield',
 'Right Wing',
 'Right Wing Back']

In [6]:
### Center Back Analysis

# Keep every player/position row whose position name contains 'Center Back'
# (this also matches 'Left Center Back' and 'Right Center Back')
cb_rows = pressures_analysis_position_df[pressures_analysis_position_df['pressing_player_position'].str.contains('Center Back')].copy()

# Re-total per player across those positions. The columns are selected with a list:
# tuple-style selection on a groupby is rejected by pandas 2.x
cb_count_columns = ['total_pressures', 'successful_initial_pressure', 'successful_2second_pressure', 'successful_pass_pressure']
cb_analysis = cb_rows.groupby(['pressing_player'])[cb_count_columns].sum().reset_index()
cb_analysis['%_successful_initial'] = cb_analysis.successful_initial_pressure / cb_analysis.total_pressures
cb_analysis['%_successful_2second'] = cb_analysis.successful_2second_pressure / cb_analysis.total_pressures
cb_analysis['%_successful_pass'] = cb_analysis.successful_pass_pressure / cb_analysis.total_pressures
cb_analysis['%_successful_total'] = (cb_analysis.successful_initial_pressure + cb_analysis.successful_2second_pressure + cb_analysis.successful_pass_pressure) / cb_analysis.total_pressures

# Filter to just show the players at or above the median number of pressures
cb_median_pressures = cb_analysis['total_pressures'].median()
cb_analysis[cb_analysis['total_pressures'] >= cb_median_pressures].sort_values(by=['%_successful_total'], ascending=False).head()



Unnamed: 0,pressing_player,total_pressures,successful_initial_pressure,successful_2second_pressure,successful_pass_pressure,%_successful_initial,%_successful_2second,%_successful_pass,%_successful_total
20,Hernán Daniel Santana Trujillo - NorthEast United,20.0,1.0,6.0,4.0,0.05,0.3,0.2,0.55
7,Anwar Ali - Goa,32.0,1.0,7.0,9.0,0.03125,0.21875,0.28125,0.53125
25,José Luis Espinosa Arroyo - ATK Mohun Bagan,42.0,3.0,10.0,9.0,0.071429,0.238095,0.214286,0.52381
36,Mehtab Singh - Mumbai City,31.0,0.0,7.0,9.0,0.0,0.225806,0.290323,0.516129
28,Juan Antonio González Fernández - Hyderabad,37.0,1.0,7.0,9.0,0.027027,0.189189,0.243243,0.459459


In [7]:
### Fullback Analysis

# Keep every player/position row whose position name is a full-back or wing-back slot
fb_rows = pressures_analysis_position_df[pressures_analysis_position_df['pressing_player_position'].str.contains('Right Back|Left Back|Right Wing Back|Left Wing Back')].copy()

# Re-total per player across those positions. The columns are selected with a list:
# tuple-style selection on a groupby is rejected by pandas 2.x
fb_count_columns = ['total_pressures', 'successful_initial_pressure', 'successful_2second_pressure', 'successful_pass_pressure']
fb_analysis = fb_rows.groupby(['pressing_player'])[fb_count_columns].sum().reset_index()
fb_analysis['%_successful_initial'] = fb_analysis.successful_initial_pressure / fb_analysis.total_pressures
fb_analysis['%_successful_2second'] = fb_analysis.successful_2second_pressure / fb_analysis.total_pressures
fb_analysis['%_successful_pass'] = fb_analysis.successful_pass_pressure / fb_analysis.total_pressures
fb_analysis['%_successful_total'] = (fb_analysis.successful_initial_pressure + fb_analysis.successful_2second_pressure + fb_analysis.successful_pass_pressure) / fb_analysis.total_pressures

# Filter to just show the players at or above the median number of pressures
fb_median_pressures = fb_analysis['total_pressures'].median()
fb_analysis[fb_analysis['total_pressures'] >= fb_median_pressures].sort_values(by=['%_successful_total'], ascending=False).head()



Unnamed: 0,pressing_player,total_pressures,successful_initial_pressure,successful_2second_pressure,successful_pass_pressure,%_successful_initial,%_successful_2second,%_successful_pass,%_successful_total
65,Seriton Fernandes - Goa,35.0,9.0,2.0,11.0,0.257143,0.057143,0.314286,0.628571
67,Soraisham Sandeep Singh - Kerala Blasters,26.0,5.0,5.0,3.0,0.192308,0.192308,0.115385,0.5
10,Asish Rai - Hyderabad,62.0,8.0,13.0,10.0,0.129032,0.209677,0.16129,0.5
16,Harmanjot Singh Khabra - Kerala Blasters,34.0,6.0,7.0,4.0,0.176471,0.205882,0.117647,0.5
20,Jerry Lalrinzuala - Chennaiyin,63.0,10.0,8.0,12.0,0.15873,0.126984,0.190476,0.47619


In [8]:
### Central/Defensive Midfielder Analysis

# Keep every player/position row whose position name contains 'Center Midfield' or 'Defensive Midfield'
mid_rows = pressures_analysis_position_df[pressures_analysis_position_df['pressing_player_position'].str.contains('Center Midfield|Defensive Midfield')].copy()

# Re-total per player across those positions. The columns are selected with a list:
# tuple-style selection on a groupby is rejected by pandas 2.x
mid_count_columns = ['total_pressures', 'successful_initial_pressure', 'successful_2second_pressure', 'successful_pass_pressure']
mid_analysis = mid_rows.groupby(['pressing_player'])[mid_count_columns].sum().reset_index()
mid_analysis['%_successful_initial'] = mid_analysis.successful_initial_pressure / mid_analysis.total_pressures
mid_analysis['%_successful_2second'] = mid_analysis.successful_2second_pressure / mid_analysis.total_pressures
mid_analysis['%_successful_pass'] = mid_analysis.successful_pass_pressure / mid_analysis.total_pressures
mid_analysis['%_successful_total'] = (mid_analysis.successful_initial_pressure + mid_analysis.successful_2second_pressure + mid_analysis.successful_pass_pressure) / mid_analysis.total_pressures

# Filter to just show the players at or above the median number of pressures
mid_median_pressures = mid_analysis['total_pressures'].median()
mid_analysis[mid_analysis['total_pressures'] >= mid_median_pressures].sort_values(by=['%_successful_total'], ascending=False).head()



Unnamed: 0,pressing_player,total_pressures,successful_initial_pressure,successful_2second_pressure,successful_pass_pressure,%_successful_initial,%_successful_2second,%_successful_pass,%_successful_total
36,Iman Basafa - Bengaluru,13.0,6.0,1.0,2.0,0.461538,0.076923,0.153846,0.692308
20,Danish Farooq Bhat - Bengaluru,35.0,3.0,4.0,13.0,0.085714,0.114286,0.371429,0.571429
91,Suresh Singh Wangjam - Bengaluru,40.0,6.0,4.0,12.0,0.15,0.1,0.3,0.55
26,Edwin Sydney Vanspaul - Chennaiyin,31.0,5.0,3.0,9.0,0.16129,0.096774,0.290323,0.548387
69,Mohammed Irshad - NorthEast United,34.0,6.0,1.0,11.0,0.176471,0.029412,0.323529,0.529412
