In [10]:
import json
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [9]:
def extract_data(text):
    """
    Extracts the first JSON payload embedded as ``('...')`` in HTML.

    The payload is the text between the first ``('`` and the first ``')``
    that follows it. Backslash escape sequences inside the payload are
    decoded before the result is parsed as JSON.

    Args:
      text: The HTML content as a string.

    Returns:
      A Python object (list or dictionary) representing the extracted JSON data,
      or an empty dictionary if the markers are missing or the payload cannot
      be decoded or parsed.
    """
    if not text:
        return {}

    open_marker = "('"
    close_marker = "')"

    # A missing opening marker must not fall through to slicing: find() returns
    # -1, which would otherwise turn into a bogus start offset.
    marker_index = text.find(open_marker)
    if marker_index == -1:
        return {}
    start_index = marker_index + len(open_marker)

    # Look for the closing marker only after the opening one, so an earlier
    # "')" elsewhere in the page cannot produce an empty slice.
    end_index = text.find(close_marker, start_index)
    if end_index == -1:
        return {}

    json_data = text[start_index:end_index]

    try:
        # Encoding as latin-1 with backslashreplace keeps non-ASCII characters
        # intact through the unicode_escape round trip (a UTF-8 encode here
        # would be re-read byte by byte and produce mojibake).
        json_data = json_data.encode('latin-1', 'backslashreplace').decode('unicode_escape')
        data = json.loads(json_data)
    except (UnicodeDecodeError, json.JSONDecodeError):
        # Malformed escapes or invalid JSON both count as "no data found".
        data = {}
    return data

# Specific settings for EPL 2024/2025 season
years = [2024]  # Season start years; 2024 means the 2024/2025 season
leagues = ["EPL"]  # Only the English Premier League

REQUEST_TIMEOUT_SECONDS = 30  # Fail instead of hanging forever on a stalled connection
REQUEST_DELAY_SECONDS = 2     # Pause after every request to respect understat's server load

# Keys copied from each shot record, in output-column order. The season label
# is inserted between the two groups (see shots_columns below).
shot_keys_before_season = ("id", "minute", "result", "X", "Y", "xG", "player", "h_a",
                           "player_id", "situation")
shot_keys_after_season = ("shotType", "match_id", "h_team", "a_team", "h_goals", "a_goals",
                          "date", "player_assisted", "lastAction")


def fetch_embedded_json(url):
    """Download ``url`` and return the JSON payload embedded in the page.

    Args:
      url: Address of the page to download.

    Returns:
      Whatever ``extract_data`` returns for the page body ({} when nothing
      could be extracted).

    Raises:
      requests.RequestException: On a network error, a timeout, or an HTTP
        error status.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        return extract_data(response.text)
    finally:
        # Sleep after every request, successful or not, and only when a
        # request was actually made.
        time.sleep(REQUEST_DELAY_SECONDS)


def fixture_has_been_played(match):
    """Return True when the fixture carries a forecast value.

    A non-None 'w', 'd' or 'l' forecast is used as the signal that the fixture
    has happened (and therefore has shot data).
    """
    forecast = match.get('forecast') or {}
    return any(forecast.get(outcome) is not None for outcome in ('w', 'd', 'l'))


shots_data = []
failed_match_ids = []  # Played matches for which no shot data could be retrieved

# Iterate for EPL 24/25 season
for league_index, league in enumerate(leagues):
    for year_index, year in enumerate(years):
        season = str(year) + "-" + str(year + 1)

        # A failure here is not caught: without the fixture list there is
        # nothing to scrape for this league/season.
        data = fetch_embedded_json(f"https://understat.com/league/{league}/{year}")
        matches = data if isinstance(data, list) else []
        total_matches = len(matches)

        for match_index, match in enumerate(matches):
            if not fixture_has_been_played(match):
                continue

            match_id = match['id']
            try:
                shot_data_raw = fetch_embedded_json(f"https://understat.com/match/{match_id}")
            except requests.RequestException:
                shot_data_raw = {}

            if isinstance(shot_data_raw, dict) and shot_data_raw:
                # Process shot data for both teams.
                for team in ['h', 'a']:
                    for shot in shot_data_raw.get(team, []):
                        shots_data.append(
                            [shot[key] for key in shot_keys_before_season]
                            + [season]
                            + [shot[key] for key in shot_keys_after_season]
                        )
            else:
                # Remember the gap instead of silently dropping the match.
                failed_match_ids.append(match_id)

            # Print progress on the same line
            print(f"League: {league_index+1}/{len(leagues)}, Year: {year_index+1}/{len(years)}, Match: {match_index+1}/{total_matches}  ", end="\r")
            sys.stdout.flush()

# Create Pandas DataFrame for shots data
shots_columns = ["shot_id", "minute", "result", "X", "Y", "understat_xG", "player", "h_a", "player_id", "situation",
                 "season", "shot_type", "match_id", "home_team", "away_team", "home_goals", "away_goals", "date",
                 "assisting_player", "last_action"]

df = pd.DataFrame(shots_data, columns=shots_columns)

print(" " * 50, end='\r') # Clear the progress line
print("Scraping complete!")
if failed_match_ids:
    print(f"No shot data retrieved for {len(failed_match_ids)} match(es): {failed_match_ids}")
display(df)

Scraping complete!                                


Unnamed: 0,shot_id,minute,result,X,Y,understat_xG,player,h_a,player_id,situation,season,shot_type,match_id,home_team,away_team,home_goals,away_goals,date,assisting_player,last_action
0,584630,19,MissedShots,0.970999984741211,0.6730000305175782,0.050078392028808594,Bruno Fernandes,h,1228,OpenPlay,2024-2025,LeftFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Lisandro Martínez,Chipped
1,584631,26,MissedShots,0.7469999694824219,0.47900001525878905,0.020807035267353058,Casemiro,h,2248,OpenPlay,2024-2025,RightFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Mason Mount,Pass
2,584632,28,SavedShot,0.9130000305175782,0.34599998474121096,0.25769174098968506,Bruno Fernandes,h,1228,OpenPlay,2024-2025,RightFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Casemiro,Pass
3,584633,33,SavedShot,0.9130000305175782,0.46299999237060546,0.4701629877090454,Bruno Fernandes,h,1228,OpenPlay,2024-2025,RightFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Casemiro,Pass
4,584634,36,MissedShots,0.9580000305175781,0.5990000152587891,0.01805894263088703,Casemiro,h,2248,FromCorner,2024-2025,Head,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Amad Diallo Traore,Aerial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4514,608819,75,MissedShots,0.8069999694824219,0.49,0.2599138617515564,Luis Díaz,a,10408,OpenPlay,2024-2025,RightFoot,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Mohamed Salah,BallRecovery
4515,608820,79,MissedShots,0.9019999694824219,0.5690000152587891,0.07474000751972198,Dominik Szoboszlai,a,9788,OpenPlay,2024-2025,Head,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Trent Alexander-Arnold,Cross
4516,608822,83,SavedShot,0.8119999694824219,0.669000015258789,0.04806426912546158,Luis Díaz,a,10408,OpenPlay,2024-2025,RightFoot,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Diogo Jota,Pass
4517,608823,83,MissedShots,0.9069999694824219,0.515,0.06379381567239761,Diogo Jota,a,6854,OpenPlay,2024-2025,Head,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Trent Alexander-Arnold,Chipped


In [18]:
# Pitch and penalty-area dimensions in metres (a standard FIFA pitch is assumed).
pitch_length = 105
pitch_width = 68
penalty_area_length = 16.5  # how far the box extends from the goal line
penalty_area_width = 40.32  # extent of the box across the pitch

# Coordinates are handled as fractions of the pitch: x == 1 is the opponent's
# byline, and the box is centred on y == 0.5.
penalty_area_start_x_normalized = 1 - (penalty_area_length / pitch_length)

half_penalty_width = penalty_area_width / 2
half_penalty_width_normalized = half_penalty_width / pitch_width
penalty_area_lower_y_normalized = 0.5 - half_penalty_width_normalized
penalty_area_upper_y_normalized = 0.5 + half_penalty_width_normalized


def is_inside_penalty_area(x, y):
    """Tell whether a normalized (x, y) position lies inside the penalty area.

    Args:
      x: Position along the pitch length as a fraction, 1.0 being the
        opponent's byline.
      y: Position across the pitch width as a fraction.

    Returns:
      True if the point is within the box (bounds inclusive), False otherwise.
    """
    deep_enough = penalty_area_start_x_normalized <= x <= 1.0
    if not deep_enough:
        return deep_enough
    return penalty_area_lower_y_normalized <= y <= penalty_area_upper_y_normalized

# Convert 'X' and 'Y' columns to numeric (float) so they can be compared
# against the normalized penalty-area bounds.
df['X'] = pd.to_numeric(df['X'])
df['Y'] = pd.to_numeric(df['Y'])

# Classify every shot from its (X, Y) pair. Pairing the two columns directly
# avoids building a Series per row, and the explicit bool dtype keeps the
# column usable as a mask even when df has no rows (row-wise apply hands back
# a DataFrame in that case, which cannot be assigned to a single column).
df['inside_box'] = pd.Series(
    [is_inside_penalty_area(x, y) for x, y in zip(df['X'], df['Y'])],
    index=df.index,
    dtype=bool,
)

display(df)

Unnamed: 0,shot_id,minute,result,X,Y,understat_xG,player,h_a,player_id,situation,...,shot_type,match_id,home_team,away_team,home_goals,away_goals,date,assisting_player,last_action,inside_box
0,584630,19,MissedShots,0.971,0.673,0.050078392028808594,Bruno Fernandes,h,1228,OpenPlay,...,LeftFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Lisandro Martínez,Chipped,True
1,584631,26,MissedShots,0.747,0.479,0.020807035267353058,Casemiro,h,2248,OpenPlay,...,RightFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Mason Mount,Pass,False
2,584632,28,SavedShot,0.913,0.346,0.25769174098968506,Bruno Fernandes,h,1228,OpenPlay,...,RightFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Casemiro,Pass,True
3,584633,33,SavedShot,0.913,0.463,0.4701629877090454,Bruno Fernandes,h,1228,OpenPlay,...,RightFoot,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Casemiro,Pass,True
4,584634,36,MissedShots,0.958,0.599,0.01805894263088703,Casemiro,h,2248,FromCorner,...,Head,26602,Manchester United,Fulham,1,0,2024-08-16 19:00:00,Amad Diallo Traore,Aerial,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4514,608819,75,MissedShots,0.807,0.490,0.2599138617515564,Luis Díaz,a,10408,OpenPlay,...,RightFoot,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Mohamed Salah,BallRecovery,False
4515,608820,79,MissedShots,0.902,0.569,0.07474000751972198,Dominik Szoboszlai,a,9788,OpenPlay,...,Head,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Trent Alexander-Arnold,Cross,True
4516,608822,83,SavedShot,0.812,0.669,0.04806426912546158,Luis Díaz,a,10408,OpenPlay,...,RightFoot,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Diogo Jota,Pass,False
4517,608823,83,MissedShots,0.907,0.515,0.06379381567239761,Diogo Jota,a,6854,OpenPlay,...,Head,26768,Tottenham,Liverpool,3,6,2024-12-22 16:30:00,Trent Alexander-Arnold,Chipped,True


In [47]:
# Convert to numeric before aggregating; unparseable values become NaN and are
# skipped by the sums below.
df['understat_xG'] = pd.to_numeric(df['understat_xG'], errors='coerce')


def _summarise_zone(shots, zone):
    """Per-player shot count and summed xG for one zone.

    Args:
      shots: Shot rows belonging to a single zone.
      zone: Column suffix for the result, 'in_box' or 'out_box'.

    Returns:
      DataFrame with columns 'player', 'shots_<zone>' and 'xg_<zone>'.
    """
    # NOTE(review): grouping by display name merges players who share a name;
    # the frame also carries 'player_id' if that distinction matters.
    return (
        shots.groupby('player')
        .agg(**{
            f'shots_{zone}': ('inside_box', 'count'),
            f'xg_{zone}': ('understat_xG', 'sum'),
        })
        .reset_index()
    )


# Create separate DataFrames for shots inside and outside the box
in_box_df = df[df['inside_box']]
out_box_df = df[~df['inside_box']]

in_box_agg = _summarise_zone(in_box_df, 'in_box')
out_box_agg = _summarise_zone(out_box_df, 'out_box')

# Outer merge keeps players who only have shots in one of the two zones.
merged_df = pd.merge(in_box_agg, out_box_agg, on='player', how='outer')

# A player missing from one zone has no shots and no xG there.
for count_column in ('shots_in_box', 'shots_out_box'):
    merged_df[count_column] = merged_df[count_column].fillna(0).astype(int)
for xg_column in ('xg_in_box', 'xg_out_box'):
    merged_df[xg_column] = merged_df[xg_column].fillna(0)

# Total xG is taken from the unrounded components and rounded once, so it does
# not accumulate the rounding error of its parts or carry float artefacts.
merged_df['total_xg'] = (merged_df['xg_in_box'] + merged_df['xg_out_box']).round(2)
merged_df['total_shots'] = merged_df['shots_in_box'] + merged_df['shots_out_box']

# Round the per-zone xG values to 2 decimal places for presentation.
merged_df['xg_in_box'] = merged_df['xg_in_box'].round(2)
merged_df['xg_out_box'] = merged_df['xg_out_box'].round(2)

# Show the resulting DataFrame
display(merged_df)

Unnamed: 0,player,shots_in_box,xg_in_box,shots_out_box,xg_out_box,total_xg,total_shots
0,Aaron Wan-Bissaka,4,0.52,3,0.09,0.61,7
1,Abdoulaye Doucouré,5,0.53,2,0.03,0.56,7
2,Abdul Fatawu,4,0.31,9,0.16,0.47,13
3,Adam Armstrong,17,3.53,4,0.11,3.64,21
4,Adam Lallana,1,0.07,2,0.08,0.15,3
...,...,...,...,...,...,...,...
379,Sasa Lukic,0,0.00,2,0.06,0.06,2
380,Tariq Lamptey,0,0.00,2,0.08,0.08,2
381,Valentino Livramento,0,0.00,1,0.02,0.02,1
382,Victor Kristiansen,0,0.00,1,0.00,0.00,1


In [48]:
# Ask where to save the per-player summary; a blank answer means the current directory.
file_path = input('Specify File Path:')

# Build the destination with pathlib: a blank answer no longer collapses into
# the filesystem-root path "/understat_shot_summary.csv", "~" is expanded, and
# a directory that does not exist yet is created instead of raising.
output_dir = Path(file_path.strip() or ".").expanduser()
output_dir.mkdir(parents=True, exist_ok=True)

output_path = output_dir / "understat_shot_summary.csv"
merged_df.to_csv(output_path, index=False)
print(f"Saved shot summary to {output_path}")