Understat Data for Teams-Players EPL (2014-2019)

In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
import pandas as pd
import requests
import os

def scrape_understat(payload, timeout=30):
    """Fetch player statistics from Understat's ``getPlayersStats`` endpoint.

    Parameters
    ----------
    payload : dict
        Form fields posted to the endpoint. Callers in this notebook pass
        ``'league'``, ``'season'`` and, optionally, ``'n_last_matches'``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.post``.
        Without a timeout a stalled connection would hang the cell forever.

    Returns
    -------
    object
        Whatever the JSON reply stores under ``['response']['players']``;
        the notebook passes it directly to ``pd.DataFrame``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection failures or timeouts.
    KeyError
        If the JSON reply lacks the ``'response'`` or ``'players'`` key.
    """
    url = 'https://understat.com/main/getPlayersStats/'
    # Browser-like headers for the site's XHR endpoint.
    # - Only one Content-Type is sent: the form-encoded one that matches
    #   ``data=payload`` (the JSON content-type that used to sit next to it
    #   was overridden anyway because header names are case-insensitive).
    # - Host, Content-Length, Connection and Accept-Encoding are left to
    #   requests: the length differs per payload, and explicitly advertising
    #   'br' risks a reply the client cannot decode when no brotli library is
    #   installed.
    # - Origin/Referer are written as valid URLs (no embedded spaces).
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://understat.com',
        'Referer': 'https://understat.com/league/EPL',
    }
    response = requests.post(url, data=payload, headers=headers,
                             timeout=timeout, verify=True)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    response_json = response.json()
    return response_json['response']['players']

def clean_df(player_df, weeks):
    """Return a copy of ``player_df`` with its stat columns tagged by window.

    The columns goals, xG, assists, xA, shots, key_passes, npg and npxG are
    renamed to ``<name>_<weeks>``. Every other column (games, time, cards,
    xGChain, ...) passes through with its original name.

    ``position`` and ``team_title`` are kept only when ``weeks == '3wks'``;
    for any other window they are dropped -- presumably so the merged table
    carries a single copy of them (confirm against the merge step).

    The input frame is not modified.
    """
    stat_columns = ('goals', 'xG', 'assists', 'xA', 'shots',
                    'key_passes', 'npg', 'npxG')
    suffixed_names = {column: column + '_' + weeks for column in stat_columns}
    renamed = player_df.rename(columns=suffixed_names)

    if weeks == '3wks':
        return renamed
    return renamed.drop(['position', 'team_title'], axis=1)

# Build one DataFrame per look-back window from Understat's JSON player data,
# then merge the windows into a single table keyed on player id and name.
LEAGUE = 'EPL'
SEASON = '2019'

# One entry per window:
# (column suffix passed to clean_df, n_last_matches form value or None for the
#  whole season, progress message printed before the request)
STAT_WINDOWS = [
    ('3wks', '3', 'Getting data for last 3 matches'),
    ('5wks', '5', 'Getting data for last 5 matches'),
    ('10wks', '10', 'Getting data for last 10 matches'),
    ('season', None, 'Getting data for the whole season'),
]

raw_tables = {}    # suffix -> frame exactly as returned by Understat
clean_frames = {}  # suffix -> frame after clean_df
for suffix, n_last_matches, message in STAT_WINDOWS:
    print(message)
    payload = {'league': LEAGUE, 'season': SEASON}
    if n_last_matches is not None:
        payload['n_last_matches'] = n_last_matches
    json_player_data = scrape_understat(payload)
    raw_tables[suffix] = pd.DataFrame(json_player_data)
    clean_frames[suffix] = clean_df(raw_tables[suffix], suffix)

# Keep the individual names available for later cells.
three_game_table = raw_tables['3wks']
five_game_table = raw_tables['5wks']
ten_game_table = raw_tables['10wks']
season_table = raw_tables['season']
five_game_df = clean_frames['5wks']
ten_game_df = clean_frames['10wks']
season_df = clean_frames['season']

#Replace Position identifiers with something more useful: keep only the first
#character of the position string and map it to a readable label. Values
#without a mapping are left as they are by replace().
position_map = {'D':'DEF', 'F':'FWD', 'M':'MID', 'G':'GK', 'S':'FWD'}
three_game_df = (
    clean_frames['3wks']
    .assign(position=lambda frame: frame['position'].str.slice(0, 1))
    .replace({'position': position_map})
)

print('Merging Tables')
# pd.merge defaults to an inner join, so only players present in every merged
# window survive.
EPL_player_df = pd.merge(three_game_df, five_game_df, on=['id','player_name'])
EPL_player_df = pd.merge(EPL_player_df, ten_game_df, on=['id','player_name'])
# NOTE(review): the season merge is disabled. Presumably this is because the
# columns clean_df does not suffix (games, time, ...) already exist as *_x/*_y
# after the first two merges, so a further merge would generate clashing
# names -- confirm, and pass explicit `suffixes=` before re-enabling.
# EPL_player_df = pd.merge(EPL_player_df, season_df, on=['id','player_name'])

# print('Writing CSV File')
# EPL_player_df.to_csv('Understat_EPL_Player_Data_Combined.csv', encoding='utf-8', index=False)

Getting data for last 3 matches
Getting data for last 5 matches
Getting data for last 10 matches
Getting data for the whole season
Merging Tables


In [3]:
# Show the cleaned last-3-matches frame as this cell's output.
three_game_df

Unnamed: 0,id,player_name,games,time,goals_3wks,xG_3wks,assists_3wks,xA_3wks,shots_3wks,key_passes_3wks,yellow_cards,red_cards,position,team_title,npg_3wks,npxG_3wks,xGChain,xGBuildup
0,647,Harry Kane,3,270,5,2.0607955306768417,0,0.3910949155688286,7,2,1,0,FWD,Tottenham,5,2.0607955306768417,2.6767265051603317,0.22483597695827484
1,502,Olivier Giroud,3,234,3,3.1137402653694153,0,0.40248509496450424,12,2,1,0,FWD,Chelsea,3,3.1137402653694153,3.732365131378174,0.30892330408096313
2,618,Raheem Sterling,3,198,3,2.774513840675354,0,0.39271488785743713,11,3,0,0,FWD,Manchester City,3,2.013344943523407,2.352346047759056,0.712441697716713
3,986,Danny Ings,3,270,3,2.290885627269745,0,0.13838174007833004,10,3,0,0,FWD,Southampton,2,0.7685479409992695,1.814413771033287,1.169490970671177
4,4456,Chris Wood,3,264,3,2.1954740285873413,0,0.7095881886780262,10,4,0,0,FWD,Burnley,2,1.4343052208423615,1.5645041763782501,0.15562047436833382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,8493,Jake Vokins,1,90,0,0.013284161686897278,0,0.09659572690725327,1,1,0,0,DEF,Southampton,0,0.013284161686897278,0.5018613934516907,0.4885772168636322
511,8496,Tommy Doyle,1,13,0,0.07760051637887955,0,0,1,0,0,0,FWD,Manchester City,0,0.07760051637887955,0.07760051637887955,0
512,8518,Jordan Thomas,1,1,0,0,0,0,0,0,0,0,FWD,Norwich,0,0,0,0
513,8562,Luke Thomas,3,270,0,0,1,0.6299818009138107,0,9,0,0,DEF,Leicester,0,0,1.3788530230522156,1.2677359282970428
