In [1]:
import requests
import json
import pandas as pd
import os

In [34]:
# Scrape configuration shared by the cells below.
# Season labels are kept as strings because they are sent as form fields.
season_names = [str(year) for year in range(2014, 2020)]
# Sizes of the trailing-match windows requested via `n_last_matches`.
gws = [str(window) for window in (3, 5, 10)]
# League codes; index 0 (EPL) is the one the calls in this notebook use.
leagues = [
    'EPL',
    'La_liga',
    'Bundesliga',
    'Serie_A',
    'Ligue_1',
]

In [35]:
def scrape_understat(payload, timeout=30):
    """POST ``payload`` to Understat's player-stats endpoint and return the players data.

    Parameters
    ----------
    payload : dict
        Form fields sent in the request body. Callers in this notebook pass
        ``league`` and ``season`` and, optionally, ``n_last_matches``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    The value stored under ``['response']['players']`` in the decoded JSON reply.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    KeyError
        If the reply lacks the ``response`` or ``players`` keys.
    """
    url = 'https://understat.com/main/getPlayersStats/'
    # Headers mimic the XHR that Firefox sends from the league page.
    # - Content-Length is intentionally omitted: the payload size varies per
    #   request and requests derives the header from the encoded body.
    # - Only one Content-Type is given; `data=payload` is form-encoded.
    # - Origin / Connection / Referer are written without embedded spaces so
    #   they are well-formed header values.
    headers = {
        'Host': 'understat.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://understat.com',
        'Connection': 'keep-alive',
        'Referer': 'https://understat.com/league/EPL',
    }
    response = requests.post(url, data=payload, headers=headers,
                             timeout=timeout, verify=True)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    response_json = response.json()
    # The players list sits inside a 'response' wrapper object.
    return response_json['response']['players']

In [36]:
def clean_df(player_df, weeks):
    """Return a renamed copy of ``player_df`` with ``_<weeks>`` appended to the stat columns.

    Only the eight columns listed in ``stat_columns`` are renamed; every other
    column is passed through unchanged and nothing is dropped.
    """
    stat_columns = ('goals', 'xG', 'assists', 'xA',
                    'shots', 'key_passes', 'npg', 'npxG')
    suffixed = {column: column + '_' + weeks for column in stat_columns}
    return player_df.rename(columns=suffixed)

In [49]:
def gw_data(season, league, no_of_gw,
            output_dir=r'C:\Users\Asus\PycharmProjects\Understat Data Scraper\Data\Player_Data\gw_data'):
    """Fetch player stats over the last ``no_of_gw`` matches and save them as CSV.

    Parameters
    ----------
    season : str
        Season label sent to Understat (e.g. ``'2019'``).
    league : str
        League code sent to Understat (e.g. ``'EPL'``).
    no_of_gw : str or int
        Size of the trailing-match window; sent as ``n_last_matches`` and used
        in the column suffix (``'<no_of_gw>wks'``) and in the file name.
    output_dir : str, optional
        Directory that receives ``last_<no_of_gw>_gw_data.csv``. It must already
        exist. Defaults to the location this notebook has always written to.

    Returns
    -------
    pandas.DataFrame
        The renamed frame with ``position`` collapsed to DEF/FWD/MID/GK.

    Notes
    -----
    The file name contains neither league nor season, so successive calls with
    the same ``no_of_gw`` overwrite each other's CSV.
    """
    print('Getting data for last {} matches'.format(no_of_gw))
    # Use the caller's league rather than a fixed 'EPL'.
    json_player_data = scrape_understat(
        {'league': league, 'season': season, 'n_last_matches': no_of_gw})
    # Build a DataFrame from the JSON player records.
    gw_table = pd.DataFrame(json_player_data)
    # Suffix reflects the actual window size so 5- and 10-match frames are not labelled '3wks'.
    gw_df = clean_df(gw_table, '{}wks'.format(no_of_gw))
    # Keep only the first character of the position string, then map it to a readable label.
    gw_df['position'] = gw_df['position'].str.slice(0, 1)
    position_map = {'D': 'DEF', 'F': 'FWD', 'M': 'MID', 'G': 'GK', 'S': 'FWD'}
    gw_df = gw_df.replace({'position': position_map})
    csv_path = os.path.join(output_dir, 'last_{}_gw_data.csv'.format(no_of_gw))
    gw_df.to_csv(csv_path, encoding='utf-8', index=False)
    print('last {} matches csv data written'.format(no_of_gw))
    return gw_df

In [50]:
# Pull the 3-, 5- and 10-match windows (gws[0..2]) for the latest season of
# the first configured league; the calls run in that order.
last_3_gw_data_EPL, last_5_gw_data_EPL, last_10_gw_data_EPL = [
    gw_data(season_names[-1], leagues[0], gws[slot]) for slot in range(3)
]

Getting data for last 3 matches
last 3 matches csv data written
Getting data for last 5 matches
last 5 matches csv data written
Getting data for last 10 matches
last 10 matches csv data written


In [47]:
def season_data(season, league,
                output_dir=r'C:\Users\Asus\PycharmProjects\Understat Data Scraper\Data\Player_Data\season_data'):
    """Fetch whole-season player stats for one league and save them as CSV.

    Parameters
    ----------
    season : str
        Season label sent to Understat (e.g. ``'2019'``); also used in the file name.
    league : str
        League code sent to Understat (e.g. ``'EPL'``).
    output_dir : str, optional
        Directory that receives ``<season>_whole_season_data.csv``. It must
        already exist. Defaults to the location this notebook has always
        written to.

    Returns
    -------
    pandas.DataFrame
        The player frame with its stat columns suffixed ``_season``.

    Notes
    -----
    The file name does not include the league, so fetching two leagues for the
    same season overwrites the first CSV.
    """
    print('Getting data for {} season'.format(season))
    json_player_data = scrape_understat({'league': league, 'season': season})
    season_table = pd.DataFrame(json_player_data)
    season_df = clean_df(season_table, 'season')
    csv_path = os.path.join(output_dir, '{}_whole_season_data.csv'.format(season))
    season_df.to_csv(csv_path, encoding='utf-8', index=False)
    print('csv file for {} season written'.format(season))
    return season_df

In [48]:
# Download each of the six configured seasons for the first league, oldest
# first; season_names[k] feeds the k-th variable on the left.
(season_1415, season_1516, season_1617,
 season_1718, season_1819, season_1920) = [
    season_data(season_names[idx], leagues[0]) for idx in range(6)
]

Getting data for 2014 season
csv file for 2014 season written
Getting data for 2015 season
csv file for 2015 season written
Getting data for 2016 season
csv file for 2016 season written
Getting data for 2017 season
csv file for 2017 season written
Getting data for 2018 season
csv file for 2018 season written
Getting data for 2019 season
csv file for 2019 season written


In [11]:
# Show the most recent season's frame. The previous `season_df` reference only
# exists as a local inside season_data(), so it raised NameError on a fresh kernel.
season_1920

Unnamed: 0,id,player_name,games,time,goals_season,xG_season,assists_season,xA_season,shots_season,key_passes_season,yellow_cards,red_cards,npg_season,npxG_season,xGChain,xGBuildup
0,755,Jamie Vardy,35,3034,23,18.903537318110466,5,6.3682975601404905,89,32,3,0,19,15.097693115472794,21.02660731226206,1.7243406660854816
1,318,Pierre-Emerick Aubameyang,36,3143,22,16.352623080834746,3,4.492486916482449,93,26,3,1,20,14.830358987674117,19.964282035827637,5.339657470583916
2,986,Danny Ings,38,2836,22,15.659717170521617,2,2.8490850934758782,93,35,3,0,21,14.137379484251142,18.48803149908781,5.015938125550747
3,618,Raheem Sterling,33,2678,20,19.799906481057405,1,7.208586284890771,100,48,5,0,20,18.277568746358156,31.4420103430748,10.185997404158115
4,1250,Mohamed Salah,34,2904,19,20.66331870108843,10,8.72604252398014,132,60,1,0,16,18.379812128841877,31.37419793009758,8.42502685263753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,8493,Jake Vokins,1,90,0,0.013284161686897278,0,0.09659572690725327,1,1,0,0,0,0.013284161686897278,0.5018613934516907,0.4885772168636322
511,8496,Tommy Doyle,1,13,0,0.07760051637887955,0,0,1,0,0,0,0,0.07760051637887955,0.07760051637887955,0
512,8518,Jordan Thomas,1,1,0,0,0,0,0,0,0,0,0,0,0,0
513,8562,Luke Thomas,3,270,0,0,1,0.6299818009138107,0,9,0,0,0,0,1.3788530230522156,1.2677359282970428


In [None]:
# Display the frame fetched for season_names[0] ('2014').
season_1415

In [None]:
# Display the frame fetched for season_names[1] ('2015').
season_1516

In [None]:
# Display the frame fetched for season_names[2] ('2016').
season_1617

In [None]:
# Display the frame fetched for season_names[3] ('2017').
season_1718

In [None]:
# Display the frame fetched for season_names[4] ('2018').
season_1819

In [None]:
# Display the frame fetched for season_names[5] ('2019').
season_1920