Understat Data for Teams-Players EPL (2014-2019)

In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
import pandas as pd
import requests
import os

def scrape_understat(payload):
    #Build request using url, headers (mimicking what Firefox does normally)
    #Works best with verify=True as you won't get the ssl errors. Payload is 
    #taylored for each request
    url = 'https://understat.com/main/getPlayersStats/'
    headers = {'content-type':'application/json; charset=utf-8',
    'Host': 'understat.com',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '39',
    'Origin': 'https: // understat.com',
    'Connection': 'keep - alive',
    'Referer': 'https: // understat.com / league / EPL'
    }
    response = requests.post(url, data=payload, headers = headers, verify=True)
    response_json = response.json()
    inner_wrapper = response_json['response']
    json_player_data = inner_wrapper['players']
    return json_player_data

def clean_df(player_df, weeks):
    # Get rid of the columns that we don't care about
    #player_df.drop(['yellow_cards','red_cards', 'xGChain','xGBuildup','games','time'], axis=1, inplace=True)
    player_df  = player_df.rename(columns={'goals':'goals_'+weeks,'xG':'xG_'+weeks,'assists':'assists_'+weeks, 'xA':'xA_'+weeks, 'shots':'shots_'+weeks, 'key_passes':
        'key_passes_'+weeks,'npg':'npg_'+weeks,'npxG':'npxG_'+weeks})
    if weeks != '3wks':
        player_df.drop(['position','team_title'], axis=1, inplace=True)
    return(player_df)

#Create Pandas dataframes from each html table
print('Getting data for last 3 matches')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019', 'n_last_matches': '3'})
three_game_table = pd.DataFrame(json_player_data)
three_game_df = clean_df(three_game_table,'3wks')
#Replace Position indentifiers with something more useful
three_game_df['position'] = three_game_df['position'].str.slice(0,1)
position_map = {'D':'DEF', 'F':'FWD', 'M':'MID', 'G':'GK', 'S':'FWD'}
three_game_df = three_game_df.replace({'position': position_map})

print('Getting data for last 5 matches')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019', 'n_last_matches': '5'})
five_game_table = pd.DataFrame(json_player_data)
five_game_df = clean_df(five_game_table, '5wks')

print('Getting data for last 10 matches')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019', 'n_last_matches': '10'})
ten_game_table = pd.DataFrame(json_player_data)
ten_game_df = clean_df(ten_game_table, '10wks')

print('Getting data for the whole season')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019'})
season_table = pd.DataFrame(json_player_data)
season_df = clean_df(season_table, 'season')

print('Merging Tables')
EPL_player_df = pd.merge(three_game_df, five_game_df, on=['id','player_name'])
EPL_player_df = pd.merge(EPL_player_df, ten_game_df, on=['id','player_name'])
# EPL_player_df = pd.merge(EPL_player_df, season_df, on=['id','player_name'])

# print('Writing CSV File')
# EPL_player_df.to_csv('Understat_EPL_Player_Data_Combined.csv', encoding='utf-8', index=False)

Getting data for last 3 matches
Getting data for last 5 matches
Getting data for last 10 matches
Getting data for the whole season
Merging Tables


In [3]:
three_game_df

Unnamed: 0,id,player_name,games,time,goals_3wks,xG_3wks,assists_3wks,xA_3wks,shots_3wks,key_passes_3wks,yellow_cards,red_cards,position,team_title,npg_3wks,npxG_3wks,xGChain,xGBuildup
0,647,Harry Kane,3,270,5,2.0607955306768417,0,0.3910949155688286,7,2,1,0,FWD,Tottenham,5,2.0607955306768417,2.6767265051603317,0.22483597695827484
1,502,Olivier Giroud,3,234,3,3.1137402653694153,0,0.40248509496450424,12,2,1,0,FWD,Chelsea,3,3.1137402653694153,3.732365131378174,0.30892330408096313
2,618,Raheem Sterling,3,198,3,2.774513840675354,0,0.39271488785743713,11,3,0,0,FWD,Manchester City,3,2.013344943523407,2.352346047759056,0.712441697716713
3,986,Danny Ings,3,270,3,2.290885627269745,0,0.13838174007833004,10,3,0,0,FWD,Southampton,2,0.7685479409992695,1.814413771033287,1.169490970671177
4,4456,Chris Wood,3,264,3,2.1954740285873413,0,0.7095881886780262,10,4,0,0,FWD,Burnley,2,1.4343052208423615,1.5645041763782501,0.15562047436833382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,8493,Jake Vokins,1,90,0,0.013284161686897278,0,0.09659572690725327,1,1,0,0,DEF,Southampton,0,0.013284161686897278,0.5018613934516907,0.4885772168636322
511,8496,Tommy Doyle,1,13,0,0.07760051637887955,0,0,1,0,0,0,FWD,Manchester City,0,0.07760051637887955,0.07760051637887955,0
512,8518,Jordan Thomas,1,1,0,0,0,0,0,0,0,0,FWD,Norwich,0,0,0,0
513,8562,Luke Thomas,3,270,0,0,1,0.6299818009138107,0,9,0,0,DEF,Leicester,0,0,1.3788530230522156,1.2677359282970428


In [5]:
outname_3wks = 'last_3_gw-data.csv'
outdir_3wks = r'E:\\AMotefaker\\ABC\\FPA\\Web Scraping\\Understat Data for Teams-Players (2014- present)\\Understat Data Scraper\\Player_Data\\last_3_5_10_gw_data\\last_3_gw_data.csv'
if not os.path.exists(outdir_3wks):
    os.mkdir(outdir_3wks)
fullname_3wks = os.path.join(outname_3wks, outdir_3wks) 
three_game_df.to_csv('{}'.format(fullname_3wks),encoding = 'utf-8', index = False)

PermissionError: [Errno 13] Permission denied: 'E:\\\\AMotefaker\\\\ABC\\\\FPA\\\\Web Scraping\\\\Understat Data for Teams-Players (2014- present)\\\\Understat Data Scraper\\\\Player_Data\\\\last_3_5_10_gw_data\\\\last_3_gw_data.csv'

In [6]:
EPL_player_df[0:17]

Unnamed: 0,id,player_name,games_x,time_x,goals_3wks,xG_3wks,assists_3wks,xA_3wks,shots_3wks,key_passes_3wks,...,assists_10wks,xA_10wks,shots_10wks,key_passes_10wks,yellow_cards,red_cards,npg_10wks,npxG_10wks,xGChain,xGBuildup
0,647,Harry Kane,3,270,5,2.0607955306768417,0,0.3910949155688286,7,2,...,0,1.297498844563961,28,9,2,0,7,5.564089119434357,7.057452630251646,0.624919380992651
1,502,Olivier Giroud,3,234,3,3.1137402653694157,0,0.4024850949645042,12,2,...,0,0.5414802059531212,29,3,1,0,7,5.79319491237402,6.430627778172493,1.374771699309349
2,618,Raheem Sterling,3,198,3,2.774513840675354,0,0.3927148878574371,11,3,...,0,1.328054966405034,31,13,0,0,9,5.7317270040512085,7.737957417964935,2.165522690862417
3,986,Danny Ings,3,270,3,2.290885627269745,0,0.13838174007833,10,3,...,1,1.0314845945686102,26,14,1,0,6,3.1079669073224068,6.144841525703669,2.623978223651648
4,4456,Chris Wood,3,264,3,2.1954740285873413,0,0.7095881886780262,10,4,...,0,0.8576705902814865,21,10,0,0,3,3.677421435713768,3.3569836765527725,0.3470211364328861
5,7700,Che Adams,3,151,3,1.9055250883102417,0,0.4125483632087707,10,1,...,1,0.8078726157546043,21,4,0,0,4,3.1744526624679565,4.17007052898407,0.4764880314469337
6,318,Pierre-Emerick Aubameyang,3,210,2,1.455847904086113,1,0.3655194416642189,6,2,...,2,1.2012219205498695,28,6,0,0,4,3.5896055176854134,4.833086274564266,0.9512820690870284
7,447,Kevin De Bruyne,3,246,2,0.4195265993475914,2,2.774835407733917,7,21,...,4,5.972339313477278,26,41,1,0,3,1.4963189624249935,9.413695573806764,3.9182050600647926
8,531,Michail Antonio,3,219,2,1.9048762023448944,0,0.0841497406363487,7,2,...,1,1.8581748697906733,36,13,3,0,7,8.16870960779488,8.764186352491379,1.3592907842248678
9,574,Troy Deeney,3,229,2,1.6963218748569489,0,0.4521321356296539,5,2,...,0,0.9918195381760596,14,10,0,0,1,1.6338527463376522,2.9426354561001062,1.566445479169488


In [7]:
EPL_player_df.columns

Index(['id', 'player_name', 'games_x', 'time_x', 'goals_3wks', 'xG_3wks',
       'assists_3wks', 'xA_3wks', 'shots_3wks', 'key_passes_3wks',
       'yellow_cards_x', 'red_cards_x', 'position', 'team_title', 'npg_3wks',
       'npxG_3wks', 'xGChain_x', 'xGBuildup_x', 'games_y', 'time_y',
       'goals_5wks', 'xG_5wks', 'assists_5wks', 'xA_5wks', 'shots_5wks',
       'key_passes_5wks', 'yellow_cards_y', 'red_cards_y', 'npg_5wks',
       'npxG_5wks', 'xGChain_y', 'xGBuildup_y', 'games', 'time', 'goals_10wks',
       'xG_10wks', 'assists_10wks', 'xA_10wks', 'shots_10wks',
       'key_passes_10wks', 'yellow_cards', 'red_cards', 'npg_10wks',
       'npxG_10wks', 'xGChain', 'xGBuildup'],
      dtype='object')

In [8]:
EPL_player_df.rename(columns = {'games_x' : 'games_in_3wks',
                                'time_x' :'games_in_3wks',
                                'yellow_cards_x': 'yellow_cards_3wks',
                                'red_cards_x' : 'red_cards_3wks',
                                'xGChain_x': 'xGChain_3wks',
                                'xGBuildup_x': 'xGBuildup_3wks', 
                                'games_y' : 'games_season', 
                                'time_y' : 'time_season',
                                'yellow_cards_y': 'yellow_cards_season',
                                'red_cards_y' : 'red_cards_season',
                                'xGChain_y': 'xGChain_season',
                                'xGBuildup_y': 'xGBuildup_season',
                                'games_x' : 'games_in_3wks',
                                'time_x' :'games_in_3wks',
                                'yellow_cards_x': 'yellow_cards_3wks',
                                'red_cards_x' : 'red_cards_3wks',
                                'xGChain_x': 'xGChain_3wks',
                                'xGBuildup_x': 'xGBuildup_3wks', 
                                'games_y' : 'games_season', 
                                'time_y' : 'time_season',
                                'yellow_cards_y': 'yellow_cards_season',
                                'red_cards_y' : 'red_cards_season',
                                'xGChain_y': 'xGChain_season',
                                'xGBuildup_y': 'xGBuildup_season'})

Unnamed: 0,id,player_name,games_in_3wks,games_in_3wks.1,goals_3wks,xG_3wks,assists_3wks,xA_3wks,shots_3wks,key_passes_3wks,...,assists_10wks,xA_10wks,shots_10wks,key_passes_10wks,yellow_cards,red_cards,npg_10wks,npxG_10wks,xGChain,xGBuildup
0,647,Harry Kane,3,270,5,2.0607955306768417,0,0.3910949155688286,7,2,...,0,1.297498844563961,28,9,2,0,7,5.564089119434357,7.057452630251646,0.624919380992651
1,502,Olivier Giroud,3,234,3,3.1137402653694153,0,0.40248509496450424,12,2,...,0,0.5414802059531212,29,3,1,0,7,5.79319491237402,6.430627778172493,1.374771699309349
2,618,Raheem Sterling,3,198,3,2.774513840675354,0,0.39271488785743713,11,3,...,0,1.328054966405034,31,13,0,0,9,5.7317270040512085,7.737957417964935,2.165522690862417
3,986,Danny Ings,3,270,3,2.290885627269745,0,0.13838174007833004,10,3,...,1,1.0314845945686102,26,14,1,0,6,3.1079669073224068,6.144841525703669,2.6239782236516476
4,4456,Chris Wood,3,264,3,2.1954740285873413,0,0.7095881886780262,10,4,...,0,0.8576705902814865,21,10,0,0,3,3.677421435713768,3.3569836765527725,0.3470211364328861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,8493,Jake Vokins,1,90,0,0.013284161686897278,0,0.09659572690725327,1,1,...,0,0.09659572690725327,1,1,0,0,0,0.013284161686897278,0.5018613934516907,0.4885772168636322
511,8496,Tommy Doyle,1,13,0,0.07760051637887955,0,0,1,0,...,0,0,1,0,0,0,0,0.07760051637887955,0.07760051637887955,0
512,8518,Jordan Thomas,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
513,8562,Luke Thomas,3,270,0,0,1,0.6299818009138107,0,9,...,1,0.6299818009138107,0,9,0,0,0,0,1.3788530230522156,1.2677359282970428


In [9]:
import requests
from bs4 import BeautifulSoup 
import json
import pandas as pd

In [10]:
base_url = 'https://understat.com/league/EPL/'
season_names = ['2014', '2015', '2016', '2017', '2018','2019', '2020']

Getting data for EPL 2014

In [11]:
url = base_url + season_names[0]
res = requests.get(url)
soup = BeautifulSoup(res.content, 'lxml')

In [12]:
# data is under "scripts" tag. So, finding all the script tags
script = soup.find_all('script')
# print(script)

In [13]:
string_with_json_obj = ''

for el in script:
    if 'teamsData' in el.text:
        string_with_json_obj = el.text.strip()
# print(string_with_json_obj)

In [14]:
#keeping only the part of string we need
ind_start = string_with_json_obj.index("('") + 2
ind_end = string_with_json_obj.index("')") 
json_data = string_with_json_obj[ind_start: ind_end]

json_data = json_data.encode('utf8').decode('unicode_escape')

In [16]:
print(json.dumps(data, indent=4, sort_keys=True))

NameError: name 'data' is not defined

In [17]:
data = json.loads(json_data)
print(data.keys())
# print('-' * 100)
# print(data['71'].keys())
# print('-' * 100)
# print(data['71']['id'])
# print('-' * 100)
# print(data['71']['title'])
# print('-' * 100)
# print(data['71']['history'])

dict_keys(['71', '72', '74', '75', '76', '77', '78', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '91', '92', '202'])


Making a dictionary composed opf team ID and team titles

In [18]:
teams ={}

for id in data.keys():
    teams[id] = data[id]['title']

In [19]:
teams

{'71': 'Aston Villa',
 '72': 'Everton',
 '74': 'Southampton',
 '75': 'Leicester',
 '76': 'West Bromwich Albion',
 '77': 'Sunderland',
 '78': 'Crystal Palace',
 '80': 'Chelsea',
 '81': 'West Ham',
 '82': 'Tottenham',
 '83': 'Arsenal',
 '84': 'Swansea',
 '85': 'Stoke',
 '86': 'Newcastle United',
 '87': 'Liverpool',
 '88': 'Manchester City',
 '89': 'Manchester United',
 '91': 'Hull',
 '92': 'Burnley',
 '202': 'Queens Park Rangers'}