In [3]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

In [23]:
import pandas as pd
import requests
import os

def scrape_understat(payload):
    #Build request using url, headers (mimicking what Firefox does normally)
    #Works best with verify=True as you won't get the ssl errors. Payload is 
    #taylored for each request
    url = 'https://understat.com/main/getPlayersStats/'
    headers = {'content-type':'application/json; charset=utf-8',
    'Host': 'understat.com',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '39',
    'Origin': 'https: // understat.com',
    'Connection': 'keep - alive',
    'Referer': 'https: // understat.com / league / EPL'
    }
    response = requests.post(url, data=payload, headers = headers, verify=True)
    response_json = response.json()
    inner_wrapper = response_json['response']
    json_player_data = inner_wrapper['players']
    return json_player_data

def clean_df(player_df, weeks):
    # Get rid of the columns that we don't care about
    #player_df.drop(['yellow_cards','red_cards', 'xGChain','xGBuildup','games','time'], axis=1, inplace=True)
    player_df  = player_df.rename(columns={'goals':'goals_'+weeks,'xG':'xG_'+weeks,'assists':'assists_'+weeks, 'xA':'xA_'+weeks, 'shots':'shots_'+weeks, 'key_passes':
        'key_passes_'+weeks,'npg':'npg_'+weeks,'npxG':'npxG_'+weeks})
    if weeks != '3wks':
        player_df.drop(['position','team_title'], axis=1, inplace=True)
    return(player_df)

#Create Pandas dataframes from each html table
print('Getting data for last 3 matches')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019', 'n_last_matches': '3'})
three_game_table = pd.DataFrame(json_player_data)
three_game_df = clean_df(three_game_table,'3wks')
#Replace Position indentifiers with something more useful
three_game_df['position'] = three_game_df['position'].str.slice(0,1)
position_map = {'D':'DEF', 'F':'FWD', 'M':'MID', 'G':'GK', 'S':'FWD'}
three_game_df = three_game_df.replace({'position': position_map})

print('Getting data for last 5 matches')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019', 'n_last_matches': '5'})
five_game_table = pd.DataFrame(json_player_data)
five_game_df = clean_df(five_game_table, '5wks')

print('Getting data for last 10 matches')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019', 'n_last_matches': '10'})
ten_game_table = pd.DataFrame(json_player_data)
ten_game_df = clean_df(ten_game_table, '10wks')

print('Getting data for the whole season')
json_player_data = scrape_understat({'league':'EPL', 'season':'2019'})
season_table = pd.DataFrame(json_player_data)
season_df = clean_df(season_table, 'season')

print('Merging Tables')
EPL_player_df = pd.merge(three_game_df, five_game_df, on=['id','player_name'])
EPL_player_df = pd.merge(EPL_player_df, ten_game_df, on=['id','player_name'])
# EPL_player_df = pd.merge(EPL_player_df, season_df, on=['id','player_name'])

# print('Writing CSV File')
# EPL_player_df.to_csv('Understat_EPL_Player_Data_Combined.csv', encoding='utf-8', index=False)

Getting data for last 3 matches
Getting data for last 5 matches
Getting data for last 10 matches
Getting data for the whole season
Merging Tables


In [24]:
three_game_df

Unnamed: 0,id,player_name,games,time,goals_3wks,xG_3wks,assists_3wks,xA_3wks,shots_3wks,key_passes_3wks,yellow_cards,red_cards,position,team_title,npg_3wks,npxG_3wks,xGChain,xGBuildup
0,531,Michail Antonio,3,251,6,4.472343921661377,0,0.6665154322981834,15,5,0,0,FWD,West Ham,5,3.7111750841140747,3.5369811803102493,0.1282384842634201
1,618,Raheem Sterling,3,198,5,3.111586093902588,0,0.23588834702968597,11,3,0,0,FWD,Manchester City,5,2.350417196750641,2.476261720061302,1.1686157137155533
2,647,Harry Kane,3,270,4,2.6586690545082092,0,1.0790136978030205,11,6,1,0,FWD,Tottenham,4,2.6586690545082092,3.962518811225891,0.35916753113269806
3,574,Troy Deeney,3,227,3,2.3645221292972565,0,0.33117732405662537,5,4,0,0,FWD,Watford,1,0.8421844840049744,1.481808215379715,0.3891264498233795
4,7722,Trézéguet,3,246,3,1.0186635255813599,0,0.021080465987324715,6,1,0,0,FWD,Aston Villa,3,1.0186635255813599,0.7609298843890429,0.26407185941934586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,8480,Max Thompson,1,1,0,0,0,0,0,0,0,0,FWD,Burnley,0,0,0,0
507,8493,Jake Vokins,1,90,0,0.013284161686897278,0,0.09659572690725327,1,1,0,0,DEF,Southampton,0,0.013284161686897278,0.5018613934516907,0.4885772168636322
508,8496,Tommy Doyle,1,13,0,0.07760051637887955,0,0,1,0,0,0,FWD,Manchester City,0,0.07760051637887955,0.07760051637887955,0
509,8562,Luke Thomas,2,180,0,0,1,0.43036194145679474,0,6,0,0,MID,Leicester,0,0,0.9987059235572815,0.9753741323947906


In [30]:
outname_3wks = 'last_3_gw-data.csv'
outdir_3wks = r'C:\Users\Asus\PycharmProjects\Understat Data Scraper\Data\Player_Data\last_3_5_10_gw_data\last_3_gw_data.csv'
if not os.path.exists(outdir_3wks):
    os.mkdir(outdir_3wks)
fullname_3wks = os.path.join(outname_3wks, outdir_3wks) 
three_game_df.to_csv('{}'.format(fullname_3wks),encoding = 'utf-8', index = False)


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\Asus\\PycharmProjects\\Understat Data Scraper\\Data\\Player_Data\\last_3_5_10_gw_data\\last_3_gw_data.csv'

In [11]:
EPL_player_df[0:17]

Unnamed: 0,id,player_name,games_x,time_x,goals_3wks,xG_3wks,assists_3wks,xA_3wks,shots_3wks,key_passes_3wks,...,assists_10wks,xA_10wks,shots_10wks,key_passes_10wks,yellow_cards_y,red_cards_y,npg_10wks,npxG_10wks,xGChain_y,xGBuildup_y
0,531,Michail Antonio,3,251,6,4.472343921661377,0,0.6665154322981834,15,5,...,2,2.722211310639977,37,17,2,0,8,8.742039801552892,9.60639715194702,1.3592907842248678
1,618,Raheem Sterling,3,198,5,3.111586093902588,0,0.2358883470296859,11,3,...,0,1.655531318858266,27,12,1,0,8,5.240512847900391,7.468989789485931,1.941905077546835
2,647,Harry Kane,3,270,4,2.658669054508209,0,1.0790136978030205,11,6,...,0,1.4226757511496544,29,11,2,0,6,5.78854401409626,7.4526952765882015,0.6705302335321903
3,574,Troy Deeney,3,227,3,2.3645221292972565,0,0.3311773240566253,5,4,...,1,0.9094453528523444,13,10,0,0,2,2.135144915431738,3.094690004363656,0.7561060581356287
4,7722,Trézéguet,3,246,3,1.01866352558136,0,0.0210804659873247,6,1,...,0,0.1315535232424736,12,4,0,0,3,2.032090447843075,1.957192497327924,0.4288085401058197
5,453,Son Heung-Min,3,270,2,1.170886129140854,1,0.1937636919319629,7,4,...,3,1.9367764107882977,25,13,0,0,5,3.444611646234989,5.047011069953442,1.2185908071696758
6,502,Olivier Giroud,3,173,2,2.638754718005657,0,0.3357481062412262,12,1,...,0,0.4747432172298431,30,2,0,0,6,5.958827875554562,7.002514198422432,1.847762182354927
7,553,Anthony Martial,3,270,2,1.0962765514850616,2,1.3805373013019562,8,7,...,3,2.0685251131653786,25,10,0,0,7,4.731702536344528,7.503289744257927,1.5380779094994068
8,556,Marcus Rashford,3,267,2,1.576736679300666,1,0.4303758144378662,7,1,...,3,2.216541536152363,25,8,1,0,3,4.44674458168447,8.06920026242733,2.850165858864784
9,617,David Silva,3,204,2,0.5304327122867107,3,1.9752818793058395,5,6,...,4,3.880873799324036,16,23,0,0,3,2.548801217228174,10.633577197790146,5.360886946320534


In [18]:
EPL_player_df.columns

Index(['id', 'player_name', 'games_x', 'time_x', 'goals_3wks', 'xG_3wks',
       'assists_3wks', 'xA_3wks', 'shots_3wks', 'key_passes_3wks',
       'yellow_cards_x', 'red_cards_x', 'position', 'team_title', 'npg_3wks',
       'npxG_3wks', 'xGChain_x', 'xGBuildup_x', 'games_y', 'time_y',
       'goals_5wks', 'xG_5wks', 'assists_5wks', 'xA_5wks', 'shots_5wks',
       'key_passes_5wks', 'yellow_cards_y', 'red_cards_y', 'npg_5wks',
       'npxG_5wks', 'xGChain_y', 'xGBuildup_y', 'games_x', 'time_x',
       'goals_10wks', 'xG_10wks', 'assists_10wks', 'xA_10wks', 'shots_10wks',
       'key_passes_10wks', 'yellow_cards_x', 'red_cards_x', 'npg_10wks',
       'npxG_10wks', 'xGChain_x', 'xGBuildup_x', 'games_y', 'time_y',
       'goals_season', 'xG_season', 'assists_season', 'xA_season',
       'shots_season', 'key_passes_season', 'yellow_cards_y', 'red_cards_y',
       'npg_season', 'npxG_season', 'xGChain_y', 'xGBuildup_y'],
      dtype='object')

In [17]:
EPL_player_df.rename(columns = {'games_x' : 'games_in_3wks',
                                'time_x' :'games_in_3wks',
                                'yellow_cards_x': 'yellow_cards_3wks',
                                'red_cards_x' : 'red_cards_3wks',
                                'xGChain_x': 'xGChain_3wks',
                                'xGBuildup_x': 'xGBuildup_3wks', 
                                'games_y' : 'games_season', 
                                'time_y' : 'time_season',
                                'yellow_cards_y': 'yellow_cards_season',
                                'red_cards_y' : 'red_cards_season',
                                'xGChain_y': 'xGChain_season',
                                'xGBuildup_y': 'xGBuildup_season',
                                'games_x' : 'games_in_3wks',
                                'time_x' :'games_in_3wks',
                                'yellow_cards_x': 'yellow_cards_3wks',
                                'red_cards_x' : 'red_cards_3wks',
                                'xGChain_x': 'xGChain_3wks',
                                'xGBuildup_x': 'xGBuildup_3wks', 
                                'games_y' : 'games_season', 
                                'time_y' : 'time_season',
                                'yellow_cards_y': 'yellow_cards_season',
                                'red_cards_y' : 'red_cards_season',
                                'xGChain_y': 'xGChain_season',
                                'xGBuildup_y': 'xGBuildup_season'})

Unnamed: 0,id,player_name,games_in_3wks,games_in_3wks.1,goals_3wks,xG_3wks,assists_3wks,xA_3wks,shots_3wks,key_passes_3wks,...,assists_season,xA_season,shots_season,key_passes_season,yellow_cards_season,red_cards_season,npg_season,npxG_season,xGChain_season,xGBuildup_season
0,531,Michail Antonio,3,251,6,4.472343921661377,0,0.6665154322981834,15,5,...,3,3.9214953761547804,67,26,2,0,9,12.116022041067481,13.587349098175764,2.270155580714345
1,618,Raheem Sterling,3,198,5,3.111586093902588,0,0.23588834702968597,11,3,...,1,6.815871397033334,95,45,5,0,19,17.2494806535542,29.749909579753876,9.796311892569065
2,647,Harry Kane,3,270,4,2.6586690545082092,0,1.0790136978030205,11,6,...,2,3.1170063093304634,81,27,4,0,15,11.688250295817852,16.768101127818227,3.0513013089075685
3,574,Troy Deeney,3,227,3,2.3645221292972565,0,0.33117732405662537,5,4,...,2,3.3642509896308184,41,28,3,0,5,6.452236630022526,10.622578946873546,2.433555162511766
4,7722,Trézéguet,3,246,3,1.0186635255813599,0,0.021080465987324715,6,1,...,1,0.958256496116519,48,21,2,1,6,5.24549674987793,6.541662577539682,2.6114314906299114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,8480,Max Thompson,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
507,8493,Jake Vokins,1,90,0,0.013284161686897278,0,0.09659572690725327,1,1,...,0,0.09659572690725327,1,1,0,0,0,0.013284161686897278,0.5018613934516907,0.4885772168636322
508,8496,Tommy Doyle,1,13,0,0.07760051637887955,0,0,1,0,...,0,0,1,0,0,0,0,0.07760051637887955,0.07760051637887955,0
509,8562,Luke Thomas,2,180,0,0,1,0.43036194145679474,0,6,...,1,0.43036194145679474,0,6,0,0,0,0,0.9987059235572815,0.9753741323947906
