Players Data Scraping

In [1]:
import requests
import json
import pandas as pd
import os

In [2]:
# Season labels '2014' through '2019'; the calls below use the last entry.
season_names = [str(year) for year in range(2014, 2020)]

# "Last N matches" window sizes; each is handed to gw_data() as no_of_gw.
gws = ['3', '5', '10']

# League identifiers handed to gw_data().
leagues = [
    'EPL',
    'La_liga',
    'Bundesliga',
    'Serie_A',
    'Ligue_1',
]

In [3]:
def scrape_understat(payload, timeout=30):
    """POST ``payload`` to understat's player-stats endpoint and return the players.

    Parameters
    ----------
    payload : dict
        Form fields for the request. The caller in this notebook sends
        'league', 'season' and 'n_last_matches'.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.post``.
        Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns
    -------
    The value stored under ``['response']['players']`` in the JSON reply.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the JSON reply has no 'response' or 'players' key.
    """
    url = 'https://understat.com/main/getPlayersStats/'

    # Point the Referer at the page of the league being requested; fall back
    # to the EPL page when the payload carries no league.
    league = 'EPL'
    if isinstance(payload, dict) and payload.get('league'):
        league = payload['league']

    # Headers mimic the XHR a browser sends from a league page.
    # - Exactly one Content-Type, matching the form-encoded ``data=`` body.
    # - Content-Length is left for requests to compute from the encoded form
    #   body; a fixed value cannot be right for payloads of different sizes.
    # - 'br' is not advertised: decoding Brotli needs an optional package
    #   that may be absent.
    # - Origin / Connection / Referer carry no embedded spaces.
    headers = {
        'Host': 'understat.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://understat.com',
        'Connection': 'keep-alive',
        'Referer': 'https://understat.com/league/{}'.format(league),
    }

    # verify=True keeps TLS certificate verification switched on.
    response = requests.post(
        url,
        data=payload,
        headers=headers,
        timeout=timeout,
        verify=True,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    return response.json()['response']['players']

In [4]:
def clean_df(player_df, weeks):
    """Return a new DataFrame whose stat columns carry a ``_<weeks>`` suffix.

    The columns goals, xG, assists, xA, shots, key_passes, npg and npxG are
    renamed to ``<name>_<weeks>``. All other columns are left untouched and
    nothing is dropped.
    """
    stat_columns = (
        'goals', 'xG', 'assists', 'xA',
        'shots', 'key_passes', 'npg', 'npxG',
    )
    suffixed = {column: column + '_' + weeks for column in stat_columns}
    return player_df.rename(columns=suffixed)

In [5]:
def gw_data(season, league, no_of_gw, output_dir=None):
    """Fetch player stats over the last ``no_of_gw`` matches and save them as CSV.

    Parameters
    ----------
    season : str
        Season label sent as the 'season' form field.
    league : str
        League identifier sent as the 'league' form field.
    no_of_gw : str or int
        Number of most recent matches, sent as 'n_last_matches'. It also
        forms the stat-column suffix (``'<no_of_gw>wks'``) and the CSV name.
    output_dir : str, optional
        Directory for ``last_<no_of_gw>_gw_data.csv``. When omitted, the
        notebook's original hard-coded location is used.

    Returns
    -------
    pandas.DataFrame
        The scraped table with suffixed stat columns and 'position' mapped to
        DEF / FWD / MID / GK.
    """
    print('Getting data for last {} matches'.format(no_of_gw))

    # Request the league the caller asked for instead of always 'EPL'.
    json_player_data = scrape_understat(
        {'league': league, 'season': season, 'n_last_matches': no_of_gw}
    )
    gw_table = pd.DataFrame(json_player_data)

    # Suffix the stat columns with the window actually requested, so the
    # 5- and 10-match tables are not labelled '_3wks'.
    gw_df = clean_df(gw_table, '{}wks'.format(no_of_gw))

    # Replace position identifiers with something more useful: keep only the
    # first character of the position code, then map it to a readable label.
    gw_df['position'] = gw_df['position'].str.slice(0, 1)
    position_map = {'D': 'DEF', 'F': 'FWD', 'M': 'MID', 'G': 'GK', 'S': 'FWD'}
    gw_df = gw_df.replace({'position': position_map})

    # NOTE: the file name encodes only the window size, so calls for another
    # league or season overwrite the same file unless output_dir differs.
    if output_dir is None:
        csv_path = r'E:\AMotefaker\ABC\FPA\Web Scraping\Understat Data for Teams-Players (2014- present)\Data\Player_Data\gw_data\last_{}_gw_data.csv'.format(no_of_gw)
    else:
        csv_path = os.path.join(output_dir, 'last_{}_gw_data.csv'.format(no_of_gw))
    gw_df.to_csv(csv_path, encoding='utf-8', index=False)

    print('last {} matches csv data written'.format(no_of_gw))
    return gw_df

In [8]:
# Fetch the last entry of season_names for the first entry of leagues,
# once for each "last N matches" window configured in gws.
latest_season, first_league = season_names[-1], leagues[0]
last_3_gw_data_EPL = gw_data(latest_season, first_league, gws[0])
last_5_gw_data_EPL = gw_data(latest_season, first_league, gws[1])
last_10_gw_data_EPL = gw_data(latest_season, first_league, gws[2])

Getting data for last 3 matches
last 3 matches csv data written
Getting data for last 5 matches
last 5 matches csv data written
Getting data for last 10 matches
last 10 matches csv data written
