In [2]:
# Import the necessary packages.
import csv
import json
import os
import time

import requests
from bs4 import BeautifulSoup

In [3]:
# Define useful functions.
def get_seasons_and_leagues():
    """
    Fetch the seasons and leagues listed on the understat league page.

    This function downloads the html code of the page
    'https://understat.com/league/EPL' and reads the option values of the
    two select tags named 'season' and 'league'.

    Inputs:
    None

    Returns:
    (seasons, leagues): Tuple with two sets. The first contains the seasons.
        The second contains the leagues; the names of the leagues have the
        spaces replaced with underscores.

    Raises:
    requests.exceptions.RequestException: If the request fails, times out
        or the server answers with an HTTP error status.
    """
    # Obtain the source code of the webpage. The timeout prevents the call
    # from blocking forever when the server does not answer.
    response = requests.get('https://understat.com/league/EPL', timeout=30)
    # Fail here with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()
    # Obtain the soup.
    soup = BeautifulSoup(response.text, 'html.parser')

    # The values we are searching for are children of the select tags named
    # league and season. Obtain all the option tags from both selects.
    html_leagues = soup.find('select', {'name': 'league'}).find_all('option')
    html_seasons = soup.find('select', {'name': 'season'}).find_all('option')

    # Extract the value attribute from both lists just generated.
    leagues = {league['value'].replace(' ', '_') for league in html_leagues}
    seasons = {season['value'] for season in html_seasons}

    # Return both sets, seasons first.
    return (seasons, leagues)


def generate_link_to_scrapp(league, season):
    """
    Build the understat link for one league and one season.

    The links always follow the format
    'https://understat.com/league/<league>/<season>', so the link is
    obtained by inserting both values into that pattern.

    Inputs:
    league(str): League to scrap.
    season(str): Season to scrap.

    Return:
    link(str): Link to scrap.
    """
    # Insert the league and the season into the fixed link pattern.
    link = f'https://understat.com/league/{league}/{season}'
    return link

In [4]:
%%time

# Get the available seasons and leagues.
seasons, leagues = get_seasons_and_leagues()

# Name of the csv file where the scraped player statistics are stored.
output_path = 'estadïstiques_jugadors_futbol_europa.csv'
# Fields copied from every scraped player, in the order of the csv columns.
player_fields = ['player_name', 'games', 'time', 'goals', 'xG', 'assists',
                 'xA', 'shots', 'key_passes', 'yellow_cards', 'red_cards',
                 'position', 'team_title', 'npg', 'npxG', 'xGChain',
                 'xGBuildup']

# The file is opened in append mode, so the header is only needed when the
# file does not exist yet or is still empty. Writing it once avoids header
# rows being repeated in the middle of the data.
needs_header = (not os.path.exists(output_path)
                or os.path.getsize(output_path) == 0)

# Open the csv once; the 'a' specifies that the data is appended. The
# explicit encoding keeps the output identical across platforms, and
# newline='' lets the csv writer control the line endings.
with open(output_path, 'a', encoding='utf-8', newline='') as f:
    # The csv writer quotes any field that contains a comma or a quote, so
    # such values cannot break the columns of a row.
    writer = csv.writer(f, lineterminator='\n')
    if needs_header:
        # Add the header.
        writer.writerow(['league', 'season'] + player_fields)

    # Iterate over all the leagues and seasons to generate all the links.
    # Sorting the sets makes the order of the rows reproducible.
    for season in sorted(seasons):
        for league in sorted(leagues):
            # Generate the link to scrap.
            link = generate_link_to_scrapp(league, season)
            # Obtain the html_code from the webpage. The timeout prevents
            # the loop from hanging on a request that never answers.
            response = requests.get(link, timeout=30)
            response.raise_for_status()
            html_code = response.text
            # Generate the soup to navigate the html tree.
            soup = BeautifulSoup(html_code, 'html.parser')
            # Find the div tag with id league-players, move up to the
            # parent and then select the script. Finally, take the text.
            players_div = soup.find('div', {'id': 'league-players'})
            script = players_div.parent.find('script').text

            # Locate the string (' in the script.
            str_start = script.index("('") + 2
            # Locate the string ') in the script.
            str_end = script.index("')")
            # From the script take the json data that we need to save.
            json_data = script[str_start:str_end]
            # Resolve the escape sequences of the embedded string and
            # transform the json data into a list of dictionaries.
            json_data = json_data.encode('utf8').decode('unicode_escape')
            players = json.loads(json_data)

            # For every player in the table scraped.
            for player in players:
                # Generate the new row: league and season first, then the
                # player fields. A missing field raises a KeyError.
                new_row = [league, season]
                new_row.extend(player[field] for field in player_fields)
                # Add it to the file.
                writer.writerow(new_row)

CPU times: user 1.84 s, sys: 235 ms, total: 2.07 s
Wall time: 43.3 s
