In [94]:
import requests
import json
from bs4 import BeautifulSoup
import re
import codecs
import pandas as pd
import os

def folder_creation(league):
    """Create the directory ``league`` if it does not already exist.

    Despite the parameter name, any directory path is accepted; this file
    also passes a ``league/year`` path. Missing parent directories are
    created as well, and an already existing directory is left untouched.

    Args:
        league: Path of the directory to create.

    Raises:
        FileExistsError: If ``league`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: there is no
    # window between the check and the creation, and nested paths work.
    os.makedirs(league, exist_ok=True)
        

def get_script(url, timeout=30):
    """Download ``url`` and return its non-empty ``<script>`` tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get`` so a stalled connection cannot hang
            the notebook forever.

    Returns:
        list: The ``<script>`` tags of the parsed page that have at least one
        child node, in document order.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Response was code " + str(response.status_code))
    parsed_html = BeautifulSoup(response.text, 'html.parser')
    # Return the filtered list: tags without contents carry no data.
    return [
        script
        for script in parsed_html.find_all('script')
        if len(script.contents) > 0
    ]

def _decode_json_parse_literal(js_value):
    """Decode the string literal inside a ``JSON.parse('...')`` expression.

    Args:
        js_value: JavaScript source text expected to contain a
            ``JSON.parse('...')`` call.

    Returns:
        The parsed JSON value, or ``None`` when ``js_value`` contains no
        ``JSON.parse('...')`` call.
    """
    content = re.findall(r'JSON\.parse\(\'(.*)\'\)', js_value)
    if not content:
        return None
    # The literal is backslash-escaped; undo the escaping before parsing.
    decoded_content = codecs.escape_decode(content[0])[0].decode('utf-8')
    return json.loads(decoded_content)


def get_team(url):
    """Extract the ``teamsData`` and ``playersData`` payloads from a page.

    Each child of every ``<script>`` tag returned by ``get_script`` is
    inspected. When the text before its first ``=`` is ``var teamsData`` or
    ``var playersData``, the ``JSON.parse('...')`` literal after it is decoded.

    Args:
        url: Address of the page to scrape.

    Returns:
        tuple: ``(teamData, playerData)``. Either element is an empty dict
        when the matching variable is absent or has no decodable
        ``JSON.parse('...')`` literal.
    """
    teamData = {}
    playerData = {}
    for script in get_script(url):
        for c in script.contents:
            # Split on the first '=' only, so an '=' inside the payload
            # cannot cut the JSON.parse(...) expression short.
            name, _, value = c.partition('=')
            name = name.strip()
            if name == 'var teamsData':
                parsed = _decode_json_parse_literal(value)
                if parsed is not None:
                    teamData = parsed
            elif name == 'var playersData':
                parsed = _decode_json_parse_literal(value)
                if parsed is not None:
                    playerData = parsed
    return teamData, playerData

def parse_league_data(league,year):
    """Scrape one league season and write its data to CSV files.

    The page ``https://understat.com/league/{league}/{year}`` is scraped via
    ``get_team``. One CSV per team (built from that team's ``"history"``
    records and named after its ``"title"`` with spaces replaced by
    underscores) and one CSV with all player records are written into the
    ``league/year`` directory, which is created if needed.

    Args:
        league: League identifier used both in the URL and as the top-level
            output directory name.
        year: Season identifier; a string or an integer.

    Returns:
        None. The results are the files written to disk.
    """
    # The URL f-string accepts any type, but os.path.join needs a str;
    # normalise once so integer seasons work too.
    year = str(year)
    url = f"https://understat.com/league/{league}/{year}"
    folder_creation(league)
    year_folder = os.path.join(league, year)
    folder_creation(year_folder)

    teamData, playerData = get_team(url)
    for data in teamData.values():
        team_frame = pd.DataFrame.from_records(data["history"])
        team = data["title"].replace(' ', '_')
        team_frame.to_csv(os.path.join(year_folder, team) + ".csv", index=False)

    player_frame = pd.DataFrame.from_records(playerData)
    # The file name is "<year>understat_player.csv" with no separator after
    # the year; kept as is so previously written files keep their names.
    player_frame.to_csv(os.path.join(year_folder, year) + 'understat_player.csv', index=False)

def main():
    """Run ``parse_league_data`` for every league/season combination.

    Leagues are processed in the order listed, and for each league the
    seasons are processed from the first to the last listed.
    """
    league_names = ["EPL", "La_liga", "Bundesliga", "Serie_A", "Ligue_1"]
    season_labels = ["2014", "2015", "2016", "2017", "2018", "2019"]
    position = 0
    while position < len(league_names) * len(season_labels):
        # Row-major walk over the league x season grid.
        league_index, season_index = divmod(position, len(season_labels))
        parse_league_data(league_names[league_index], season_labels[season_index])
        position += 1

# Start the full scrape (all leagues and seasons listed in main()) when this
# code runs as the main module.
# NOTE(review): presumably this also fires when the cell is executed in a
# notebook kernel, so running the cell triggers every download - confirm.
if __name__ == '__main__':
    main()

In [100]:
def get_player_data(url):
    """Extract only the ``playersData`` payload from a page.

    Each child of every ``<script>`` tag returned by ``get_script`` is
    inspected. When the text before its first ``=`` is ``var playersData``,
    the ``JSON.parse('...')`` literal after it is decoded.

    Args:
        url: Address of the page to scrape.

    Returns:
        The decoded ``playersData`` value, or an empty dict when the variable
        is absent or has no decodable ``JSON.parse('...')`` literal.
    """
    playerData = {}
    # get_script is the fetch helper defined in this file; the previously
    # referenced get_url is not defined anywhere in it.
    for script in get_script(url):
        for c in script.contents:
            # Split on the first '=' only, so an '=' inside the payload
            # cannot cut the JSON.parse(...) expression short.
            name, _, value = c.partition('=')
            if name.strip() != 'var playersData':
                continue
            content = re.findall(r'JSON\.parse\(\'(.*)\'\)', value)
            if not content:
                continue
            decoded_content = codecs.escape_decode(content[0])[0].decode('utf-8')
            playerData = json.loads(decoded_content)
    return playerData

In [101]:
# Manual check: fetch the La Liga 2018 player records and echo the result as
# the cell output.
# NOTE(review): this echoes the whole return value; consider displaying only
# a few records once the return type is guaranteed to be a list.
get_player_data("https://understat.com/league/La_liga/2018")

[{'id': '2097',
  'player_name': 'Lionel Messi',
  'games': '34',
  'time': '2704',
  'goals': '36',
  'xG': '25.997169069945812',
  'assists': '13',
  'xA': '15.33516551926732',
  'shots': '170',
  'key_passes': '93',
  'yellow_cards': '3',
  'red_cards': '0',
  'position': 'F S',
  'team_title': 'Barcelona',
  'npg': '32',
  'npxG': '22.28090887516737',
  'xGChain': '38.45987746119499',
  'xGBuildup': '10.69879900291562'},
 {'id': '2098',
  'player_name': 'Luis Suárez',
  'games': '33',
  'time': '2832',
  'goals': '21',
  'xG': '24.39443599805236',
  'assists': '6',
  'xA': '7.323390703648329',
  'shots': '112',
  'key_passes': '47',
  'yellow_cards': '5',
  'red_cards': '0',
  'position': 'F S',
  'team_title': 'Barcelona',
  'npg': '17',
  'npxG': '21.421453412622213',
  'xGChain': '36.808273531496525',
  'xGBuildup': '12.378092227503657'},
 {'id': '2370',
  'player_name': 'Karim Benzema',
  'games': '36',
  'time': '2972',
  'goals': '21',
  'xG': '18.669334318488836',
  'assists

In [110]:
# folder_creation takes a single path argument, so the league and season
# are joined into one path instead of being passed as two arguments.
folder_creation(os.path.join("EPL", "2019"))

In [136]:
# Manual single-season run: scrape EPL 2018 and write its CSV files.
parse_league_data("EPL" , "2018")

In [115]:
# Scratch cell: make sure the "Epl" directory exists. exist_ok=True makes
# the call safe to re-run without a separate existence check.
os.makedirs("Epl", exist_ok=True)

In [116]:
# Scratch cell: make sure the "Epl/2019" season directory exists.
# The earlier version ran only when "Epl" was missing and then tried to
# create a child of the missing "epl" directory; makedirs creates the parent
# too and is a no-op when the directory is already there.
season_dir = os.path.join("Epl", "2019")
os.makedirs(season_dir, exist_ok=True)


In [147]:
# League identifiers and season labels to scrape; both are substituted into
# the understat URL and used as output directory names by parse_league_data.
leagues = [
    "EPL",
    "La_liga",
    "Bundesliga",
    "Serie_A",
    "Ligue_1",
]
years = [
    "2014",
    "2015",
    "2016",
    "2017",
    "2018",
    "2019",
]

In [148]:
# Scrape every league/season combination from the `leagues` and `years`
# lists defined in the previous cell. This repeats what main() does with the
# same values, so running both downloads everything twice.
for league in leagues:
    for year in years:
        parse_league_data(league,year)