In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC, ui
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import requests
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
def button_click(xpath): 
    button = ui.WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, xpath)))
    click = button.find_element(By.XPATH, xpath)
    driver.execute_script("arguments[0].click();", click)

In [3]:
def teams_matching(input_string, string_list):
    max_similarity = 0
    most_similar = None
    for candidate in string_list:
        similarity = fuzz.ratio(input_string.lower(), candidate)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = candidate
    return most_similar

In [4]:
def add_value(match_stats, team, key, value):
    if team not in match_stats:
        match_stats[team] = {}
    if key not in match_stats[team]:
        match_stats[team][key] = []
    match_stats[team][key].append(value)

In [5]:
def stats_wrangling():
    driver.implicitly_wait(10)
    
    table = driver.find_element(By.XPATH, "//tbody[@class='matchCentreStatsContainer']")
    while len(table.text) == 0:
        table = driver.find_element(By.XPATH, "//tbody[@class='matchCentreStatsContainer']")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), 'html.parser')

    ### Reg ex to wrangle html script into a list i.e ->  ['40.3 Possession % 59.7', '1 Shots on target 1'...]
    stats_list = [' '.join([re.sub(r'\s+', ' ', cell.get_text(strip=True)) for cell in row.find_all('td')]) for row in soup.find_all('tr')]
    ###
    
    ### Find anchor element containing name of home/away team
    a_tag = driver.find_elements(By.XPATH, "//table//thead//tr//th//a")
    a_tag = list(filter(lambda x: len(x.text) > 0, a_tag))

    hteam = a_tag[0].text
    ateam = a_tag[1].text
    ### 

    ### Find match score
    score = driver.find_element(By.CSS_SELECTOR, "div.mc-summary__score.js-mc-score").text.split(' - ')
    ###

    ### Find match date
    date = driver.find_element(By.CSS_SELECTOR, "div.mc-summary__info").text
    date = datetime.strptime(date, "%a %d %b %Y").date()
    ###
    
    ### Separating metrics from statistics 
    stats, labels = [], []
    for item in stats_list:
        matches = re.findall(r"(\d+\.\d+|\d+|\D+)", item)
        stats += matches[::2]
        labels += matches[1::2]
    labels = ["Opposition", "Date", "Venue", "Goals Scored", "Goals Conceded"] + list(map(lambda x: x.strip(), labels))
    stats = [ateam, hteam, date, date, 'H', 'A', score[0], score[1], score[1], score[0]] + stats
    hteam_stats = stats[::2]
    ateam_stats = stats[1::2]
    ### 
    
    return hteam, hteam_stats, ateam, ateam_stats, labels

In [6]:
base_url = "https://www.premierleague.com"
table_url = "https://www.premierleague.com/tables"

In [7]:
table_req = requests.get(table_url)

In [8]:
soup = BeautifulSoup(table_req.text)
table_body = soup.select("tbody.league-table__tbody")[0]

In [9]:
links = table_body.find_all('a')

In [10]:
team_links_partial = list(set([i.get("href") for i in links if '/clubs/' in i.get("href")]))

In [11]:
import re
## Automate this
team_list = sorted([re.search(r"/([^/]+)/" + re.escape("overview"), i).group(1) for i in team_links_partial])
team_list.append('Reading')
team_list.append('Bolten')
team_list.append('Sheffield-United')
team_list.append('Charlton')
team_list.append('Sheffield-United')
team_list.append('Middlesbrough')
team_list.append('Portsmouth')
team_list.append('Blackburn-Rovers')
team_list.append('Wigan-Athletic')
team_list.append('Derby-County')
team_list.append('Watford')
team_list = sorted(team_list)
team_list

['Arsenal',
 'Aston-Villa',
 'Blackburn-Rovers',
 'Bolten',
 'Bournemouth',
 'Brentford',
 'Brighton-and-Hove-Albion',
 'Burnley',
 'Charlton',
 'Chelsea',
 'Crystal-Palace',
 'Derby-County',
 'Everton',
 'Fulham',
 'Liverpool',
 'Luton-Town',
 'Manchester-City',
 'Manchester-United',
 'Middlesbrough',
 'Newcastle-United',
 'Nottingham-Forest',
 'Portsmouth',
 'Reading',
 'Sheffield-United',
 'Sheffield-United',
 'Sheffield-United',
 'Tottenham-Hotspur',
 'Watford',
 'West-Ham-United',
 'Wigan-Athletic',
 'Wolverhampton-Wanderers']

In [12]:
df_dict = {'Date': [], 'Opposition': [], 'Venue': [], 'Goals Scored': [], 'Goals Conceded': [], 'Possession %': [], 'Shots on target': [], 'Shots': [], 'Touches': [], 'Passes': [], 'Tackles': [], 'Clearances': [], 'Corners': [], 'Offsides': [], 'Yellow cards': [], 'Fouls conceded': []}

In [13]:
if __name__ == "__main__":
    driver = webdriver.Chrome()
    match_stats = {}
    for i in range(5567, 5650):
        driver.get(f"https://www.premierleague.com/match/{i}")
        
        if i == 5567:
            button_click("//button[@id='onetrust-accept-btn-handler']")
        try:
            button_click("//li[text()='Stats']")
        except:
            continue
    
        hteam, hteam_stats, ateam, ateam_stats, labels = stats_wrangling()
        ateam = teams_matching(ateam, team_list)
        hteam = teams_matching(hteam, team_list)
        for i in range(len(hteam_stats)):
            add_value(match_stats, hteam, labels[i], hteam_stats[i])
            add_value(match_stats, ateam, labels[i], ateam_stats[i])

In [22]:
df = pd.DataFrame(match_stats)
df.drop("Red cards", axis=0, inplace=True)

In [39]:
pd.DataFrame(df['Arsenal'].tolist(), df['Arsenal'].keys()).T

Unnamed: 0,Opposition,Date,Venue,Goals Scored,Goals Conceded,Possession %,Shots on target,Shots,Touches,Passes,Tackles,Clearances,Corners,Offsides,Yellow cards,Fouls conceded
0,Aston Villa,2006-08-19,H,1,1,72.9,7,24,807,631,22,14,18,2,1,10
1,Man City,2006-08-26,A,0,1,62.2,3,23,681,505,30,23,8,3,1,8
2,Middlesbrough,2006-09-09,H,1,1,69.2,9,22,729,581,20,9,14,0,1,8
3,Man Utd,2006-09-17,A,1,0,50.6,6,16,607,427,25,34,5,4,2,13
4,Sheffield Utd,2006-09-23,H,3,0,62.3,6,14,697,499,35,31,7,5,2,13
5,Charlton,2006-09-30,A,2,1,66.0,8,23,743,566,16,28,7,2,4,8
6,Watford,2006-10-14,H,3,0,62.0,7,16,623,413,18,55,8,4,0,8
