In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [2]:
def get_match_raw(url):
    """Download a match page and parse it with BeautifulSoup.

    Args:
        url: Absolute URL of the match page.

    Returns:
        A BeautifulSoup tree built from the response body with the
        "html.parser" backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
    """
    # A timeout stops one stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    # Fail here rather than parsing an error page and hitting an unrelated
    # IndexError later in the field extractors.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def get_match_schedule(raw):
    """Return the `.string` of the first <time> element in the parsed page.

    Raises IndexError when the page contains no <time> element.
    """
    time_tags = raw.find_all("time")
    first_time_tag = time_tags[0]
    return first_time_tag.string

def get_venue_name(raw):
    """Return the `.string` of the first venue <span> in the parsed page.

    The venue is located by its CSS class,
    "sdc-site-match-header__detail-venue". Raises IndexError when no such
    span exists.
    """
    venue_selector = {"class": "sdc-site-match-header__detail-venue"}
    venue_spans = raw.find_all("span", venue_selector)
    return venue_spans[0].string

def get_match_title(raw):
    """Return the `.string` of the parsed page's <title> element."""
    title_tag = raw.title
    return title_tag.string

def get_team_names(title, title_regex):
    """Extract both team names from a match page title.

    `title_regex` must be a compiled pattern exposing the named groups
    'teamA' and 'teamB'; it is applied with `match`, i.e. anchored at the
    start of `title`.

    Returns:
        A (teamA, teamB) tuple of strings.

    Raises:
        AttributeError: If the pattern does not match the title.
    """
    parsed_title = title_regex.match(title)
    # Passing several group names to `group` returns them as a tuple.
    return parsed_title.group('teamA', 'teamB')

def get_team_scores(title, title_regex):
    """Extract both teams' scores from a match page title.

    `title_regex` must be a compiled pattern exposing the named groups
    'teamA_score' and 'teamB_score'; it is applied with `match`, i.e.
    anchored at the start of `title`.

    Returns:
        A (teamA_score, teamB_score) tuple. The scores are the matched
        text, so they are strings, not ints.

    Raises:
        AttributeError: If the pattern does not match the title.
    """
    parsed_title = title_regex.match(title)
    # Passing several group names to `group` returns them as a tuple.
    return parsed_title.group('teamA_score', 'teamB_score')

def get_winner_details(record):
    """Return the winning team's name and score for one match record.

    Args:
        record: Mapping (dict or DataFrame row) with the keys 'teamA',
            'teamB', 'teamA_score' and 'teamB_score'. Scores may be ints or
            numeric strings.

    Returns:
        A (team_name, score) tuple taken unchanged from `record`. When the
        scores are equal (a draw), team B's details are returned; callers
        rely on always getting a team name back.

    Raises:
        ValueError: If a score cannot be converted to an integer.
    """
    # The scores come from regex capture groups and are strings, so compare
    # them numerically: as text, '10' > '9' is False.
    team_a_goals = int(record['teamA_score'])
    team_b_goals = int(record['teamB_score'])

    if team_a_goals > team_b_goals:
        return record['teamA'], record['teamA_score']
    return record['teamB'], record['teamB_score']

In [3]:
# Root of the results pages; a season string such as '2019-20' is appended to
# it to build the URL that is actually requested.
base_url = 'https://www.skysports.com/premier-league-results/'

def get_match_details(year_range, limit=2):
    """Scrape per-match details for one season's results page.

    Fetches `base_url + year_range`, collects the match links on it, then
    downloads each match page and extracts schedule, venue, teams, scores
    and the winner.

    Args:
        year_range: Season suffix appended to `base_url`, e.g. '2019-20'.
        limit: Maximum number of match links to process. Defaults to 2,
            which is the small sample used while testing; pass None to
            process every link found on the page.

    Returns:
        A DataFrame with one row per match and the columns 'url', 'raw',
        'schedule', 'venue', 'raw_title', 'teamA', 'teamB', 'teamA_score',
        'teamB_score', 'winner_Team' and 'winner_score'; or None when the
        results page did not answer with HTTP 200 (a message is printed).
    """
    season_url = base_url + year_range
    response = requests.get(season_url, timeout=30)  # Fetch HTML Page
    if response.status_code != 200:
        # Report the URL that was actually requested, not just the base.
        print(f'{season_url} request failed with status code {response.status_code}')
        return None

    soup = BeautifulSoup(response.text, "html.parser")  # Parse HTML Page
    links = soup.find_all("a", {"class": "matches__item matches__link"}, limit=None)

    #TODO: check for pagination
    # Slicing with limit=None keeps every link.
    match_urls = [link.attrs['href'] for link in links[:limit]]
    match_details = pd.DataFrame(match_urls, columns=['url'])

    match_details['raw'] = match_details['url'].apply(get_match_raw)
    match_details['schedule'] = match_details['raw'].apply(get_match_schedule)
    match_details['venue'] = match_details['raw'].apply(get_venue_name)
    match_details['raw_title'] = match_details['raw'].apply(get_match_title)

    # Title format: "<teamA> <scoreA> - <scoreB> <teamB> - ..."
    title_regex = re.compile(r'(?P<teamA>[a-zA-Z_ \']*) (?P<teamA_score>\d?\d) - (?P<teamB_score>\d?\d) (?P<teamB>[a-zA-Z_ \']*) -')
    match_details[['teamA', 'teamB']] = pd.DataFrame(
        match_details['raw_title']
        .apply(lambda x: get_team_names(x, title_regex))
        .tolist(), index=match_details.index)
    match_details[['teamA_score', 'teamB_score']] = pd.DataFrame(
        match_details['raw_title']
        .apply(lambda x: get_team_scores(x, title_regex))
        .tolist(), index=match_details.index)
    match_details[['winner_Team', 'winner_score']] = pd.DataFrame(
        match_details
        .apply(get_winner_details, axis=1)
        .tolist(), index=match_details.index)
    return match_details

In [4]:
# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to override it without editing the notebook; the
# fallback is the original hard-coded path.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', 'C:/webdrivers/chromedriver')
service = Service(chromedriver_path)

# Link config
google_base_url = 'https://www.google.com/search?q='

In [5]:
class SoccerCovid:
    """Selenium-driven scraper that adds location columns to match records.

    For every match it searches Google for the venue (and for the winning
    club), follows the first Wikipedia link in the results and reads the
    table cell next to the header cell containing 'Location'.
    """

    def __init__(self):
        self.final_data = {}

    def get_driver(self):
        """
        Returns a new instance of selenium webdriver
        """
        return webdriver.Chrome(service=service)

    def get_url(self, param):
        """Build the Google search URL for `param` plus the word 'wikipedia'."""
        from urllib.parse import quote_plus

        # quote_plus turns spaces into '+' (as before) and also escapes
        # characters such as '&' or '#' that would otherwise cut the query short.
        return ''.join([google_base_url, quote_plus(param + ' wikipedia')])

    def get_by_xpath(self, driver, xpath):
        """Return every element on the current page that matches `xpath`."""
        return driver.find_elements(By.XPATH, xpath)

    def _first_by_xpath(self, driver, xpath, description):
        """Return the first element matching `xpath`.

        Raises IndexError (the same type a bare `[0]` would raise) with a
        message naming what was being looked for.
        """
        matches = self.get_by_xpath(driver, xpath)
        if not matches:
            raise IndexError(f'no {description} found (xpath: {xpath})')
        return matches[0]

    def navigate_to_site(self, driver, url):
        """Open `url` in the browser, then pause for two seconds."""
        driver.get(url)
        time.sleep(2)

    def get_wiki_url(self, driver, param, ext = False):
        """Google `param` and return the href of the first Wikipedia link.

        When `ext` is true, ' Football Club' is appended to the search term.
        Raises IndexError when the results contain no Wikipedia link.
        """
        param += ' Football Club' if ext else ''
        url = self.get_url(param)
        self.navigate_to_site(driver, url)
        xpath = "//a[contains(@href, 'wikipedia.org/wiki')]"
        link = self._first_by_xpath(driver, xpath, f'Wikipedia link for {param!r}')
        return link.get_attribute('href')

    def get_venue_location(self, match, driver):
        """Return the 'Location' cell text from the Wikipedia page of match['venue']."""
        wiki_url = self.get_wiki_url(driver, match['venue'])

        self.navigate_to_site(driver, wiki_url)
        xpath = "//th[contains(text(), 'Location' )]/following-sibling::td"
        return self._first_by_xpath(driver, xpath, f'Location cell on {wiki_url}').text

    def get_winner_location(self, match, driver):
        """Return the 'Location' cell text for the winning club's ground.

        Looks up match['winner_Team'] on Wikipedia, follows the link in the
        club page's 'Ground' row, and reads the 'Location' cell of that page.
        """
        wiki_url = self.get_wiki_url(driver, match['winner_Team'], True)
        self.navigate_to_site(driver, wiki_url)

        xpath = "//th[contains(text(), 'Ground' )]/following-sibling::td/a"
        ground_link = self._first_by_xpath(driver, xpath, f'Ground link on {wiki_url}')
        venue_url = ground_link.get_attribute('href')

        self.navigate_to_site(driver, venue_url)

        xpath = "//th[contains(text(), 'Location' )]/following-sibling::td"
        return self._first_by_xpath(driver, xpath, f'Location cell on {venue_url}').text

    def scrape_pages(self, match_details):
        """Add 'venue_location' and 'winner_location' columns to `match_details`.

        The frame is modified in place and also returned. One browser is
        used for all lookups and is always closed, even when a lookup raises.
        """
        driver = self.get_driver()
        try:
            match_details['venue_location'] = match_details.apply(lambda x: self.get_venue_location(x, driver), axis = 1)
            match_details['winner_location'] = match_details.apply(lambda x: self.get_winner_location(x, driver), axis = 1)
        finally:
            # Without this, an exception above would leave the browser running.
            driver.quit()
        return match_details

    def start_scraping(self, match_details_df):
        """Entry point: run the location scrape on `match_details_df`."""
        return self.scrape_pages(match_details_df)

In [6]:
# Seasons to scrape; each one is written to its own CSV file.
dates = ['2019-20', '2020-21']

for date in dates:
    file_name = 'covid_soccer_' + date + '.csv'
    if os.path.exists(file_name):
        # Already scraped on an earlier run; the CSV acts as the cache.
        continue

    match_details_df = get_match_details(date)
    if match_details_df is None:
        # get_match_details prints the failure and returns None when the
        # results page could not be fetched; skip instead of crashing.
        continue

    result_df = SoccerCovid().start_scraping(match_details_df)
    # The parsed pages and raw titles are only intermediates; keep them out of the CSV.
    result_df.drop(columns=['raw','raw_title'], inplace=True)
    result_df.to_csv(file_name)

In [7]:
# # Manual Regex extraction
# url = 'https://www.skysports.com/premier-league-results/2019-20'
# site = str(requests.get(url).content)
# regex = r'<a href="(https:\/\/www\.skysports\.com\/football\/[a-zA-Z-\/]+\d+)" class="matches__item matches__link"'
# result = re.findall(regex, site)