Note: \
This could be much faster with asynchronous programming and parsel (`import asyncio`, `import aiohttp`, `from parsel import Selector`); lxml was faster too. \
Combining asyncio with multiprocessing would give a faster return time; do this for later use. \
The DataFrames could even be merged in parallel (right after each one is returned) instead of merging them one by one after all the DataFrames have been collected at the end.


In [1]:
import concurrent.futures
import random
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [26]:
def getAllClubLinks(url):
    """Return the absolute fbref URL of every club linked from a league page.

    Downloads ``url``, takes the first ``table.stats_table`` element on the
    page and collects every anchor inside it whose ``href`` contains
    ``"/squads/"``.

    Args:
        url: Address of the league page to scan.

    Returns:
        A list of links prefixed with ``https://fbref.com``, in the order the
        anchors appear in the table.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains no ``table.stats_table`` element.
    """
    # A timeout keeps a stalled connection from hanging the caller forever,
    # and raise_for_status() reports a failed download at its source.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # select the first table and walk all of its anchor tags
    teamTable = soup.select("table.stats_table")[0]
    hrefs = (anchor.get("href") for anchor in teamTable.find_all("a"))

    # Anchors without an href yield None; skip them instead of failing on
    # the substring test.
    return [f"https://fbref.com{href}" for href in hrefs if href and "/squads/" in href]

In [27]:
def getAllClubData(linkToLeague):
    """Collect the per-club data of every club linked from a league page.

    The club links are resolved once with ``getAllClubLinks``; each club is
    then processed by ``getClubData`` in its own thread, and the resulting
    frames are stacked row-wise in completion order.

    Args:
        linkToLeague: Address of the league page listing the clubs.

    Returns:
        One DataFrame holding the rows of all clubs, with the abbreviated
        column labels replaced by their long names. An empty DataFrame is
        returned when no club produced data.

    Raises:
        Exception: Whatever a club task raised; ``future.result()`` re-raises
            it here.
    """
    # runs only once for all clubs, returns team links
    allClubLinks = getAllClubLinks(linkToLeague)
    names = {
        "Comp": "Competition",
        "Poss": "Possession",
        "Sh": "Shots",
        "SoT": "Shots Target",
        "CrdY": "Yellow",
        "CrdR": "Red",
        "Fls": "Fouls",
        "Off": "Offside",
    }

    frames = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # submit each club as a separate task: the first argument of submit
        # is the function to run, the following ones are its arguments
        club_tasks = [executor.submit(getClubData, clubLink) for clubLink in allClubLinks]

        # gather the results in the order the tasks finish
        for future in concurrent.futures.as_completed(club_tasks):
            club_data = future.result()
            if club_data is not None:
                frames.append(club_data)

    # Concatenate once at the end: concatenating inside the loop copies the
    # accumulated rows again on every iteration.
    result = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    # rename columns
    return result.rename(columns=names)


def getClubDataDiffPage(requestData, param, match, keepList):
    """Fetch a table from a page that is linked on an already downloaded page.

    The first anchor in ``requestData`` whose ``href`` contains ``param`` is
    followed, the first table on that page matching ``match`` is parsed, and
    only the ``keepList`` columns are kept.

    Args:
        requestData: Response of the page holding the link; only its
            ``.text`` attribute is read.
        param: Substring that identifies the wanted link,
            e.g. ``"/all_comps/shooting/"``.
        match: Text handed to ``pandas.read_html`` to select the table.
        keepList: Column labels to keep, named as they appear after the top
            header level has been dropped.

    Returns:
        A DataFrame with the ``keepList`` columns and without the last row.

    Raises:
        IndexError: If no link on the page contains ``param``.
        requests.HTTPError: If the linked page answers with an error status.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(requestData.text, "html.parser")

    # find the first anchor tag whose href contains param
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    target = next((href for href in hrefs if href and param in href), None)
    if target is None:
        # Same exception type as indexing an empty list, but it says what
        # was missing.
        raise IndexError(f"no link containing {param!r} found on the page")
    pageUrl = "https://fbref.com" + target

    # to comply with the rate limit; the delay is randomized per call,
    # presumably so that parallel workers don't all hit the site at once
    time.sleep(random.randint(10, 30))
    response = requests.get(pageUrl, timeout=30)
    response.raise_for_status()

    # read_html expects a file-like object; passing the raw HTML string is
    # deprecated in recent pandas releases.
    table = pd.read_html(StringIO(response.text), match=match)[0]

    # drop one of the two header levels
    table.columns = table.columns.droplevel()
    table = table[keepList]

    # the last row contains overall statistics, so it is ignored
    return table[:-1]

def getClubData(linkToClub):
    """Build the Premier League match table of a single club.

    Downloads the club page, reads its "Scores & Fixtures" table, adds the
    shooting and miscellaneous statistics from their linked pages (joined on
    ``Date``), keeps only the rows whose competition is "Premier League" and
    tags every row with the club name derived from the link.

    Args:
        linkToClub: Address of the club page; its last path segment is used
            as the club name after removing ``-Stats`` and replacing dashes
            with spaces.

    Returns:
        A DataFrame with the score, shooting and miscellaneous columns plus a
        ``Team`` column. Column labels are still the abbreviated ones
        (``Comp``, ``Poss``, ...).

    Raises:
        requests.HTTPError: If the club page answers with an error status.
    """
    # randomized delay to comply with the rate limit
    time.sleep(random.randint(10, 30))
    clubResponse = requests.get(linkToClub, timeout=30)
    clubResponse.raise_for_status()
    clubName = linkToClub.split("/")[-1].replace("-Stats", "").replace("-", " ")

    # get Scores (on the same page as the first crawled page); read_html
    # expects a file-like object, raw HTML strings are deprecated
    scores = pd.read_html(StringIO(clubResponse.text), match="Scores & Fixtures")[0]

    # only keep these columns
    scores = scores[['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'Poss']]

    # get shooting and other stats, pausing between the two extra pages
    shooting = getClubDataDiffPage(clubResponse, "/all_comps/shooting/", "Shooting", ['Date', 'Sh', "SoT"])
    time.sleep(random.randint(10, 30))
    misc = getClubDataDiffPage(clubResponse, "/all_comps/misc/", "Miscellaneous", ['Date', 'CrdY', 'CrdR', 'Fls', 'Off'])

    # 'Date' is the only column the three tables share, so name it explicitly
    finalClubData = scores.merge(shooting, how='left', on='Date').merge(misc, how='left', on='Date')

    # The competition column is still labelled "Comp" here (it is only
    # renamed to "Competition" by the caller), so filter on "Comp".
    # copy() detaches the filtered rows so adding "Team" is a plain assignment.
    finalClubData = finalClubData[finalClubData["Comp"] == "Premier League"].copy()
    finalClubData["Team"] = clubName
    return finalClubData

# League page used as the entry point of the crawl.
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
# Runs the whole crawl when this cell/module is executed. Every club task
# sleeps 10-30 s before each of its requests, so this call blocks for a while.
res = getAllClubData(url)

<!-- <a href="/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/keeper/Manchester-City-Match-Logs-All-Competitions">Goalkeeping</a>  /

<a href="/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/passing/Manchester-City-Match-Logs-All-Competitions">Passing</a>

<a href="/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions">Shooting</a>
 -->