Note: \
This would probably be much faster with asynchronous programming and parsel (`import asyncio`, `import aiohttp`, `from parsel import Selector`); the lxml parser was faster too. \
TODO: combine asyncio with multiprocessing later for a faster return time.


In [10]:
import io
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [7]:
def getAllClubLinks(url):
    """Return absolute fbref.com URLs for the squads linked from a page's first stats table.

    Args:
        url: Address of a page whose first ``table.stats_table`` element
            contains anchors pointing at ``/squads/`` pages.

    Returns:
        A list of ``https://fbref.com``-prefixed squad URLs in document
        order. Empty when the page has no stats table or the table holds no
        squad links.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    tables = soup.select("table.stats_table")
    if not tables:
        return []
    hrefs = (anchor.get("href") for anchor in tables[0].find_all("a"))
    # Anchors without an href yield None, which must be skipped before the
    # substring test.
    return [f"https://fbref.com{href}" for href in hrefs if href and "/squads/" in href]


In [50]:
def getClubData(url, param="/all_comps/shooting/"):
    """Print and return the first link on a page whose href contains *param*.

    Args:
        url: Page to download and scan for anchors.
        param: Substring the wanted href must contain.

    Returns:
        The matching href prefixed with ``https://fbref.com``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If no anchor on the page contains *param*.
    """
    data = requests.get(url)
    data.raise_for_status()
    soup = BeautifulSoup(data.text, "html.parser")
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    matching = [href for href in hrefs if href and param in href]
    if not matching:
        raise IndexError(f"no link containing {param!r} found on {url}")
    link = "https://fbref.com" + matching[0]
    print(link)
    return link
    
# Try the squad-link extractor on a single club page rather than a league page.
url = "https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats"
getAllClubLinks(url)

[]

In [None]:
import concurrent.futures
import time

def getAllClubData(url, max_workers=None):
    """Scrape every club linked from a league page in a thread pool and stack the results.

    Args:
        url: League page handed to ``getAllClubLinks``.
        max_workers: Thread-pool size; ``None`` keeps the executor default.

    Returns:
        The per-club frames returned by ``processClubData`` concatenated
        row-wise, in the order the club links were found. An empty DataFrame
        when no club produced data.

    Raises:
        Exception: Whatever a worker raised is re-raised here by
            ``Future.result()``.
    """
    # runs only once for all clubs, returns team links
    allClubLinks = getAllClubLinks(url)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit each club processing as a separate task
        club_tasks = [executor.submit(processClubData, link) for link in allClubLinks]

        # result() blocks until its task has finished, so no separate wait()
        # is needed. Sleeping here would only delay the return: the delays
        # that pace the requests live inside processClubData.
        club_frames = [task.result() for task in club_tasks]

    allClubData = [frame for frame in club_frames if frame is not None]
    if not allClubData:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return pd.DataFrame()
    return pd.concat(allClubData, axis=0)

def getClubData(requestData, param, match, keepList):
    """Follow a link on a club page and return selected columns of the table found there.

    Args:
        requestData: Response for the club page; only its ``.text`` is read.
        param: Substring identifying the wanted href, e.g.
            ``"/all_comps/shooting/"``.
        match: Text passed to ``pandas.read_html`` to pick the table.
        keepList: Column labels to keep once the outer header level is dropped.

    Returns:
        The *keepList* columns of the first matching table, without its
        last row.

    Raises:
        IndexError: If no anchor on the club page contains *param*.
        requests.HTTPError: If the linked page answers with an error status.
    """
    soup = BeautifulSoup(requestData.text, "html.parser")
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    candidates = [href for href in hrefs if href and param in href]
    if not candidates:
        raise IndexError(f"no link containing {param!r} found on the club page")
    # Random pause before the follow-up request.
    time.sleep(random.randint(10,30))
    response = requests.get("https://fbref.com" + candidates[0])
    response.raise_for_status()
    # Recent pandas deprecates passing literal HTML; a file-like wrapper is
    # accepted by old and new versions alike.
    table = pd.read_html(io.StringIO(response.text), match=match)[0]
    # droplevel() removes the outermost header level so that the labels in
    # keepList address the columns directly.
    table.columns = table.columns.droplevel()
    table = table[keepList]
    # The last row is discarded (presumably a totals row - confirm on the page).
    return table[:-1]

def processClubData(link):
    """Download one club page and build its Premier League match table.

    The fixtures table on the club page is left-merged with the shooting and
    miscellaneous tables fetched through ``getClubData``, columns are renamed
    to readable labels, rows are restricted to the Premier League, and a
    ``Team`` column carrying the club name is added.

    Args:
        link: Club page URL; the club name is derived from its last path
            segment.

    Returns:
        A DataFrame with one row per Premier League fixture of the club.

    Raises:
        requests.HTTPError: If the club page answers with an error status.
    """
    # Random pause before each club request.
    time.sleep(random.randint(10,30))
    club = requests.get(link)
    club.raise_for_status()
    clubName = link.split("/")[-1].replace("-Stats","").replace("-", " ")
    print(clubName)

    # get Scores (on the same page)
    # StringIO: recent pandas deprecates passing literal HTML to read_html.
    scores = pd.read_html(io.StringIO(club.text), match="Scores & Fixtures")[0]
    scores = scores[['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent', 'Poss']]

    # get shooting and other stat (figure better ways for these two)
    shooting = getClubData(club, "/all_comps/shooting/", "Shooting", ['Date', 'Sh', "SoT"])
    time.sleep(random.randint(10,30))
    misc = getClubData(club, "/all_comps/misc/", "Miscellaneous", ['Date', 'CrdY', 'CrdR', 'Fls', 'Off'])
    names = {"Comp": "Competition", "Poss": "Possession", "Sh":"Shots", "SoT":"Shots Target", "CrdY":"Yellow", "CrdR":"Red", "Fls":"Fouls","Off":"Offside"}

    # Merges join on whatever columns the frames share.
    finalClubData = scores.merge(shooting, how='left').merge(misc, how='left')
    finalClubData = finalClubData.rename(columns=names)
    # .copy() gives an independent frame, so adding the Team column below
    # does not write into a filtered slice (SettingWithCopyWarning).
    finalClubData = finalClubData[finalClubData["Competition"] == "Premier League"].copy()
    finalClubData["Team"] = clubName
    print(clubName, finalClubData)
    return finalClubData

# Run the threaded scrape from the Premier League overview page and show the combined frame.
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
res = getAllClubData(url)
print(res)

Tottenham Hotspur
Fulham
Brentford
Aston Villa
Liverpool
Arsenal
Chelsea
Newcastle United
Manchester City
Manchester United
Crystal Palace
Brighton and Hove Albion
Aston Villa           Date   Time     Competition         Round  Day Venue Result  GF  GA  \
0   2022-08-06  15:00  Premier League   Matchweek 1  Sat  Away      L   0   2   
1   2022-08-13  12:30  Premier League   Matchweek 2  Sat  Home      W   2   1   
2   2022-08-20  15:00  Premier League   Matchweek 3  Sat  Away      L   1   3   
4   2022-08-28  14:00  Premier League   Matchweek 4  Sun  Home      L   0   1   
5   2022-08-31  19:30  Premier League   Matchweek 5  Wed  Away      L   1   2   
6   2022-09-03  17:30  Premier League   Matchweek 6  Sat  Home      D   1   1   
7   2022-09-16  20:00  Premier League   Matchweek 8  Fri  Home      W   1   0   
8   2022-10-02  16:30  Premier League   Matchweek 9  Sun  Away      D   0   0   
9   2022-10-10  20:00  Premier League  Matchweek 10  Mon  Away      D   1   1   
10  2022-10-16

Wolverhampton Wanderers
Brentford           Date   Time     Competition         Round  Day Venue Result GF GA  \
0   2022-08-07  14:00  Premier League   Matchweek 1  Sun  Away      D  2  2   
1   2022-08-13  17:30  Premier League   Matchweek 2  Sat  Home      W  4  0   
2   2022-08-20  15:00  Premier League   Matchweek 3  Sat  Away      L  2  3   
4   2022-08-27  15:00  Premier League   Matchweek 4  Sat  Home      D  1  1   
5   2022-08-30  19:30  Premier League   Matchweek 5  Tue  Away      D  1  1   
6   2022-09-03  15:00  Premier League   Matchweek 6  Sat  Home      W  5  2   
7   2022-09-18  12:00  Premier League   Matchweek 8  Sun  Home      L  0  3   
8   2022-10-01  15:00  Premier League   Matchweek 9  Sat  Away      D  0  0   
9   2022-10-08  15:00  Premier League  Matchweek 10  Sat  Away      L  1  5   
10  2022-10-14  20:00  Premier League  Matchweek 11  Fri  Home      W  2  0   
11  2022-10-19  19:30  Premier League  Matchweek 12  Wed  Home      D  0  0   
12  2022-10-23  14

Newcastle United           Date   Time     Competition         Round  Day Venue Result GF GA  \
0   2022-08-06  15:00  Premier League   Matchweek 1  Sat  Home      W  2  0   
1   2022-08-13  15:00  Premier League   Matchweek 2  Sat  Away      D  0  0   
2   2022-08-21  16:30  Premier League   Matchweek 3  Sun  Home      D  3  3   
4   2022-08-28  14:00  Premier League   Matchweek 4  Sun  Away      D  1  1   
5   2022-08-31  20:00  Premier League   Matchweek 5  Wed  Away      L  1  2   
6   2022-09-03  15:00  Premier League   Matchweek 6  Sat  Home      D  0  0   
7   2022-09-17  15:00  Premier League   Matchweek 8  Sat  Home      D  1  1   
8   2022-10-01  15:00  Premier League   Matchweek 9  Sat  Away      W  4  1   
9   2022-10-08  15:00  Premier League  Matchweek 10  Sat  Home      W  5  1   
10  2022-10-16  14:00  Premier League  Matchweek 11  Sun  Away      D  0  0   
11  2022-10-19  19:30  Premier League  Matchweek 12  Wed  Home      W  1  0   
12  2022-10-23  16:30  Premier Leag

Brighton and Hove Albion           Date   Time     Competition         Round  Day Venue Result GF GA  \
0   2022-08-07  14:00  Premier League   Matchweek 1  Sun  Away      W  2  1   
1   2022-08-13  15:00  Premier League   Matchweek 2  Sat  Home      D  0  0   
2   2022-08-21  14:00  Premier League   Matchweek 3  Sun  Away      W  2  0   
4   2022-08-27  15:00  Premier League   Matchweek 4  Sat  Home      W  1  0   
5   2022-08-30  19:30  Premier League   Matchweek 5  Tue  Away      L  1  2   
6   2022-09-04  14:00  Premier League   Matchweek 6  Sun  Home      W  5  2   
7   2022-10-01  15:00  Premier League   Matchweek 9  Sat  Away      D  3  3   
8   2022-10-08  17:30  Premier League  Matchweek 10  Sat  Home      L  0  1   
9   2022-10-14  20:00  Premier League  Matchweek 11  Fri  Away      L  0  2   
10  2022-10-18  19:30  Premier League  Matchweek 12  Tue  Home      D  0  0   
11  2022-10-22  15:00  Premier League  Matchweek 13  Sat  Away      L  1  3   
12  2022-10-29  15:00  Prem

Bournemouth
Manchester City           Date   Time     Competition         Round  Day Venue Result   GF  \
1   2022-08-07  16:30  Premier League   Matchweek 1  Sun  Away      W  2.0   
2   2022-08-13  15:00  Premier League   Matchweek 2  Sat  Home      W  4.0   
3   2022-08-21  16:30  Premier League   Matchweek 3  Sun  Away      D  3.0   
4   2022-08-27  15:00  Premier League   Matchweek 4  Sat  Home      W  4.0   
5   2022-08-31  19:30  Premier League   Matchweek 5  Wed  Home      W  6.0   
6   2022-09-03  17:30  Premier League   Matchweek 6  Sat  Away      D  1.0   
9   2022-09-17  12:30  Premier League   Matchweek 8  Sat  Away      W  3.0   
10  2022-10-02  14:00  Premier League   Matchweek 9  Sun  Home      W  6.0   
12  2022-10-08  15:00  Premier League  Matchweek 10  Sat  Home      W  4.0   
14  2022-10-16  16:30  Premier League  Matchweek 11  Sun  Away      L  0.0   
15  2022-10-22  15:00  Premier League  Matchweek 13  Sat  Home      W  3.0   
17  2022-10-29  12:30  Premier Leagu

Manchester United           Date   Time     Competition         Round  Day Venue Result GF GA  \
0   2022-08-07  14:00  Premier League   Matchweek 1  Sun  Home      L  1  2   
1   2022-08-13  17:30  Premier League   Matchweek 2  Sat  Away      L  0  4   
2   2022-08-22  20:00  Premier League   Matchweek 3  Mon  Home      W  2  1   
3   2022-08-27  12:30  Premier League   Matchweek 4  Sat  Away      W  1  0   
4   2022-09-01  20:00  Premier League   Matchweek 5  Thu  Away      W  1  0   
5   2022-09-04  16:30  Premier League   Matchweek 6  Sun  Home      W  3  1   
8   2022-10-02  14:00  Premier League   Matchweek 9  Sun  Away      L  3  6   
10  2022-10-09  19:00  Premier League  Matchweek 10  Sun  Away      W  2  1   
12  2022-10-16  14:00  Premier League  Matchweek 11  Sun  Home      D  0  0   
13  2022-10-19  20:15  Premier League  Matchweek 12  Wed  Home      W  2  0   
14  2022-10-22  17:30  Premier League  Matchweek 13  Sat  Away      D  1  1   
16  2022-10-30  16:15  Premier Lea

Nottingham Forest
Leeds United
West Ham United           Date   Time     Competition         Round  Day Venue Result GF GA  \
0   2022-08-07  16:30  Premier League   Matchweek 1  Sun  Home      L  0  2   
1   2022-08-14  14:00  Premier League   Matchweek 2  Sun  Away      L  0  1   
3   2022-08-21  14:00  Premier League   Matchweek 3  Sun  Home      L  0  2   
5   2022-08-28  14:00  Premier League   Matchweek 4  Sun  Away      W  1  0   
6   2022-08-31  19:45  Premier League   Matchweek 5  Wed  Home      D  1  1   
7   2022-09-03  15:00  Premier League   Matchweek 6  Sat  Away      L  1  2   
10  2022-09-18  14:15  Premier League   Matchweek 8  Sun  Away      L  0  1   
11  2022-10-01  17:30  Premier League   Matchweek 9  Sat  Home      W  2  0   
13  2022-10-09  14:00  Premier League  Matchweek 10  Sun  Home      W  3  1   
15  2022-10-16  14:00  Premier League  Matchweek 11  Sun  Away      D  1  1   
16  2022-10-19  19:30  Premier League  Matchweek 12  Wed  Away      L  0  1   
17  2

Nottingham Forest           Date   Time     Competition         Round  Day Venue Result GF GA  \
0   2022-08-06  15:00  Premier League   Matchweek 1  Sat  Away      L  0  2   
1   2022-08-14  14:00  Premier League   Matchweek 2  Sun  Home      W  1  0   
2   2022-08-20  15:00  Premier League   Matchweek 3  Sat  Away      D  1  1   
4   2022-08-28  16:30  Premier League   Matchweek 4  Sun  Home      L  0  2   
5   2022-08-31  19:30  Premier League   Matchweek 5  Wed  Away      L  0  6   
6   2022-09-03  15:00  Premier League   Matchweek 6  Sat  Home      L  2  3   
7   2022-09-16  20:00  Premier League   Matchweek 8  Fri  Home      L  2  3   
8   2022-10-03  20:00  Premier League   Matchweek 9  Mon  Away      L  0  4   
9   2022-10-10  20:00  Premier League  Matchweek 10  Mon  Home      D  1  1   
10  2022-10-15  15:00  Premier League  Matchweek 11  Sat  Away      L  0  1   
11  2022-10-18  19:30  Premier League  Matchweek 12  Tue  Away      D  0  0   
12  2022-10-22  12:30  Premier Lea

Everton           Date   Time     Competition         Round  Day Venue Result  GF  GA  \
0   2022-08-06  17:30  Premier League   Matchweek 1  Sat  Home      L   0   1   
1   2022-08-13  12:30  Premier League   Matchweek 2  Sat  Away      L   1   2   
2   2022-08-20  15:00  Premier League   Matchweek 3  Sat  Home      D   1   1   
4   2022-08-27  15:00  Premier League   Matchweek 4  Sat  Away      D   1   1   
5   2022-08-30  20:00  Premier League   Matchweek 5  Tue  Away      D   1   1   
6   2022-09-03  12:30  Premier League   Matchweek 6  Sat  Home      D   0   0   
7   2022-09-18  14:15  Premier League   Matchweek 8  Sun  Home      W   1   0   
8   2022-10-01  15:00  Premier League   Matchweek 9  Sat  Away      W   2   1   
9   2022-10-09  19:00  Premier League  Matchweek 10  Sun  Home      L   1   2   
10  2022-10-15  17:30  Premier League  Matchweek 11  Sat  Away      L   0   2   
11  2022-10-19  19:30  Premier League  Matchweek 12  Wed  Away      L   0   1   
12  2022-10-22  15:0

Leeds United           Date   Time     Competition         Round  Day Venue Result  GF  GA  \
0   2022-08-06  15:00  Premier League   Matchweek 1  Sat  Home      W   2   1   
1   2022-08-13  15:00  Premier League   Matchweek 2  Sat  Away      D   2   2   
2   2022-08-21  14:00  Premier League   Matchweek 3  Sun  Home      W   3   0   
4   2022-08-27  15:00  Premier League   Matchweek 4  Sat  Away      L   0   1   
5   2022-08-30  20:00  Premier League   Matchweek 5  Tue  Home      D   1   1   
6   2022-09-03  15:00  Premier League   Matchweek 6  Sat  Away      L   2   5   
7   2022-10-02  16:30  Premier League   Matchweek 9  Sun  Home      D   0   0   
8   2022-10-09  14:00  Premier League  Matchweek 10  Sun  Away      L   1   2   
9   2022-10-16  14:00  Premier League  Matchweek 11  Sun  Home      L   0   1   
10  2022-10-20  20:15  Premier League  Matchweek 12  Thu  Away      L   0   2   
11  2022-10-23  14:00  Premier League  Matchweek 13  Sun  Home      L   2   3   
12  2022-10-29 

In [None]:
#get links to each club
def getAllClubLinks(url):
    """Collect the squad page URLs linked from the first stats table of *url*.

    Args:
        url: Address of a page whose first ``table.stats_table`` element
            contains anchors pointing at ``/squads/`` pages.

    Returns:
        A list of ``https://fbref.com``-prefixed squad URLs in document
        order. Empty when the page has no stats table or the table holds no
        squad links.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Surface 4xx/5xx responses instead of parsing an error page.
    response.raise_for_status()
    # An explicit parser keeps the result independent of which optional
    # parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")
    tables = soup.select("table.stats_table")
    if not tables:
        return []
    hrefs = (anchor.get("href") for anchor in tables[0].find_all("a"))
    # Skip anchors without an href (None) before the substring test.
    return [f"https://fbref.com{href}" for href in hrefs if href and "/squads/" in href]

def getClubData(requestData, param, match, keepList):
    """Follow a link on a club page and return selected columns of the table found there.

    Args:
        requestData: Response for the club page; only its ``.text`` is read.
        param: Substring identifying the wanted href, e.g.
            ``"/all_comps/shooting/"``.
        match: Text passed to ``pandas.read_html`` to pick the table.
        keepList: Column labels to keep once the outer header level is dropped.

    Returns:
        The *keepList* columns of the first matching table, without its
        last row.

    Raises:
        IndexError: If no anchor on the club page contains *param*.
        requests.HTTPError: If the linked page answers with an error status.
    """
    soup = BeautifulSoup(requestData.text, "html.parser")
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    candidates = [href for href in hrefs if href and param in href]
    if not candidates:
        raise IndexError(f"no link containing {param!r} found on the club page")
    response = requests.get("https://fbref.com" + candidates[0])
    response.raise_for_status()
    # Recent pandas deprecates passing literal HTML; a file-like wrapper is
    # accepted by old and new versions alike.
    table = pd.read_html(io.StringIO(response.text), match=match)[0]
    # droplevel() removes the outermost header level so that the labels in
    # keepList address the columns directly.
    table.columns = table.columns.droplevel()
    table = table[keepList]
    # The last row is discarded (presumably a totals row - confirm on the page).
    return table[:-1]

def getAllClubData(url):
    """Scrape every club linked from a league page one after another and stack the results.

    For each club the fixtures table is left-merged with the shooting and
    miscellaneous tables, columns are renamed to readable labels, rows are
    restricted to the Premier League, and a ``Team`` column is added.

    Args:
        url: League page handed to ``getAllClubLinks``.

    Returns:
        All per-club frames concatenated row-wise; an empty DataFrame when
        the league page yields no club links.

    Raises:
        requests.HTTPError: If a club page answers with an error status.
    """
    #runs only once for all clubs, returns team links
    allClubLinks = getAllClubLinks(url)
    names = {"Comp": "Competition", "Poss": "Possession", "Sh":"Shots", "SoT":"Shots Target", "CrdY":"Yellow", "CrdR":"Red", "Fls":"Fouls","Off":"Offside"}

    allClubData = []
    for link in allClubLinks:
        club = requests.get(link)
        club.raise_for_status()
        clubName = link.split("/")[-1].replace("-Stats","").replace("-", " ")
        print(clubName)

        #get Scores (on the same page)
        # StringIO: recent pandas deprecates passing literal HTML to read_html.
        scores = pd.read_html(io.StringIO(club.text), match="Scores & Fixtures")[0]
        scores = scores[['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent', 'Poss']]

        #get shooting and other stat (figure better ways for these two)
        shooting = getClubData(club, "/all_comps/shooting/", "Shooting", ['Date', 'Sh', "SoT"])
        misc = getClubData(club, "/all_comps/misc/", "Miscellaneous", ['Date', 'CrdY', 'CrdR', 'Fls', 'Off'])

        finalClubData = scores.merge(shooting, how='left').merge(misc, how='left')
        # Rename, filter and tag inside the loop so every club's frame is
        # processed before it is collected, not just the last one.
        finalClubData = finalClubData.rename(columns=names)
        # .copy() gives an independent frame, so adding the Team column does
        # not write into a filtered slice (SettingWithCopyWarning).
        finalClubData = finalClubData[finalClubData["Competition"] == "Premier League"].copy()
        finalClubData["Team"] = clubName
        allClubData.append(finalClubData)
        # Fixed pause between clubs.
        time.sleep(10)

    if not allClubData:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return pd.DataFrame()
    return pd.concat(allClubData, axis=0)

# Run the sequential scrape from the Premier League overview page.
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
getAllClubData(url)

In [12]:
#get links to each club
def getAllClubLinks(url):
    """List the squad page URLs found in the first stats table of *url*.

    Args:
        url: Address of a page whose first ``table.stats_table`` element
            contains anchors pointing at ``/squads/`` pages.

    Returns:
        A list of ``https://fbref.com``-prefixed squad URLs in document
        order. Empty when the page has no stats table or the table holds no
        squad links.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Surface 4xx/5xx responses instead of parsing an error page.
    response.raise_for_status()
    # An explicit parser keeps the result independent of which optional
    # parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")
    tables = soup.select("table.stats_table")
    if not tables:
        return []
    hrefs = (anchor.get("href") for anchor in tables[0].find_all("a"))
    # Skip anchors without an href (None) before the substring test.
    return [f"https://fbref.com{href}" for href in hrefs if href and "/squads/" in href]

# Show the squad links extracted from the Premier League overview page.
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
getAllClubLinks(url)

In [24]:
def getClubData(requestData, param, match, keepList):
    """Follow a link on a club page and return the full table found there.

    Exploration variant: every column of the table is returned.

    Args:
        requestData: Response for the club page; only its ``.text`` is read.
        param: Substring identifying the wanted href, e.g.
            ``"/all_comps/shooting/"``.
        match: Text passed to ``pandas.read_html`` to pick the table.
        keepList: Accepted for signature compatibility but currently
            ignored - no column selection is applied.

    Returns:
        The first matching table, outer header level removed, without its
        last row.

    Raises:
        IndexError: If no anchor on the club page contains *param*.
        requests.HTTPError: If the linked page answers with an error status.
    """
    soup = BeautifulSoup(requestData.text, "html.parser")
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    candidates = [href for href in hrefs if href and param in href]
    if not candidates:
        raise IndexError(f"no link containing {param!r} found on the club page")
    response = requests.get("https://fbref.com" + candidates[0])
    response.raise_for_status()
    # Recent pandas deprecates passing literal HTML; a file-like wrapper is
    # accepted by old and new versions alike.
    table = pd.read_html(io.StringIO(response.text), match=match)[0]
    # droplevel() removes the outermost header level.
    table.columns = table.columns.droplevel()
    # NOTE: column selection by keepList is intentionally disabled here so
    # that all columns can be inspected.
    # The last row is discarded (presumably a totals row - confirm on the page).
    return table[:-1]

In [None]:
# def getAllClubData(url):
    
#     #runs only once for all clubs, returns team links
#     allClubLinks = getAllClubLinks(url)
#     names = {"Comp": "Competition", "Poss": "Possession", "Sh":"Shots", "SoT":"Shots Target", "CrdY":"Yellow", "CrdR":"Red", "Fls":"Fouls","Off":"Offside"}
    
#     allClubData = []
#     for link in allClubLinks:
#         club = requests.get(link)
#         clubName = link.split("/")[-1].replace("-Stats","").replace("-", " ")
#         print(clubName)
        
#         #get Scores
#         scores = pd.read_html(club.text, match = "Scores & Fixtures")[0]
#         scores = scores[['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent', 'Poss']]
        
#         #get shooting and other stat (figure better ways for these two)
#         shooting = getClubData(club, "/all_comps/shooting/", "Shooting", ['Date', 'Sh', "SoT"])
#         misc = getClubData(club, "/all_comps/misc/", "Miscellaneous", ['Date', 'CrdY', 'CrdR', 'Fls', 'Off'])
        
#         finalClubData = scores.merge(shooting,how ='left').merge(misc, how='left')
#         allClubData.append(finalClubData)
#         time.sleep(10)
# #     keepList = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent', 'Poss', 'Sh', "SoT", 'CrdY', 'CrdR', 'Fls', 'Off']
# #     finalClubData.rename(columns=names, inplace=True)
#     finalClubData = finalClubData[finalClubData["Competition"] == "Premier League"]
#     finalClubData["Team"] = clubName
#     result = pd.concat(allClubData, axis=0)
#     return result

# url = "https://fbref.com/en/comps/9/Premier-League-Stats"
# getAllClubData(url)

In [None]:

# url = "https://fbref.com/en/comps/9/Premier-League-Stats"
# allClubLinks = getClubLinks(url)

# #choose one of the links
# club = requests.get(allClubLinks[1])
# club_name = allClubLinks[1].split("/")[-1].replace("-Stats","").replace("-", " ")

# #there is only one element
# matches = pd.read_html(club.text, match = "Scores & Fixtures")[0]
# matches.columns
# matches = matches[['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent', 'Poss']]


# shooting = getData("/all_comps/shooting/", "Shooting", ['Date', 'Sh', "SoT"])
# misc = getData("/all_comps/misc/", "Miscellaneous", ['Date', 'CrdY', 'CrdR', 'Fls', 'Off'])

# final_data = matches.merge(shooting,how ='left').merge(misc, how='left')
# names = {"Comp": "Competition", "Poss": "Possession", "Sh":"Shots", "SoT":"Shots Target", "CrdY":"Yellow", "CrdR":"Red", "Fls":"Fouls","Off":"Offside"}
# final_data.rename(columns=names, inplace=True)
# final_data
# final_data = final_data[final_data["Competition"] == "Premier League"]
# final_data["Team"] = club_name
# final_data

<!-- <a href="/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/keeper/Manchester-City-Match-Logs-All-Competitions">Goalkeeping</a>  /

<a href="/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/passing/Manchester-City-Match-Logs-All-Competitions">Passing</a>

<a href="/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions">Shooting</a>
 -->