In [4]:
# import necessary libraries

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [5]:
# setup browser and url

browser = webdriver.Chrome()

url = 'https://footystats.org'

In [6]:
# open website

browser.get(url)
browser.maximize_window()

# wait until dropdown menu is loaded
WebDriverWait(browser, 20).until(
    EC.presence_of_element_located((By.XPATH, "//*[text()='Leagues']"))
)

<selenium.webdriver.remote.webelement.WebElement (session="379797e1957625999275a980c5af3c7f", element="f.80B673FEE3668759CD4C72B408CB9668.d.62B15F98F72AA2496AD3043FC76983D2.e.115")>

In [7]:
# grab links to chosen leagues from dropdown menu

leagues = ['Premier League', 'La Liga', 'Ligue 1', 'Bundesliga', 'Serie A']

# hover over 'Leagues' dropdown menu
leagues_dropdown = browser.find_element(By.XPATH, "//*[text()='Leagues']")
webdriver.ActionChains(browser).move_to_element(leagues_dropdown).perform()

# find and extract links to chosen leagues from dropdown menu
leagues_links = []
for league in leagues:
    league_link = browser.find_element(By.XPATH, f"//*[text()='{league}']")
    leagues_links.append(league_link.get_attribute('href'))
    
# print out for debugging
for league, link in zip(leagues, leagues_links):
    print(league, link)

Premier League https://footystats.org/england/premier-league
La Liga https://footystats.org/spain/la-liga
Ligue 1 https://footystats.org/france/ligue-1
Bundesliga https://footystats.org/germany/bundesliga
Serie A https://footystats.org/italy/serie-a


### Take all links of clubs from league links

In [8]:
import numpy as np

all_club_links = []

for link in leagues_links:
    browser.get(link)
    WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "table-wrapper")))
    club_elements = browser.find_elements(By.XPATH, "//table//tr//td//a[contains(@href, '/clubs/')]")
    club_links = np.array([club_link.get_attribute('href') for club_link in club_elements])
    all_club_links.extend(club_links)

all_club_links = np.array(list(set(all_club_links)))

print(len(all_club_links))

for link in all_club_links:
    print(link)

96
https://footystats.org/clubs/tottenham-hotspur-fc-92
https://footystats.org/clubs/eintracht-frankfurt-44
https://footystats.org/clubs/1-fsv-mainz-05-34
https://footystats.org/clubs/montpellier-hsc-435
https://footystats.org/clubs/wolverhampton-wanderers-fc-223
https://footystats.org/clubs/fulham-fc-162
https://footystats.org/clubs/brighton-hove-albion-fc-209
https://footystats.org/clubs/deportivo-alaves-277
https://footystats.org/clubs/sv-werder-bremen-37
https://footystats.org/clubs/leicester-city-fc-108
https://footystats.org/clubs/cd-leganes-278
https://footystats.org/clubs/fc-nantes-440
https://footystats.org/clubs/ssc-napoli-74
https://footystats.org/clubs/manchester-united-fc-149
https://footystats.org/clubs/ipswich-town-fc-220
https://footystats.org/clubs/racing-club-de-lens-449
https://footystats.org/clubs/newcastle-united-fc-157
https://footystats.org/clubs/stade-de-reims-451
https://footystats.org/clubs/arsenal-fc-59
https://footystats.org/clubs/nottingham-forest-fc-211
ht

### Take all links of players of 5 leagues

In [9]:
all_players_link = []

for link in all_club_links:
    browser.get(link)
    WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//*[text()='Players']")))
    browser.find_element(By.XPATH, "//*[text()='Players']").click()
    players_elements = browser.find_elements(By.CLASS_NAME, "semi-bold")
    players_links = np.array([player_link.get_attribute('href') for player_link in players_elements])
    all_players_link.extend(players_links)

all_players_link = np.array(list(set(all_players_link)))

print(len(all_players_link))

# for link in all_players_link:
#     print(link)

2601


In [62]:
import re
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
# 
class Player:
    def __init__(self):
        pass
    
    def set_stats(self, name, age, nationality, club, position, height, weight, foot, total_matches, total_goals,
              total_minutes, total_assists, total_yellow, total_red, minutes_per_goal, shot_accuracy,
              pass_completion_rate, cross_completion_rate, minutes_per_assist, dribble_success_rate,
              penalty_conversion_rate, tackles, interception, market_value, titles):
        
        # Basic information
        self.name = name
        self.age = age
        self.nationality = nationality
        self.club = club
        self.position = position
        self.height = height
        self.weight = weight
        self.foot = foot
        
        # Match statistics
        self.total_matches = total_matches
        self.total_goals = total_goals
        self.total_minutes = total_minutes
        self.total_assists = total_assists
        self.total_yellow = total_yellow
        self.total_red = total_red
        
        # Performance metrics
        self.minutes_per_goal = minutes_per_goal
        self.shot_accuracy = shot_accuracy
        self.pass_completion_rate = pass_completion_rate
        self.cross_completion_rate = cross_completion_rate
        self.minutes_per_assist = minutes_per_assist
        self.dribble_success_rate = dribble_success_rate
        self.penalty_conversion_rate = penalty_conversion_rate
        
        # Defensive stats
        self.tackles = tackles
        self.interception = interception
        
        # Market value and titles
        self.market_value = market_value
        self.titles = titles
        

def get_stat_number_int(stat):
    stat_number = int(re.findall(r'\d+', stat)[0])
    return stat_number

def get_stat_number_float(stat):
    stat_number_match = re.findall(r'\d+', stat)
    if (len(stat_number_match) == 2):
        stat_number = float(stat_number_match[0] + '.' + stat_number_match[1])
    elif (len(stat_number_match) == 1):
        stat_number = float(stat_number_match[0])
    else: 
        stat_number = None
    
    return stat_number

In [63]:
# Set up the Selenium WebDriver (make sure you have installed the required driver for your browser, e.g., ChromeDriver)
browser = webdriver.Chrome()
players = []
for link in all_players_link:
    if (type(link) != type("str") or ('https://footystats.org/players'not in link)):
        continue
    # Open the page
    browser.get(link)
    
    # Wait until the div with id 'h2h_content3' is present
    try:
        content_div = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, "h2h_content3"))
        )
    except TimeoutException:
        continue
    # Locate the div with id 'h2h_content1'
    # Get the information from h2hcontent1
    content_div = browser.find_element(By.ID, "h2h_content1")
    name = (content_div.find_element(By.XPATH, './/div[@class="w100 row cf"]/h1[@class="row white heroh1"]').text).split(" Stats")[0].strip() 
    try:
        age = (content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e pb05e mr15e"]/span[@class="semi-bold"]').text).split('(')[0].strip()
    except NoSuchElementException:
        age = None
    nationality = content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e ml15e"][1]/span[@class="semi-bold"]').text
    club = content_div.find_element(By.XPATH, './/div[@class="fl rw50 white fs08e"]/a').text
    position = content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e"][1]/span[@class="semi-bold"]').text
    try:
        height = get_stat_number_int(content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e mr15e"][1]/span[@class="semi-bold"]').text)
    except NoSuchElementException:
        height = None
    try:
        weight = get_stat_number_int(content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e mr15e"][2]/span[@class="semi-bold"]').text)
    except NoSuchElementException:
        weight = None
    try:
        foot = content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e ml15e"][2]/span[@class="semi-bold"]').text
    except NoSuchElementException:
        foot = None

    # Get the information from h2hcontent2
    content_div = browser.find_element(By.ID, "h2h_content2")
    total_matches = content_div.find_element(By.XPATH, './/td[contains(text(), "Matches Played")]/following-sibling::td[1]').text
    total_minutes = content_div.find_element(By.XPATH, './/td[contains(text(), "Minutes")]/following-sibling::td[1]').text
    total_goals = content_div.find_element(By.XPATH, './/td[contains(text(), "Goals Scored")]/following-sibling::td[1]').text
    total_assists = content_div.find_element(By.XPATH, './/td[contains(text(), "Assists")]/following-sibling::td[1]').text
    total_yellow = content_div.find_element(By.XPATH, './/td[contains(text(), " Yellow Cards")]/following-sibling::td[1]').text
    total_red = content_div.find_element(By.XPATH, './/td[contains(text(), " Red Cards")]/following-sibling::td[1]').text
    minutes_per_goal = get_stat_number_int(content_div.find_element(By.XPATH, './/td[contains(text(), "Minutes Per Goal")]/following-sibling::td[1]').text)
    try:
        shot_accuracy = get_stat_number_float(content_div.find_element(By.XPATH, './/td[contains(text(), "Shot Accuracy")]/following-sibling::td[1]').text)
    except NoSuchElementException:
        shot_accuracy = None
    try:
        pass_completion_rate = get_stat_number_float(content_div.find_element(By.XPATH, './/td[contains(text(), "Pass Completion Rate")]/following-sibling::td[1]').text)
    except NoSuchElementException:
        pass_completion_rate = None
    try:
        cross_completion_rate = get_stat_number_float(content_div.find_element(By.XPATH, './/td[contains(text(), "Cross Completion Rate")]/following-sibling::td[1]').text)
    except NoSuchElementException:
        cross_completion_rate = None
    minutes_per_assist = content_div.find_element(By.XPATH, './/td[contains(text(), "Minutes Per Assist")]/following-sibling::td[1]').text
    minutes_per_assist_match = re.findall(r'\d+', minutes_per_assist)
    if (minutes_per_assist_match):
        minutes_per_assist = get_stat_number_int(minutes_per_assist)
    dribble_success_rate = content_div.find_element(By.XPATH, './/td[contains(text(), "Dribble Success Rate")]/following-sibling::td[1]').text
    dribble_success_rate_match = re.findall(r'\d+', dribble_success_rate)
    if(dribble_success_rate_match):
        dribble_success_rate = get_stat_number_float(dribble_success_rate)
    penalty_conversion_rate = content_div.find_element(By.XPATH, './/td[contains(text(), "Penalty Conversion Rate")]/following-sibling::td[1]').text
    penalty_conversion_rate_match = re.findall(r'\d+', penalty_conversion_rate)
    if (penalty_conversion_rate_match):
        penalty_conversion_rate = get_stat_number_float(penalty_conversion_rate)
    try:
        tackles = content_div.find_element(By.XPATH, './/td[contains(text(), "Tackles")]/following-sibling::td[1]').text
    except NoSuchElementException:
        tackles = None
    try:
        interception = content_div.find_element(By.XPATH, './/td[contains(text(), "Interceptions")]/following-sibling::td[1]').text
    except NoSuchElementException:
        interception = None

    # Get the information from h2hcontent3
    content_div = browser.find_element(By.ID, "h2h_content3")
    try:
        market_value_section = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="title-bar"]'))
            )
    except TimeoutException:
        continue
    try:
        market_value = get_stat_number_float(content_div.find_element(By.XPATH, './/h3').text)
    except NoSuchElementException:
        market_value = None
    try:
        titles = get_stat_number_int(content_div.find_element(By.XPATH, './/p[@class="mt1e lh14e header-blue semi-bold fs09e"]').text)
    except NoSuchElementException:
        titles = None
    player = Player()
    player.set_stats(
        name,
        age,
        nationality,
        club,
        position,
        height,
        weight,
        foot,
        total_matches,
        total_goals,
        total_minutes,
        total_assists,
        total_yellow,
        total_red,
        minutes_per_goal,
        shot_accuracy,
        pass_completion_rate,
        cross_completion_rate,
        minutes_per_assist,
        dribble_success_rate,
        penalty_conversion_rate,
        tackles,
        interception,
        market_value,
        titles
    )
    players.append(player)
# print("Name:", name)
# print("Age:", age)
# print("Nationality:", nationality)
# print("Club:", club)
# print("Position:", position)
# print("Height:", height)
# print("Weight:", weight)
# print("Foot:", foot)
# print("Total Matches Played:", total_matches)
# print("Total Minutes:", total_minutes)
# print("Total Goals Scored:", total_goals)
# print("Total Assists:", total_assists)
# print("Total Yellow Cards:", total_yellow)
# print("Total Red Cards:", total_red)
# print("Minutes Per Goal:", minutes_per_goal)
# print("Shot Accuracy:", shot_accuracy)
# print("Pass Completion Rate:", pass_completion_rate)
# print("Cross Completion Rate:", cross_completion_rate)
# print("Minutes Per Assist:", minutes_per_assist)
# print("Dribble Success Rate:", dribble_success_rate)
# print("Penalty Conversion Rate:", penalty_conversion_rate)
# print("Tackles:", tackles)
# print("Interceptions:", interception)
# print("Market Value:", market_value)
# print("Titles Won:", titles)


# Close the browser
browser.quit()


InvalidArgumentException: Message: invalid argument: 'url' must be a string
  (Session info: chrome=129.0.6668.90)
Stacktrace:
	GetHandleVerifier [0x00007FF658D4B095+29557]
	(No symbol) [0x00007FF658CBFA50]
	(No symbol) [0x00007FF658B7B56A]
	(No symbol) [0x00007FF658C19404]
	(No symbol) [0x00007FF658BF718A]
	(No symbol) [0x00007FF658C1851C]
	(No symbol) [0x00007FF658BF6F33]
	(No symbol) [0x00007FF658BC116F]
	(No symbol) [0x00007FF658BC22D1]
	GetHandleVerifier [0x00007FF65907C96D+3378253]
	GetHandleVerifier [0x00007FF6590C8497+3688311]
	GetHandleVerifier [0x00007FF6590BD1CB+3642539]
	GetHandleVerifier [0x00007FF658E0A6B6+813462]
	(No symbol) [0x00007FF658CCAB5F]
	(No symbol) [0x00007FF658CC6B74]
	(No symbol) [0x00007FF658CC6D10]
	(No symbol) [0x00007FF658CB5C1F]
	BaseThreadInitThunk [0x00007FFE2B7F7374+20]
	RtlUserThreadStart [0x00007FFE2BE1CC91+33]


In [41]:
print(players[0].name)
print(len(players))
# print(all_players_link[1])
# def count_diff_link(links):
#     count = 0
#     for link in links:
#         if(type(link) != type("str")):
#             count = count + 1
#             continue
#         if ('https://footystats.org/players'not in link):
#             count = count + 1
#     return count

# print(count_diff_link(all_players_link))

Robin Hranac
437


In [None]:
# quit the browser

browser.quit()