In [26]:
# import necessary libraries

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [27]:
# setup browser and url

browser = webdriver.Chrome()

url = 'https://footystats.org'

In [28]:
# open website

browser.get(url)
browser.maximize_window()

# wait until dropdown menu is loaded
WebDriverWait(browser, 20).until(
    EC.presence_of_element_located((By.XPATH, "//*[text()='Leagues']"))
)

<selenium.webdriver.remote.webelement.WebElement (session="30d948f73fc8120a82445bc3f52844da", element="f.655CBACBD3D2C89895A7E34714A78E0D.d.9D51B9C564ED67096D0F2777EDA91EB9.e.116")>

In [29]:
# grab links to chosen leagues from dropdown menu

leagues = ['Premier League', 'La Liga', 'Ligue 1', 'Bundesliga', 'Serie A']

# hover over 'Leagues' dropdown menu
leagues_dropdown = browser.find_element(By.XPATH, "//*[text()='Leagues']")
webdriver.ActionChains(browser).move_to_element(leagues_dropdown).perform()

# find and extract links to chosen leagues from dropdown menu
leagues_links = []
for league in leagues:
    league_link = browser.find_element(By.XPATH, f"//*[text()='{league}']")
    leagues_links.append(league_link.get_attribute('href'))
    
# print out for debugging
for league, link in zip(leagues, leagues_links):
    print(league, link)

Premier League https://footystats.org/england/premier-league
La Liga https://footystats.org/spain/la-liga
Ligue 1 https://footystats.org/france/ligue-1
Bundesliga https://footystats.org/germany/bundesliga
Serie A https://footystats.org/italy/serie-a


### Take all links of clubs from league links

In [30]:
import numpy as np

all_club_links = []

for link in leagues_links:
    browser.get(link)
    WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "table-wrapper")))
    club_elements = browser.find_elements(By.XPATH, "//table//tr//td//a[contains(@href, '/clubs/')]")
    club_links = np.array([club_link.get_attribute('href') for club_link in club_elements])
    all_club_links.extend(club_links)

all_club_links = np.array(list(set(all_club_links)))

print(len(all_club_links))

for link in all_club_links:
    print(link)

96
https://footystats.org/clubs/acf-fiorentina-471
https://footystats.org/clubs/ca-osasuna-287
https://footystats.org/clubs/crystal-palace-fc-143
https://footystats.org/clubs/ud-las-palmas-276
https://footystats.org/clubs/deportivo-alaves-277
https://footystats.org/clubs/paris-saint-germain-fc-68
https://footystats.org/clubs/real-club-deportivo-mallorca-298
https://footystats.org/clubs/rasen-ballsport-leipzig-46
https://footystats.org/clubs/arsenal-fc-59
https://footystats.org/clubs/real-sociedad-de-futbol-290
https://footystats.org/clubs/rc-strasbourg-alsace-480
https://footystats.org/clubs/chelsea-fc-152
https://footystats.org/clubs/as-roma-113
https://footystats.org/clubs/fc-nantes-440
https://footystats.org/clubs/lille-osc-metropole-441
https://footystats.org/clubs/stade-de-reims-451
https://footystats.org/clubs/ipswich-town-fc-220
https://footystats.org/clubs/real-betis-balompie-284
https://footystats.org/clubs/brentford-fc-218
https://footystats.org/clubs/athletic-club-bilbao-285

### Take all links of players of 5 leagues

In [31]:
all_players_link = []

for link in all_club_links:
    browser.get(link)
    WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//*[text()='Players']")))
    browser.find_element(By.XPATH, "//*[text()='Players']").click()
    players_elements = browser.find_elements(By.CLASS_NAME, "semi-bold")
    players_links = np.array([player_link.get_attribute('href') for player_link in players_elements])
    all_players_link.extend(players_links)

all_players_link = np.array(list(set(all_players_link)))

In [32]:
dummy_link = []
filtered_list = list(filter(lambda x: x is not None, all_players_link))

for link in filtered_list:
    if "https://footystats.org/players" not in link or link == "https://footystats.org/players//":
        dummy_link.append(link)

print(len(dummy_link))

all_players_link_cleaned = np.setdiff1d(filtered_list, dummy_link)

all_players_link = all_players_link_cleaned

print(len(all_players_link_cleaned))
for link in all_players_link:
    print(link)


446
2164
https://footystats.org/players/albania/ardian-ismajli
https://footystats.org/players/albania/berat-djimsiti
https://footystats.org/players/albania/elseid-hysaj
https://footystats.org/players/albania/ivan-balliu
https://footystats.org/players/albania/kristjan-asllani
https://footystats.org/players/albania/marash-kumbulla
https://footystats.org/players/albania/ylber-ramadani
https://footystats.org/players/algeria/abderrahman-rebbach
https://footystats.org/players/algeria/aissa-mandi
https://footystats.org/players/algeria/farid-el-melali
https://footystats.org/players/algeria/haris-belkebla
https://footystats.org/players/algeria/hicham-boudaoui
https://footystats.org/players/algeria/ismael-bennacer
https://footystats.org/players/algeria/mohamed-amoura
https://footystats.org/players/algeria/ramy-bensebaini
https://footystats.org/players/algeria/said-benrahma
https://footystats.org/players/algeria/zinedine-ferhat
https://footystats.org/players/angola/clinton-mata
https://footystats

In [43]:
import re
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
import csv
# 
class Player:
    def __init__(self):
        pass
    
    def set_stats(self, name, age, nationality, club, position, height, weight, foot, total_matches, total_goals,
              total_minutes, total_assists, total_yellow, total_red, minutes_per_goal, shot_accuracy,
              pass_completion_rate, cross_completion_rate, minutes_per_assist, dribble_success_rate,
              penalty_conversion_rate, tackles, interception, market_value, titles, injuries):
        
        # Basic information
        self.name = name
        self.age = age
        self.nationality = nationality
        self.club = club
        self.position = position
        self.height = height
        self.weight = weight
        self.foot = foot
        
        # Match statistics
        self.total_matches = total_matches
        self.total_goals = total_goals
        self.total_minutes = total_minutes
        self.total_assists = total_assists
        self.total_yellow = total_yellow
        self.total_red = total_red
        
        # Performance metrics
        self.minutes_per_goal = minutes_per_goal
        self.shot_accuracy = shot_accuracy
        self.pass_completion_rate = pass_completion_rate
        self.cross_completion_rate = cross_completion_rate
        self.minutes_per_assist = minutes_per_assist
        self.dribble_success_rate = dribble_success_rate
        self.penalty_conversion_rate = penalty_conversion_rate
        
        # Defensive stats
        self.tackles = tackles
        self.interception = interception
        
        # Market value and titles
        self.market_value = market_value
        self.titles = titles
        self.injuries = injuries

def get_stat_number_int(stat):
    stat_number = int(re.findall(r'\d+', stat)[0])
    return stat_number

def get_stat_number_float(stat):
    stat_number_match = re.findall(r'\d+', stat)
    if (len(stat_number_match) == 2):
        stat_number = float(stat_number_match[0] + '.' + stat_number_match[1])
    elif (len(stat_number_match) == 1):
        stat_number = float(stat_number_match[0])
    else: 
        stat_number = None
    
    return stat_number
def extract_player_info(browser,link):
    if (not isinstance(link, (str, np.str_))  or ('https://footystats.org/players'not in link)):
        return None
    # Open the page
    browser.get(link)
    
    # Wait until the div with id 'h2h_content3' is present
    try:
        content_div = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, "h2h_content3"))
        )
    except TimeoutException:
        return None
    # Locate the div with id 'h2h_content1'
    # Get the information from h2hcontent1
    content_div = browser.find_element(By.ID, "h2h_content1")
    name = (content_div.find_element(By.XPATH, './/div[@class="w100 row cf"]/h1[@class="row white heroh1"]').text).split(" Stats")[0].strip() 
    try:
        age = (content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e pb05e mr15e"]/span[@class="semi-bold"]').text).split('(')[0].strip()
    except NoSuchElementException:
        age = None
    nationality = content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e ml15e"][1]/span[@class="semi-bold"]').text
    club = content_div.find_element(By.XPATH, './/div[@class="fl rw50 white fs08e"]/a').text
    position = content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e"][1]/span[@class="semi-bold"]').text
    try:
        height = get_stat_number_int(content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e mr15e"][1]/span[@class="semi-bold"]').text)
    except NoSuchElementException:
        height = None
    try:
        weight = get_stat_number_int(content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e mr15e"][2]/span[@class="semi-bold"]').text)
    except NoSuchElementException:
        weight = None
    try:
        foot = content_div.find_element(By.XPATH, './/div[@class="fl rw33 white fs08e ml15e"][2]/span[@class="semi-bold"]').text
        if foot not in ['Right','Left','Both']:
            foot = None
    except NoSuchElementException:
        foot = None

    # Get the information from h2hcontent2
    content_div = browser.find_element(By.ID, "h2h_content2")
    total_matches = content_div.find_element(By.XPATH, './/td[contains(text(), "Matches Played")]/following-sibling::td[1]').text
    total_minutes = content_div.find_element(By.XPATH, './/td[contains(text(), "Minutes")]/following-sibling::td[1]').text
    try:
        total_goals = content_div.find_element(By.XPATH, './/td[contains(text(), "Goals Scored")]/following-sibling::td[1]').text
    except NoSuchElementException:
        total_goals = None
    try:
        total_assists = content_div.find_element(By.XPATH, './/td[contains(text(), "Assists")]/following-sibling::td[1]').text
    except NoSuchElementException:
        total_assists = None
    try:
        total_yellow = content_div.find_element(By.XPATH, './/td[contains(text(), " Yellow Cards")]/following-sibling::td[1]').text
    except:
        total_yellow = None
    try:
        total_red = content_div.find_element(By.XPATH, './/td[contains(text(), " Red Cards")]/following-sibling::td[1]').text
    except NoSuchElementException:
        total_red = None
    try:
        minutes_per_goal = get_stat_number_int(content_div.find_element(By.XPATH, './/td[contains(text(), "Minutes Per Goal")]/following-sibling::td[1]').text)
    except NoSuchElementException:
        minutes_per_goal = None
    try:
        shot_accuracy = get_stat_number_float(content_div.find_element(By.XPATH, './/td[contains(text(), "Shot Accuracy")]/following-sibling::td[1]').text)
    except NoSuchElementException:
        shot_accuracy = None
    try:
        pass_completion_rate = get_stat_number_float(content_div.find_element(By.XPATH, './/td[contains(text(), "Pass Completion Rate")]/following-sibling::td[1]').text)
    except NoSuchElementException:
        pass_completion_rate = None
    try:
        cross_completion_rate = get_stat_number_float(content_div.find_element(By.XPATH, './/td[contains(text(), "Cross Completion Rate")]/following-sibling::td[1]').text)
    except NoSuchElementException:
        cross_completion_rate = None
    try:
        minutes_per_assist = content_div.find_element(By.XPATH, './/td[contains(text(), "Minutes Per Assist")]/following-sibling::td[1]').text
        minutes_per_assist_match = re.findall(r'\d+', minutes_per_assist)
        if (minutes_per_assist_match):
            minutes_per_assist = get_stat_number_int(minutes_per_assist)
        else:
            minutes_per_assist = None
    except NoSuchElementException:
        minutes_per_assist = None
    try:
        dribble_success_rate = content_div.find_element(By.XPATH, './/td[contains(text(), "Dribble Success Rate")]/following-sibling::td[1]').text
        dribble_success_rate_match = re.findall(r'\d+', dribble_success_rate)
        if(dribble_success_rate_match):
            dribble_success_rate = get_stat_number_float(dribble_success_rate)
        else:
            dribble_success_rate = None
    except NoSuchElementException:
        dribble_success_rate = None
    try:
        penalty_conversion_rate = content_div.find_element(By.XPATH, './/td[contains(text(), "Penalty Conversion Rate")]/following-sibling::td[1]').text
        penalty_conversion_rate_match = re.findall(r'\d+', penalty_conversion_rate)
        if (penalty_conversion_rate_match):
            penalty_conversion_rate = get_stat_number_float(penalty_conversion_rate)
        else:
            penalty_conversion_rate = None
    except NoSuchElementException:
        penalty_conversion_rate = None
    try:
        tackles = content_div.find_element(By.XPATH, './/td[contains(text(), "Tackles")]/following-sibling::td[1]').text
    except NoSuchElementException:
        tackles = None
    try:
        interception = content_div.find_element(By.XPATH, './/td[contains(text(), "Interceptions")]/following-sibling::td[1]').text
    except NoSuchElementException:
        interception = None

    # Get the information from h2hcontent3
    content_div = browser.find_element(By.ID, "h2h_content3")
    try:
        market_value_section = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="title-bar"]'))
            )
    except TimeoutException:
        return None
    try:
        value_element = content_div.find_element(By.XPATH, './/h3').text
        value_element = value_element.replace('€', '').replace(' ', '')
        if 'k' in value_element:
            market_value = float(value_element.replace('k', '')) / 1000  # convert thousands to millions
        else:
            market_value = get_stat_number_float(value_element)
    except NoSuchElementException:
        market_value = None
    try:
        titles = get_stat_number_int(content_div.find_element(By.XPATH, './/p[@class="mt1e lh14e header-blue semi-bold fs09e"]').text)
    except NoSuchElementException:
        titles = None
    try:
        injury_element = content_div.find_element(By.XPATH, './/section [@class="cf stat-group stat-box pb1e stat-group-init"][2]/div[@class ="w94 m0Auto cf"]')
        injuries = len(injury_element.find_elements(By.TAG_NAME, 'p'))
    except NoSuchElementException:
        injuries = 0
    player = Player()
    player.set_stats(
        name,
        age,
        nationality,
        club,
        position,
        height,
        weight,
        foot,
        total_matches,
        total_goals,
        total_minutes,
        total_assists,
        total_yellow,
        total_red,
        minutes_per_goal,
        shot_accuracy,
        pass_completion_rate,
        cross_completion_rate,
        minutes_per_assist,
        dribble_success_rate,
        penalty_conversion_rate,
        tackles,
        interception,
        market_value,
        titles,
        injuries
    )
    return player

In [34]:
# Set up the Selenium WebDriver (make sure you have installed the required driver for your browser, e.g., ChromeDriver)
browser = webdriver.Chrome()
players = []
error_links = []
for link in all_players_link:
    player = extract_player_info(browser,link)
    if (player != None):
        players.append(player)
    else:
        error_links.append(link)
        continue
print(len(players))
for error in error_links:
    print(error)
# print("Name:", name)
# print("Age:", age)
# print("Nationality:", nationality)
# print("Club:", club)
# print("Position:", position)
# print("Height:", height)
# print("Weight:", weight)
# print("Foot:", foot)
# print("Total Matches Played:", total_matches)
# print("Total Minutes:", total_minutes)
# print("Total Goals Scored:", total_goals)
# print("Total Assists:", total_assists)
# print("Total Yellow Cards:", total_yellow)
# print("Total Red Cards:", total_red)
# print("Minutes Per Goal:", minutes_per_goal)
# print("Shot Accuracy:", shot_accuracy)
# print("Pass Completion Rate:", pass_completion_rate)
# print("Cross Completion Rate:", cross_completion_rate)
# print("Minutes Per Assist:", minutes_per_assist)
# print("Dribble Success Rate:", dribble_success_rate)
# print("Penalty Conversion Rate:", penalty_conversion_rate)
# print("Tackles:", tackles)
# print("Interceptions:", interception)
# print("Market Value:", market_value)
# print("Titles Won:", titles)


# Close the browser
browser.quit()


2037
https://footystats.org/players/argentina/maximo-perrone
https://footystats.org/players/cameroon/andre-onana
https://footystats.org/players/england/flavien-enzo-boyomos
https://footystats.org/players/england/roman-quintyne
https://footystats.org/players/england/will-osula
https://footystats.org/players/france/eric-junior-dina-ebimbe
https://footystats.org/players/france/iliesse-salhi
https://footystats.org/players/france/issiaka-toure
https://footystats.org/players/france/lucien-agoume
https://footystats.org/players/france/pierre-lees-melou
https://footystats.org/players/france/skelly-alvero
https://footystats.org/players/france/yoram-zague
https://footystats.org/players/georgia/saba-goglichidze
https://footystats.org/players/germany/matteo-palma
https://footystats.org/players/italy/honest-ahanor
https://footystats.org/players/italy/tommaso-augello
https://footystats.org/players/ivory-coast/abakar-sylla
https://footystats.org/players/ivory-coast/amad-diallo-traore
https://footystat

In [40]:
def to_dict(player):
    return player.__dict__

with open("players.csv", 'w',encoding='utf-8') as csvfile:
    csvwriter = csv.DictWriter(csvfile, fieldnames=[
    "name", "age", "nationality", "club", "position", "height", "weight", "foot", 
    "total_matches", "total_goals", "total_minutes", "total_assists", "total_yellow", 
    "total_red", "minutes_per_goal", "shot_accuracy", "pass_completion_rate", 
    "cross_completion_rate", "minutes_per_assist", "dribble_success_rate", 
    "penalty_conversion_rate", "tackles", "interception", "market_value", "titles", "injuries"
    ])
    csvwriter.writeheader()
    for player in players:
        csvwriter.writerow(to_dict(player))


In [None]:
# quit the browser

browser.quit()

In [45]:
browser = webdriver.Chrome()
link = 'https://footystats.org/players/albania/Ylber-Ramadani'
test = extract_player_info(browser,link)
print(test.foot)
browser.quit()

None
