In [1]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.18.1-py3-none-any.whl.metadata (6.9 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.24.0-py3-none-any.whl.metadata (4.9 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.18.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.24.0-py3-none-any.whl (460 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.2/460.2 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Downloading wsproto-1.

In [2]:
#from selenium import webdriver
#from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re

In [3]:
sporza_races = [
    "omloop-het-nieuwsblad",
    "kuurne-brussel-kuurne",
    "gp-samyn",
    "strade-bianche",
    "nokere-koers",
    "bredene-koksijde-classic",
    "milano-sanremo",
    "oxyclean-classic-brugge-de-panne",
    "e3-harelbeke",
    "gent-wevelgem",
    "dwars-door-vlaanderen",
    "ronde-van-vlaanderen",
    "scheldeprijs",
    "paris-roubaix",
    "brabantse-pijl",
    "amstel-gold-race",
    "la-fleche-wallone",
    "liege-bastogne-liege"
]

Let's get the top x riders from https://www.procyclingstats.com/rankings.php and create a dictionary to store their respective names (as the procyclingstats identifier of the form first_name-family_name), teams and PCS points.

In [4]:
def get_riders_teams_PCSpoints(n_riders, rider_data):
    for i in range (n_riders//100):
        PCS_ranking_url = f"""https://www.procyclingstats.com/rankings.php?
                          nation=&age=&zage=&page=smallerorequal&team=&offset={100*i}
                          &teamlevel=&filter=Filter"""
        response = requests.get(PCS_ranking_url)
        soup = BeautifulSoup(response.content, "html.parser")
        ranking_table = soup.find("table", class_="basic")
        for row in ranking_table.find_all("tr")[1:]:
            links = row.find_all("a", href=True)
            rider = links[0].get("href").split("/")[-1]
            team = links[1].get("href").split("/")[-1]
            points = links[2].text        
            rider_data[rider] = {"Team": team, "PCSpoints": points}

Let's use the rider identifiers to retrieve their age and weight.

In [5]:
def get_age_weight(rider_data):
    for rider in rider_data.keys():
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        response = requests.get(rider_profile_url)
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        fields = info.find_all("b")    
        rider_data[rider]["age"] = re.search(r'\((.*?)\)', info.text).group(1)
        for field in fields:
            label = field.get_text(strip=True)
            if label not in ["", "Date of birth:", "Nationality:", "Place of birth:"]:
                value = field.next_sibling.strip()
                rider_data[rider][label] = value

In [6]:
def get_sporza_results(years, rider_data):
    for race in sporza_races:
        for year in years:
            race_name = f"{race}/{year}"
            race_url = f"https://www.procyclingstats.com/race/{race_name}/result"
            print(f"Processing the results of {race_name}")
            try:
                response = requests.get(race_url)
            except:
                print(f"Page {race_url} didn't respond.")
            soup = BeautifulSoup(response.content, "html.parser")
            for row in soup.find("tbody").find_all("tr"):
                rider = row.find("input", class_="gotoH2H").get("data-seo")
                if rider in rider_data.keys():
                    fields = row.find_all("td")
                    placement = fields[0].get_text(strip=True)
                    UCI_points = fields[7].get_text(strip=True)
                    ptn = fields[8].get_text(strip=True)

                    rider_data[rider][race_name] = placement
                    if UCI_points != "":
                        if "UCI_points" in rider_data.get(rider, {}):
                            rider_data[rider]["UCI_points"] += int(UCI_points)
                        else:
                             rider_data[rider][f"UCI_points"] = int(UCI_points)
                    if ptn != "":
                        if f"ptn/{year}" in rider_data.get(rider, {}):
                            rider_data[rider][f"ptn/{year}"] += int(ptn)
                        else:
                             rider_data[rider][f"ptn/{year}"] = int(ptn)     

In [7]:
n_riders = 100
years = [2019, 2019, 2021, 2022, 2023]
rider_data = {}
get_riders_teams_PCSpoints(n_riders, rider_data)
get_age_weight(rider_data)
get_sporza_results(years, rider_data)

Processing the results of omloop-het-nieuwsblad/2019
Processing the results of omloop-het-nieuwsblad/2019
Processing the results of omloop-het-nieuwsblad/2021
Processing the results of omloop-het-nieuwsblad/2022
Processing the results of omloop-het-nieuwsblad/2023
Processing the results of kuurne-brussel-kuurne/2019
Processing the results of kuurne-brussel-kuurne/2019
Processing the results of kuurne-brussel-kuurne/2021
Processing the results of kuurne-brussel-kuurne/2022
Processing the results of kuurne-brussel-kuurne/2023
Processing the results of gp-samyn/2019
Processing the results of gp-samyn/2019
Processing the results of gp-samyn/2021
Processing the results of gp-samyn/2022
Processing the results of gp-samyn/2023
Processing the results of strade-bianche/2019
Processing the results of strade-bianche/2019
Processing the results of strade-bianche/2021
Processing the results of strade-bianche/2022
Processing the results of strade-bianche/2023
Processing the results of nokere-koers/2

In [8]:
rider_data = pd.DataFrame(rider_data).T
rider_data.describe()

Unnamed: 0,Team,PCSpoints,age,Weight:,Height:,nokere-koers/2019,bredene-koksijde-classic/2019,brabantse-pijl/2022,UCI_points,ptn/2022,...,brabantse-pijl/2023,oxyclean-classic-brugge-de-panne/2022,kuurne-brussel-kuurne/2022,gp-samyn/2022,nokere-koers/2022,bredene-koksijde-classic/2022,gp-samyn/2019,gp-samyn/2023,nokere-koers/2023,bredene-koksijde-classic/2023
count,100,100,100,100,100,5,3,17,90,75,...,16,15,23,9,9,6,1,5,8,5
unique,23,94,18,31,26,4,3,13,88,58,...,15,15,21,9,9,6,1,5,8,5
top,uae-team-emirates-2024,803,25,65 kg,1.83 m,DNF,46,DNF,5,5,...,DNF,5,DNF,10,3,9,DNF,58,101,1
freq,11,2,12,10,9,2,1,5,3,9,...,2,1,3,1,1,1,1,1,1,1


In [9]:
len(rider_data.isnull().sum()[rider_data.isnull().sum() == 0])

5

In [10]:
rider_data.isna().sum()

Team                              0
PCSpoints                         0
age                               0
Weight:                           0
Height:                           0
                                 ..
bredene-koksijde-classic/2022    94
gp-samyn/2019                    99
gp-samyn/2023                    95
nokere-koers/2023                92
bredene-koksijde-classic/2023    95
Length: 82, dtype: int64