In [1]:
#from selenium import webdriver
#from selenium.webdriver.chrome.service import Service
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from bs4 import BeautifulSoup
import lightgbm as lgb
import pandas as pd
import numpy as np
import requests
import re

In [2]:
sporza_races = [
    "omloop-het-nieuwsblad",
    "kuurne-brussel-kuurne",
    "gp-samyn",
    "strade-bianche",
    "nokere-koers",
    "bredene-koksijde-classic",
    "milano-sanremo",
    "oxyclean-classic-brugge-de-panne",
    "e3-harelbeke",
    "gent-wevelgem",
    "dwars-door-vlaanderen",
    "ronde-van-vlaanderen",
    "scheldeprijs",
    "paris-roubaix",
    "brabantse-pijl",
    "amstel-gold-race",
    "la-fleche-wallone",
    "liege-bastogne-liege"
]

Let's get the top x riders from https://www.procyclingstats.com/rankings.php and create a dictionary to store their respective names (as the procyclingstats identifier of the form first_name-family_name), teams and PCS points.

In [3]:
def get_riders_teams_PCSpoints(n_riders, rider_data):
    for i in range (n_riders//100):
        PCS_ranking_url = f"""https://www.procyclingstats.com/rankings.php?
                          nation=&age=&zage=&page=smallerorequal&team=&offset={100*i}
                          &teamlevel=&filter=Filter"""
        response = requests.get(PCS_ranking_url)
        soup = BeautifulSoup(response.content, "html.parser")
        ranking_table = soup.find("table", class_="basic")
        for row in ranking_table.find_all("tr")[1:]:
            links = row.find_all("a", href=True)
            rider = links[0].get("href").split("/")[-1]
            team = links[1].get("href").split("/")[-1]
            points = links[2].text        
            rider_data[rider] = {"Team": team, "PCSpoints": points}

Let's use the rider identifiers to retrieve their age and weight.

In [4]:
def get_age_weight(rider_data):
    for rider in rider_data.keys():
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        response = requests.get(rider_profile_url)
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        fields = info.find_all("b")    
        rider_data[rider]["Age"] = re.search(r'\((.*?)\)', info.text).group(1)
        for field in fields:
            label = field.get_text(strip=True)
            if label not in ["", "Date of birth:", "Nationality:", "Place of birth:"]:
                value = field.next_sibling.strip()
                rider_data[rider][label] = value

In [5]:
def get_sporza_results(years, rider_data, DN_values):
    count = 0
    for race in sporza_races:
        for year in years:
            race_name = f"{race}/{year}"
            race_url = f"https://www.procyclingstats.com/race/{race_name}/result"
            try:
                response = requests.get(race_url)
                count += 1
            except:
                print(f"Page {race_url} didn't respond.")
            soup = BeautifulSoup(response.content, "html.parser")
            for row in soup.find("tbody").find_all("tr"):
                rider = row.find("input", class_="gotoH2H").get("data-seo")
                if rider in rider_data.keys():
                    fields = row.find_all("td")
                    placement = fields[0].get_text(strip=True)
                    if placement == "DNS":
                        placement = DN_values[0]
                    if placement == "DNF":
                        placement = DN_values[1]
                    if placement == "OTL":
                        placement = DN_values[1]
                    if placement == "DSQ":
                        placement = DN_values[2]
                    UCI_points = fields[7].get_text(strip=True)
                    ptn = fields[8].get_text(strip=True)
                    rider_data[rider][race_name] = int(placement)
                    if UCI_points != "":
                        if "UCI_points" in rider_data.get(rider, {}):
                            rider_data[rider]["UCI_points"] += int(UCI_points)
                        else:
                             rider_data[rider][f"UCI_points"] = int(UCI_points)
                    if ptn != "":
                        if f"ptn/{year}" in rider_data.get(rider, {}):
                            rider_data[rider][f"ptn/{year}"] += int(ptn)
                        else:
                             rider_data[rider][f"ptn/{year}"] = int(ptn)
    print(f"""Successfully retrieved the results of
          {count}/{len(sporza_races)*len(years)} races.""")

Preprocessing of the retrieved data includes:
* Creating a dataframe of the retrieved data
* Resetting the index so the rider's names are in a separate column
* Renaming some columns
* Converting the team name using a one hot encoding
* Converting PCSpoints and Age from string to int and cut
* Converting the Weight and Heights strings to int and float respectively as well. If a rider has not gained points in a certain season, let's set it to zero instead of NaN.
* Converting all race results to integers. x for DNS, y for DNF/OTL and z for DSQ.

In [6]:
def preprocess_rider_data(years, rider_data):
    rider_data = pd.DataFrame(rider_data).T
    #rider_data.reset_index(inplace=True)
    rider_data.rename(columns={"index": "Name", "Weight:": "Weight", "Height:": "Height"}, inplace=True)
    rider_data = pd.get_dummies(rider_data, columns=["Team"])
    rider_data["PCSpoints"] = rider_data["PCSpoints"].astype(int)
    rider_data["Age"] = rider_data["Age"].astype(int)
    rider_data["Weight"] = rider_data["Weight"].apply(
        lambda x: int(x[:2]) if isinstance(x, str) else x)
    rider_data["Height"] = rider_data["Height"].apply(
        lambda x: float(x.split()[0]) if isinstance(x, str) else x)
    for year in years:
        rider_data[f"ptn/{year}"].fillna(0, inplace=True)
    return rider_data

In [7]:
n_riders = 100
DN_values = [100, 200, 300] # Placement if a rider DNS, DNF/OTL and DSQ respectively
#years = [2018, 2019, 2021, 2022, 2023]
years = [2018, 2019]
rider_data = {}
get_riders_teams_PCSpoints(n_riders, rider_data)
get_age_weight(rider_data)
get_sporza_results(years, rider_data, DN_values)
rider_data = preprocess_rider_data(years, rider_data)

Successfully retrieved the results of
          36/36 races.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rider_data[f"ptn/{year}"].fillna(0, inplace=True)
  rider_data[f"ptn/{year}"].fillna(0, inplace=True)


The data is ready to be formatted for the model now. For each race, all riders that participated in that race will form a group together and the inverse of their placement will serve as a relevance score.

In [8]:
q_rider_data = pd.DataFrame()
for race in sporza_races[:2]:
    for year in years:
        race_name = f"{race}/{year}"
        participating_riders = rider_data[rider_data[race_name].notna()].copy()
        # The next line will set the race results as the objective for the group. The original results
        # will be set to NaN so they don't are exactly the same as the objective.
        participating_riders.rename(columns={race_name: "Objective"}, inplace=True)
        participating_riders["ID"] = race_name
        q_rider_data = pd.concat([q_rider_data, participating_riders], ignore_index=True)
q_rider_data = q_rider_data[["ID", "Objective"] + [col for col in q_rider_data.columns if col not in ["ID", "Objective"]]]
q_rider_data.fillna(100, inplace=True)
q_rider_data["Objective"] = q_rider_data["Objective"].astype(int)

  q_rider_data.fillna(100, inplace=True)


Each ID now represents a race. All riders that have competed in that race have a copy of their stats with this ID and their result in that race set as the objective. Their original result column is set to 100 because otherwise it would be the same as the objective.

The train-test split.

In [9]:
training_percentage = 0.8
cut_off = round(training_percentage*q_rider_data.shape[0])
train_df = q_rider_data[:cut_off]  # first 80%
validation_df = q_rider_data[cut_off:]  # remaining 20%
q_train = train_df.groupby("ID")["ID"].count().to_numpy()
X_train = train_df.drop(["ID", "Objective"], axis=1)
y_train = train_df["Objective"]

q_validation = validation_df.groupby("ID")["ID"].count().to_numpy()
X_validation = validation_df.drop(["ID", "Objective"], axis=1)
y_validation = validation_df["Objective"]

In [10]:
q_rider_data.shape

(53, 67)

Model training.

In [11]:
#model = lgb.LGBMRanker(
#    objective="lambdarank",
#    metric="ndcg",
#)
#
#model.fit(
#    X=X_train,
#    y=y_train,
#    group=q_train,
#    eval_set=[(X_validation, y_validation)],
#    eval_group=[q_validation],
#    eval_at=10,
#)