In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from multiprocessing import Pool
from bs4 import BeautifulSoup
import lightgbm as lgb
import pandas as pd
import numpy as np
import requests
import time
import re

In [2]:
sporza_races = [
    "omloop-het-nieuwsblad",
    "kuurne-brussel-kuurne",
    "gp-samyn",
    "strade-bianche",
    "nokere-koers",
    "bredene-koksijde-classic",
    "milano-sanremo",
    "oxyclean-classic-brugge-de-panne",
    "e3-harelbeke",
    "gent-wevelgem",
    "dwars-door-vlaanderen",
    "ronde-van-vlaanderen",
    "scheldeprijs",
    "paris-roubaix",
    "brabantse-pijl",
    "amstel-gold-race",
    "la-fleche-wallone",
    "liege-bastogne-liege"
]

Let's get the list of riders that start in each of the races this year. These will be fed to the model to be ranked later.

In [3]:
start_info = {}
for race in sporza_races:
    startlist = []
    race_link = f"https://www.procyclingstats.com/race/{race}/2024/startlist/startlist-quality"
    response = requests.get(race_link)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="basic")
    for rider in table.find_all("tr")[1:]:
        try:
            startlist.append(rider.find("a")["href"].split('/')[1])
        except:
            break
    start_info[race] = startlist

In [4]:
from collections import Counter

all_names = [name for names in start_info.values() for name in names]
name_counts = Counter(all_names)
name_counts = sorted(name_counts.items(), key=lambda x: x[1], reverse=True)
print("Riders with the most participations:")
for rank, (name, count) in enumerate(name_counts, 1):
    if count >= 9:
        print(f"{rank}. {name}: {count}")
        
num_unique_names = len(set(name for names in start_info.values() for name in names))
print("")
print(f"There are {num_unique_names}/945 Sporza riders participating so far.")

Riders with the most participations:
1. matej-mohoric: 11
2. edward-theuns: 11
3. jasper-philipsen: 10
4. arnaud-de-lie: 10
5. riley-sheehan: 10
6. matevz-govekar: 10
7. christophe-laporte: 9
8. tim-wellens: 9
9. jordi-meeus: 9
10. jasper-stuyven: 9
11. antonio-morgado: 9
12. sander-de-pestel: 9
13. soren-kragh-andersen: 9
14. nils-eekhoff: 9
15. marc-hirschi: 9

There are 463/945 Sporza riders participating so far.


Let's get the top x riders from https://www.procyclingstats.com/rankings.php and create a dictionary to store their respective names (as the procyclingstats identifier of the form first_name-family_name), teams and PCS points.

In [5]:
def get_riders_teams_PCSpoints(n_riders, rider_data):
    for i in range (n_riders//100):
        PCS_ranking_url = f"""https://www.procyclingstats.com/rankings.php?
                          nation=&age=&zage=&page=smallerorequal&team=&offset={100*i}
                          &teamlevel=&filter=Filter"""
        response = requests.get(PCS_ranking_url)
        soup = BeautifulSoup(response.content, "html.parser")
        ranking_table = soup.find("table", class_="basic")
        for row in ranking_table.find_all("tr")[1:]:
            links = row.find_all("a", href=True)
            rider = links[0].get("href").split("/")[-1]
            team = links[1].get("href").split("/")[-1]
            points = links[2].text        
            rider_data[rider] = {"Team": team, "PCSpoints": points}

Let's use the rider identifiers to retrieve their age and weight.

In [6]:
def get_age_weight(rider_data):
    for rider in rider_data.keys():
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        response = requests.get(rider_profile_url)
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        fields = info.find_all("b")
        try:
            rider_data[rider]["Age"] = re.search(r'\((.*?)\)', info.text).group(1)
        except(AttributeError):
            rider_data[rider]["Age"] = 27
        for field in fields:
            label = field.get_text(strip=True)
            if label not in ["", "Date of birth:", "Nationality:", "Place of birth:"]:
                value = field.next_sibling.strip()
                rider_data[rider][label] = value

Trying to parallellise the get_age_weight function because it consists of seuentially retrieving the same information of all rider profiles.

In [7]:
def get_age_weight_parallel(rider_data):
    riders = list(rider_data.keys())
    with Pool() as pool:
        results = pool.map(get_age_weight_single_rider, riders)
        for result in results:
            rider = result["rider"]
            del result["rider"]
            rider_data[rider].update(result)
    
def get_age_weight_single_rider(rider):
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        response = requests.get(rider_profile_url)
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        print(info)
        nationality = info.find('a').text
        fields = info.find_all("b")
        try:
            age = re.search(r'\((.*?)\)', info.text).group(1)
        except(AttributeError):
            age = 27
        for field in fields:
            label = field.get_text(strip=True)
            weight = 0
            if label not in ["", "Date of birth:", "Nationality:" "Place of birth:"]:
                value = field.next_sibling.strip()
                print(f"{label} is {value}")
                weight = value

        return {"rider": rider, "age": age, "weight": weight, "nationality": nationality}

In [8]:
def get_sporza_results(years, rider_data, DN_values):
    count = 0
    for race in sporza_races:
        for year in years:
            race_name = f"{race}/{year}"
            race_url = f"https://www.procyclingstats.com/race/{race_name}/result"
            try:
                response = requests.get(race_url)
                count += 1
            except:
                print(f"Page {race_url} didn't respond.")
            soup = BeautifulSoup(response.content, "html.parser")
            for row in soup.find("tbody").find_all("tr"):
                rider = row.find("input", class_="gotoH2H").get("data-seo")
                if rider in rider_data.keys():
                    fields = row.find_all("td")
                    placement = fields[0].get_text(strip=True)
                    if placement == "DNS":
                        placement = DN_values[0]
                    if placement == "DNF":
                        placement = DN_values[1]
                    if placement == "OTL":
                        placement = DN_values[1]
                    if placement == "DSQ":
                        placement = DN_values[2]
                    UCI_points = fields[7].get_text(strip=True)
                    ptn = fields[8].get_text(strip=True)
                    rider_data[rider][race_name] = int(placement)
                    if UCI_points != "":
                        if "UCI_points" in rider_data.get(rider, {}):
                            rider_data[rider]["UCI_points"] += int(UCI_points)
                        else:
                             rider_data[rider][f"UCI_points"] = int(UCI_points)
                    if ptn != "":
                        if f"ptn/{year}" in rider_data.get(rider, {}):
                            rider_data[rider][f"ptn/{year}"] += int(ptn)
                        else:
                             rider_data[rider][f"ptn/{year}"] = int(ptn)
    print(f"""Successfully retrieved the results of
          {count}/{len(sporza_races)*len(years)} races.""")

Preprocessing of the retrieved data includes:
* Creating a dataframe of the retrieved data
* Resetting the index so the rider's names are in a separate column
* Renaming some columns
* Converting the team name using a one hot encoding
* Converting PCSpoints and Age from string to int and cut
* Converting the Weight and Heights strings to int and float respectively as well. If a rider has not gained points in a certain season, let's set it to zero instead of NaN.
* Setting missing Weight and Height values to the respective modal values
* Converting all race results to integers. x for DNS, y for DNF/OTL and z for DSQ.

In [9]:
def preprocess_rider_data(years, rider_data):
    rider_data = pd.DataFrame(rider_data).T
    #rider_data.reset_index(inplace=True)
    rider_data.rename(columns={"index": "Name", "Weight:": "Weight", "Height:": "Height"}, inplace=True)
    rider_data = pd.get_dummies(rider_data, columns=["Team"])
    rider_data["PCSpoints"] = rider_data["PCSpoints"].astype(int)
    rider_data["Age"] = rider_data["Age"].astype(int)
    rider_data["Weight"] = rider_data["Weight"].apply(
        lambda x: int(x[:2]) if isinstance(x, str) else x)
    rider_data["Weight"].fillna(rider_data["Weight"].mode()[0], inplace=True)
    rider_data["Weight"] = rider_data["Weight"].infer_objects(copy=False)
    rider_data["Height"] = rider_data["Height"].apply(
        lambda x: float(x.split()[0]) if isinstance(x, str) else x)
    rider_data["Height"].fillna(rider_data["Height"].mode()[0], inplace=True)
    rider_data["Height"] = rider_data["Height"].infer_objects(copy=False)
    for year in years:
        rider_data[f"ptn/{year}"] = rider_data[f"ptn/{year}"].fillna(0).infer_objects(copy=False)
    rider_data['UCI_points'] = rider_data['UCI_points'].fillna(0).infer_objects(copy=False)
    return rider_data

In [10]:
n_riders = 200
DN_values = [100, 200, 300] # Placement if a rider DNS, DNF/OTL and DSQ respectively
#years = [2018, 2019, 2021, 2022, 2023]
years = [2022, 2023]
rider_data = {}

print("Retrieving rider names, teams and number of PCS points.")
start_time = time.time()
get_riders_teams_PCSpoints(n_riders, rider_data)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Retrieving age and weight.")
start_time = time.time()
get_age_weight(rider_data)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Retrieving race results.")
start_time = time.time()
get_sporza_results(years, rider_data, DN_values)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Preprocessing data.")
rider_data = preprocess_rider_data(years, rider_data)

Retrieving rider names, teams and number of PCS points.
Information retrieved in 1.5301237106323242 seconds.
----------------------------------------------------------
Retrieving age and weight.
Information retrieved in 114.92902517318726 seconds.
----------------------------------------------------------
Retrieving race results.
Successfully retrieved the results of
          36/36 races.
Information retrieved in 29.029501914978027 seconds.
----------------------------------------------------------
Preprocessing data.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rider_data["Weight"].fillna(rider_data["Weight"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rider_data["Height"].fillna(rider_data["Height"].mode()[0], inplace=True)
  rider_data[f"ptn/{year}"] = rider_data[f"ptn/{year}"].fillna(0).infer_objects(copy

The data is ready to be formatted for the model now. For each race, all riders that participated in that race will form a group together and the inverse of their placement will serve as a relevance score.

In [11]:
q_rider_data = pd.DataFrame()
for race in sporza_races:
    for year in years:
        race_name = f"{race}/{year}"
        participating_riders = rider_data[rider_data[race_name].notna()].copy()
        # The next line will set the race results as the objective for the group. The original results
        # will be set to NaN so they don't are exactly the same as the objective.
        participating_riders["ID"] = race_name
        participating_riders.rename(columns={race_name: "Objective"}, inplace=True)
        q_rider_data = pd.concat([q_rider_data, participating_riders], ignore_index=True)
q_rider_data = q_rider_data[["ID", "Objective"] + [col for col in q_rider_data.columns if col not in ["ID", "Objective"]]]
q_rider_data.fillna(100, inplace=True)
q_rider_data["Objective"] = q_rider_data["Objective"].astype(int)
print(q_rider_data)

                              ID  Objective  PCSpoints  Age  Weight  Height  \
0     omloop-het-nieuwsblad/2022          1       2167   29    78.0    1.90   
1     omloop-het-nieuwsblad/2022         17       1314   29    72.0    1.86   
2     omloop-het-nieuwsblad/2022         75       1118   23    65.0    1.75   
3     omloop-het-nieuwsblad/2022         18       1035   24    58.0    1.70   
4     omloop-het-nieuwsblad/2022         56        927   24    62.0    1.76   
...                          ...        ...        ...  ...     ...     ...   
1351   liege-bastogne-liege/2023         21        378   26    63.0    1.77   
1352   liege-bastogne-liege/2023         39        360   25    64.0    1.79   
1353   liege-bastogne-liege/2023         92        357   23    67.0    1.81   
1354   liege-bastogne-liege/2023         37        348   26    61.0    1.75   
1355   liege-bastogne-liege/2023         25        321   34    67.0    1.73   

      brabantse-pijl/2022  UCI_points  ptn/2022  la

  q_rider_data.fillna(100, inplace=True)


Each ID now represents a race. All riders that have competed in that race have a copy of their stats with this ID and their result in that race set as the objective. Their original result column is set to 100 because otherwise it would be the same as the objective.

The train-test split.

In [12]:
training_percentage = 0.8
cut_off = round(training_percentage*q_rider_data.shape[0])
train_df = q_rider_data[:cut_off]  # first 80%
validation_df = q_rider_data[cut_off:]  # remaining 20%
q_train = train_df.groupby("ID")["ID"].count().to_numpy()
X_train = train_df.drop(["ID", "Objective"], axis=1)
y_train = train_df["Objective"]

q_validation = validation_df.groupby("ID")["ID"].count().to_numpy()
X_validation = validation_df.drop(["ID", "Objective"], axis=1)
y_validation = validation_df["Objective"]

Model training.

In [13]:
model = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    label_gain = [i for i in range(max(y_train.max(), y_validation.max()) + 1)]
)

model.fit(
    X=X_train,
    y=y_train,
    group=q_train,
    eval_set=[(X_validation, y_validation)],
    eval_group=[q_validation],
    eval_at=10,
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1502
[LightGBM] [Info] Number of data points in the train set: 1085, number of used features: 64


The model got n_races*n_years race rankings based on a subset of all the riders in the dataset. I think it learns a ranking method by using these n_races*n_years examples. I think for prediction, I have to give it a set of riders (get the starting list from procyclingstats) and then it will rank them, regardless of the race it is. So no real querying?

In [14]:
def ndcg(labels, predictions):
    # Calculate Normalized Discounted Cumulative Gain (NDCG)
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    sorted_labels = labels[np.argsort(predictions)[::-1]]
    
    # Calculate the Discounted Cumulative Gain (DCG)
    dcg = np.sum((2 ** sorted_labels - 1) / np.log2(np.arange(2, len(labels) + 2)))
    # Sort the labels in descending order to calculate the ideal DCG
    ideal_labels = np.sort(labels)[::-1]
    ideal_dcg = np.sum((2 ** ideal_labels - 1) / np.log2(np.arange(2, len(labels) + 2)))
    
    # Calculate NDCG by normalizing DCG by the ideal DCG
    ndcg = dcg / ideal_dcg if ideal_dcg != 0 else 0
    return ndcg

pred_validation = model.predict(X_validation, raw_score=False)
print(ndcg(y_validation, pred_validation))
#for label, prediction in zip(y_validation, pred_validation):
    #print(f"Label {label}: {prediction}")

1.0626263396483973
